In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from glob import glob
from tqdm import tqdm
import os
import pickle as pkl
from time import time
import uuid
import zipfile

## Reading the dataset:

In [None]:
# Uncomment if you are in Google Colab:

# !pip install wget
# import wget, os
# url = "https://github.com/IvanSol/ml-course/raw/refs/heads/master/Class05_classic_CV/data/barcodes.zip"
# os.makedirs('data', exist_ok=True)   
# wget.download(url, out=os.path.join('data', 'barcodes.zip'))

In [None]:
# Unpack the dataset archive into data/barcodes/ (the later cells read from
# data/barcodes/train/ and data/barcodes/test/).
with zipfile.ZipFile('data/barcodes.zip', 'r') as barcodes_zip:
    barcodes_zip.extractall(path='data/barcodes/')

In [None]:
# Load the labelled training images.
# Each entry of data/barcodes/train/ is treated as a class folder: its base name is the
# label (converted to int below) and its *.png files are the samples of that class.
train_images = []
train_labels = []
# sorted(): glob returns paths in arbitrary order; sorting keeps the sample order reproducible.
for label_path in sorted(glob('data/barcodes/train/*')):
    label = os.path.basename(label_path)
    for fn in tqdm(sorted(glob(os.path.join(label_path, '*.png'))), desc=f'Reading label {label}'):
        img = cv2.imread(fn, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread does not raise on failure, it returns None. Fail here with the
            # offending path instead of crashing later during feature extraction.
            raise OSError(f'Could not read image: {fn}')
        train_images.append(img)
        train_labels.append(label)
y = np.array(train_labels).astype(int)
# Fill a 1-D object array element by element. np.array(list, dtype="object") would stack
# equally-shaped images into a single (N, H, W) object array, so X_imgs[mask][0] would no
# longer be a plain image array.
X_imgs = np.empty(len(train_images), dtype="object")
for i, img in enumerate(train_images):
    X_imgs[i] = img

In [None]:
# Show two example images for every class: the first one and the one at index 100.
for label in sorted(set(y)):
    print(label)
    label_imgs = X_imgs[y == label]  # select the samples of this class once
    # Clamp the second index so that a class with 100 or fewer images shows its last
    # image instead of raising IndexError.
    second_idx = min(100, len(label_imgs) - 1)
    plt.subplot(121)
    plt.imshow(label_imgs[0], cmap='gray')
    plt.subplot(122)
    plt.imshow(label_imgs[second_idx], cmap='gray')
    plt.show()

## Let us try the simplest feature -- the ratio of white color in the image.

In [None]:
def get_features(X_imgs):
    """Compute one brightness feature per image.

    For every image the pixel values are converted to float, divided by 255,
    summed, and divided by the number of pixels.
    (Assumes 8-bit images with values in 0..255, so the result is the share of
    "white" in the image -- TODO confirm for other bit depths.)

    Parameters
    ----------
    X_imgs : iterable of numpy arrays
        The images to process; each element must support ``astype`` and ``size``.

    Returns
    -------
    numpy.ndarray
        One single-element row per input image, in input order.
    """
    rows = [
        [(picture.astype(float) / 255).sum() / picture.size]
        for picture in tqdm(X_imgs, desc='Extracting features...')
    ]
    return np.array(rows)

In [None]:
# Build the training feature matrix: get_features (defined above) yields one row per image in X_imgs.
X = get_features(X_imgs)

In [None]:
from sklearn.linear_model import LogisticRegression as LR

### Only the training part of the dataset is labeled, so it is the only part we can use for training and testing.
### Let us try testing on the same part we used for training (not a good idea, actually):

In [None]:
# Train a Logistic Regression classifier (LR is sklearn's LogisticRegression, imported above)
# with default hyper-parameters on the training features X and labels y:
model = LR()
model.fit(X, y)

#### Calculating the score:

In [None]:
# Score the model on the very same data it was fitted on (X, y); the value is shown as the cell output.
model.score(X, y)

#### The score above is the accuracy. We can also compute it manually as the ratio of correct classifications:

In [None]:
# Manual accuracy: the fraction of training samples whose predicted label equals the true one.
y_pred = model.predict(X)
is_correct = y_pred == y
print('Accuracy =', is_correct.mean())

## Let us review testing data and generate resulting file:

In [None]:
# Load the unlabelled test images together with their file names
# (the base names are used later as keys of the submitted answer).
test_images = []
test_names = []
# sorted(): glob returns paths in arbitrary order; sorting keeps the sample order reproducible.
for fn in tqdm(sorted(glob('data/barcodes/test/*')), desc='Reading test set'):
    img = cv2.imread(fn, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread does not raise on failure, it returns None. Fail here with the
        # offending path instead of crashing later during feature extraction.
        raise OSError(f'Could not read image: {fn}')
    test_images.append(img)
    test_names.append(os.path.basename(fn))
# Fill a 1-D object array element by element. np.array(list, dtype="object") would stack
# equally-shaped images into a single (N, H, W) object array, so X_test_imgs[mask][0] would
# no longer be a plain image array.
X_test_imgs = np.empty(len(test_images), dtype="object")
for i, img in enumerate(test_images):
    X_test_imgs[i] = img

In [None]:
# Preview the first five test images (fewer if the set is smaller), one figure each.
for idx in range(min(5, len(X_test_imgs))):
    plt.imshow(X_test_imgs[idx], cmap='gray')
    plt.show()

## As you may see, images in the dataset can be rotated. You should be able to deal with it.

## Let us generate the answer:

In [None]:
# Extract the test-set features with the same get_features used for the training data.
X_test = get_features(X_test_imgs)

In [None]:
# Predict a class label for every test sample with the model fitted earlier.
y_test_pred = model.predict(X_test)

In [None]:
# Visual sanity check of the predictions: for every class, show the test images at
# positions 0 and 5 among the samples predicted as that class. Classes with fewer
# than 6 predicted samples are only printed, not plotted.
for label in sorted(set(y)):
    print(label)
    predicted_as_label = y_test_pred == label
    if predicted_as_label.sum() >= 6:
        picked = X_test_imgs[predicted_as_label]
        for subplot_pos, sample_idx in ((121, 0), (122, 5)):
            plt.subplot(subplot_pos)
            plt.imshow(picked[sample_idx], cmap='gray')
        plt.show()

# Your task is to:
- Find a better way to evaluate your model (do not test on the training set).
- Get higher accuracy (>95% -- 1 point, >97% -- 2 points, >98% -- 3 points, >99% -- 4 points, >99.5% -- 5 points).
- Plot confusion matrix for your model (1 point) and ROC curves for each class (1 point).

In [None]:
# Placeholders to fill in: store the confusion matrix and the per-class ROC curves
# as images here. Both objects are written to my_answer.pkl by the cell below.
conf_matrix = None
ROC_curves = []

In [None]:
# Map every test file name to the label predicted for it.
answer_to_submit = dict(zip(test_names, y_test_pred))

In [None]:
# Write the submission file: predictions, confusion matrix, ROC curves,
# a random hex id and the current timestamp, pickled as one tuple.
with open('my_answer.pkl', 'wb') as answer_file:
    submission = (answer_to_submit, conf_matrix, ROC_curves, uuid.uuid4().hex, time())
    pkl.dump(submission, answer_file)

### Now submit `my_answer.pkl` (either to google classroom or to yandex contest, I will let you know the proper way after the weekend).