In [2]:
import pandas as pd
import pathlib
import hashlib
import numpy as np
import random
from PIL import Image
import tqdm
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state

STUDENT_ID = "glebnaz"

train_directory = pathlib.Path("data/train")
sample_size = 5000

def initialize_random_seed():
    """Инициализирует ГПСЧ из STUDENT_ID"""
    sha256 = hashlib.sha256()
    sha256.update(STUDENT_ID.encode("utf-8"))
    
    fingerprint = int(sha256.hexdigest(), 16) % (2**32) 
    
    random.seed(fingerprint)
    np.random.seed(fingerprint)


def read_target_variable():
    """Прочитаем разметку фотографий из названий файлов"""
    target_variable = {
        "filename": [],
        "is_cat": []
    }
    image_paths = list(train_directory.glob("*.jpg"))
    random.shuffle(image_paths)
    for image_path in image_paths[:sample_size]:
        filename = image_path.name
        class_name = filename.split(".")[0]
        target_variable["filename"].append(filename)
        target_variable["is_cat"].append(class_name == "cat")

    return pd.DataFrame(data=target_variable)


initialize_random_seed()

target_df = read_target_variable()
target_df

Unnamed: 0,filename,is_cat
0,cat.3485.jpg,True
1,cat.12496.jpg,True
2,cat.10120.jpg,True
3,dog.7293.jpg,False
4,dog.5402.jpg,False
...,...,...
4995,cat.9121.jpg,True
4996,cat.7054.jpg,True
4997,dog.4704.jpg,False
4998,cat.11267.jpg,True


In [3]:
#строим признаковое описание
def read_data(target_df):
    """Читает данные изображений и строит их признаковое описание"""
    image_size = (100, 100)
    features = []
    target = []
    for i, image_name, is_cat in tqdm.tqdm(target_df.itertuples(), total=len(target_df)):
        image_path = str(train_directory / image_name)
        image = Image.open(image_path)
        image = image.resize(image_size) # уменьшаем изображения
        image = image.convert('LA') # преобразуем в Ч\Б
        pixels = np.asarray(image)[:, :, 0]
        pixels = pixels.flatten()
        features.append(pixels)
        target.append(is_cat)
    return np.array(features), np.array(target)

features, target = read_data(target_df)
features

100%|██████████| 5000/5000 [00:34<00:00, 145.00it/s]


array([[193, 213, 219, ..., 151, 157, 204],
       [221, 221, 222, ...,  35,  38,  33],
       [ 96,  99, 100, ...,   4,   3,   4],
       ...,
       [249, 252, 234, ..., 164, 164, 159],
       [ 46,  50,  54, ..., 135, 144, 143],
       [132, 133, 134, ...,  62,  57,  54]], dtype=uint8)

In [4]:
features.shape

(5000, 10000)

In [5]:
#Разбиение данных на обучающую и тренеровочную модель
random_state = check_random_state(0)
permutation = random_state.permutation(features.shape[0])
features = features[permutation]
target = target[permutation]
features = features.reshape((features.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
X_train, X_valid,y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25)

In [6]:
y_train

array([ True, False,  True, ...,  True,  True,  True])

In [7]:
#переходим к обучению модели
from sklearn import linear_model
eps = 0.11
i=0.1
print("enter")
model = linear_model.SGDClassifier(loss="log",max_iter=5000)
model.fit(X_train, y_train)
score = 0.1
while i<1:
    print(i)
    model = linear_model.SGDClassifier(loss="log",max_iter=5000,eta0=i)
    model.fit(X_train, y_train)
    i=i+0.1
    score_s = model.score(X_test,y_test)
    if score_s>score:
        print(score)
        score = score_s
        eps=i
print(eps)
model = linear_model.SGDClassifier(loss="log",max_iter=5000,eta0=i)
model.fit(X_train, y_train)
print("finish to find eps")
print(model.score(X_valid,y_valid))




enter
0.1
0.1
0.2
0.30000000000000004
0.5
0.4
0.5
0.6
0.7
0.7999999999999999
0.8999999999999999
0.9999999999999999
0.4
finish to find eps
0.544


In [8]:
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.7296666666666667
0.518


In [13]:
# настройка гипер параметров, циклом подбираем eps
# все делаю циклом что бы модель не зависила от данных
i = 0.1
eps = 0 
max_score = 0.1
while i < 1:
    print("start find eps")
    sgd_tuned_clf = linear_model.SGDClassifier(loss='log', learning_rate='constant', eta0=i, penalty='l1')
    sgd_tuned_clf.fit(X_train, y_train)
    score_s = sgd_tuned_clf.score(X_valid, y_valid)
    if max_score < score_s:
        max_score = score_s
        print("new max score:"+" "+str(score_s))
        eps = i
    i=i+1    

print("final result")
sgd_tuned_clf = linear_model.SGDClassifier(loss='log', learning_rate='constant', eta0=eps, penalty='l1')
sgd_tuned_clf.fit(X_train, y_train)
print(sgd_tuned_clf.score(X_valid, y_valid))

start find eps
new max score: 0.493
final result
0.546


In [14]:
#оценка результатов
print(sgd_tuned_clf.score(X_train, y_train))

0.8433333333333334


In [15]:
#нормальзация данных
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)
X_test = scaler.transform(X_test)

i = 0.1
eps = 0.1 
max_score = 0.1
while i<1:
    sgd_scaled_and_tuned_clf = linear_model.SGDClassifier(loss='log', learning_rate='constant', eta0=i)
    sgd_scaled_and_tuned_clf.fit(X_train, y_train)
    score_s = sgd_scaled_and_tuned_clf.score(X_valid, y_valid)
    if max_score < score_s:
        max_score = score_s
        print("new max score:"+" "+str(score_s))
        eps = i
    i=i+1 

print("rezult after normalization data")
sgd_scaled_and_tuned_clf = linear_model.SGDClassifier(loss='log', learning_rate='constant', eta0=eps)
sgd_scaled_and_tuned_clf.fit(X_train, y_train)
print(sgd_scaled_and_tuned_clf.score(X_valid, y_valid))
print(sgd_scaled_and_tuned_clf.score(X_test, y_test))

#LogisticRegression
print("LogisticRegression rezult")
log_reg_scaled_clf = LogisticRegression(C=0.05, penalty='l1', solver='saga', tol=eps)
log_reg_scaled_clf.fit(X_train, y_train)

print(log_reg_scaled_clf.score(X_valid, y_valid))
print(log_reg_scaled_clf.score(X_test, y_test))

new max score: 0.526
rezult after normalization data
0.498
0.51
LogisticRegression rezult
0.537
0.576


In [24]:
##random forest
from sklearn.ensemble import RandomForestClassifier


X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

i=100
est = i
score = 0.1
while i<400:
    print("iteration number "+str(i))
    rand_forest_clf = RandomForestClassifier(n_estimators=i)
    rand_forest_clf.fit(X_train, y_train)
    score_s = rand_forest_clf.score(X_test, y_test)
    if score_s>score:
        score = score_s
        est = i
    i=i+50
    
print("final rezult")
rand_forest_clf = RandomForestClassifier(n_estimators=est)
rand_forest_clf.fit(X_train, y_train)
print("validation set")
print(rand_forest_clf.score(X_valid, y_valid))
print("train set")
print(rand_forest_clf.score(X_train, y_train))

iteration number 100
iteration number 150
iteration number 200
iteration number 250
iteration number 300
iteration number 350
final rezult
validation set
0.479
train set
1.0


In [27]:
from sklearn.model_selection import cross_val_score

print("cross validation")

scores = cross_val_score(rand_forest_clf, X_train, y_train, cv=5)
print(scores)

cross validation
[0.60875 0.62625 0.6125  0.65625 0.6275 ]
