In [0]:
!rm -rf /content/data/
!mkdir -p /content/data/
!unzip "/content/train.zip" -d /content/data/

In [0]:
import pandas as pd
from glob import glob
import os
from PIL import Image
import numpy as np
import random

In [3]:
%tensorflow_version 1.x

from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

random.seed(34)

TensorFlow 1.x selected.


Using TensorFlow backend.


In [0]:
def read_dataset(path):
    X = []
    y = []
    
    image_paths_list = glob(os.path.join(path, 'train', '*.jpg'))
    image_paths_sample = random.sample(image_paths_list, 10000)

    for image_path in image_paths_sample:
        image_name = os.path.basename(image_path)
        image_name_parts = image_name.split('.')
        label = image_name_parts[0] if len(image_name_parts) == 3 else None

        if label:
            y.append(int(label == 'cat'))
            
        
        x = image.img_to_array(image.load_img(image_path, target_size=(224, 224)))
        x = preprocess_input(x)
            
        X.append(x)
            
    return np.array(X), np.array(y)

In [0]:
from tqdm import tqdm_notebook

train_directory = "data/"
batch_size = 1000
images_number = 10000

def read_dataset(path):
    """ X should be numpy array from the start. Otherwise it takes too much memory.
    """

    X = np.array([])
    y = np.array([])
    
    image_paths_list = glob(os.path.join(path, 'train', '*.jpg'))
    image_paths_list = random.sample(image_paths_list, images_number)
    
    np.random.shuffle(image_paths_list)
    
    # split into array of batch_size length arrays
    splitted_image_paths_list = np.split(np.array(image_paths_list), int(len(image_paths_list) / batch_size))
    
    for image_path_batch in tqdm_notebook(splitted_image_paths_list, desc='Batches'):
        x_batch = []
        y_batch = []
        
        for image_path in tqdm_notebook(image_path_batch, desc='Current batch', leave=False):
            image_name = os.path.basename(image_path)
            image_name_parts = image_name.split('.')
            label = image_name_parts[0] if len(image_name_parts) == 3 else None
            
            if label:
                y_batch.append(int(label == 'cat'))
            
            x = image.img_to_array(image.load_img(image_path, target_size=(224, 224)))#, dtype=np.uint8)

            '''if x_batch.size == 0:
              x_batch = x[None,:,:,:]
            else:
              x_batch = np.concatenate([x_batch, x[None,:,:,:]], axis=0)'''
            x_batch.append(x)
        

        x_batch = preprocess_input(np.array(x_batch))
        y_batch = np.array(y_batch)
        
        if X.size == 0:
          X = x_batch
          y = y_batch
        else:
          X = np.concatenate([X, x_batch], axis=0)
          y = np.concatenate([y, y_batch], axis=0)
    
    return X, y

In [6]:
# Используйте функцию read_dataset чтобы получить обучающую выборку
features, labels = read_dataset("/content/data")

# Проверьте размерности загруженных данных
print(features.shape)
print(labels.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Batches', max=10.0, style=ProgressStyle(description_width…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Current batch', max=1000.0, style=ProgressStyle(descripti…


(10000, 224, 224, 3)
(10000,)


In [7]:
# с помощью функции train_test_split поделите выборку на train и test в отношении 70/30
# В качестве признаков используйте эмбеддинги, полученные на предыдущем шаге
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3)
print(features_train.shape)
print(labels_train.shape)
print(features_test.shape)
print(labels_test.shape)

(7000, 224, 224, 3)
(7000,)
(3000, 224, 224, 3)
(3000,)


In [8]:
# Создаём модель для получения эмбеддингов
model = ResNet50(weights='imagenet')

# Используя объект model постройте эмбеддинги для обучающей выборки
%time embeddings_train = model.predict(features_train)
%time embeddings_test = model.predict(features_test)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5

CPU times: user 52min 17s, sys: 19 s, total: 52min 36s
Wall time: 13min 23s
CPU times: user 22min 34s, sys: 8.81 s, total: 22min 43s
Wall time: 5min 46s


Попытка классифицировать свою кошку с помощью ResNet.

Создайте объект XGBClassifier со стандартными параметрами

https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier

Обучите его

In [0]:
# Получите предсказания на тестовом и на обучающем множестве
clf = XGBClassifier()

clf.fit(embeddings_train, labels_train)
predicted_test = clf.predict(embeddings_test)

In [10]:
# С помощью функции accuracy_score оцените результаты
print(accuracy_score(labels_test, predicted_test))

0.9863333333333333


In [11]:
tuned_clf = XGBClassifier(n_estimators=305, learning_rate=0.02)

eval_set = [(embeddings_train, labels_train), (embeddings_test, labels_test)]


%time tuned_clf.fit(embeddings_train, labels_train, eval_set=eval_set, verbose=False)
tuned_test = tuned_clf.predict(embeddings_test)
print("\nAccuracy:", accuracy_score(labels_test, tuned_test))



CPU times: user 2min 25s, sys: 41.3 ms, total: 2min 25s
Wall time: 2min 25s

Accuracy: 0.9846666666666667


Оцените важность признаков с помощью поля feature_importances_

Пользуясь списком https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a посмотрите, какие классы оказались наиболее важными (со значениями importance больше 0)
    
Изменяйте параметры, чтобы оставить только действительно важные признаки

In [12]:
all_important_features_indices = np.where(tuned_clf.feature_importances_ > 0)
print('All non-zero importance classes: ', all_important_features_indices)

top_ten_features = np.argpartition(tuned_clf.feature_importances_, -10)[-10:]
print('Top 10 important features indices: ', top_ten_features)
print('Importance of top 10 features: ',tuned_clf.feature_importances_[top_ten_features])

All non-zero importance classes:  (array([  6,   9,  41,  48,  52,  55,  59,  61,  80,  82, 100, 132, 148,
       151, 158, 159, 162, 164, 166, 167, 170, 172, 175, 176, 177, 178,
       179, 180, 182, 183, 184, 185, 186, 193, 194, 200, 201, 202, 205,
       207, 208, 209, 211, 212, 215, 219, 220, 224, 225, 226, 227, 234,
       235, 236, 238, 242, 243, 244, 246, 251, 259, 260, 262, 264, 265,
       269, 273, 275, 276, 278, 280, 281, 282, 283, 284, 285, 287, 293,
       298, 308, 313, 318, 321, 330, 332, 346, 348, 357, 358, 360, 361,
       371, 378, 386, 400, 431, 436, 450, 453, 458, 459, 460, 472, 476,
       489, 493, 501, 516, 518, 526, 527, 546, 548, 562, 564, 572, 574,
       585, 586, 588, 590, 595, 602, 608, 610, 617, 632, 648, 659, 664,
       674, 676, 679, 681, 684, 689, 705, 721, 734, 738, 741, 742, 753,
       772, 781, 785, 795, 796, 810, 826, 840, 848, 852, 855, 859, 861,
       863, 865, 870, 875, 876, 894, 900, 917, 918, 929, 947, 956, 975,
       997]),)
Top 10 importa