Let's first import the libraries.

In [27]:
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm
import xgboost as xgb

from skimage.feature import hog
from skimage import data, transform, color, exposure, io

import scipy
from sklearn.metrics import fbeta_score

Now set the random seeds, define the data paths, and load the CSV files.

In [28]:
# Reproducibility: seed both NumPy's and Python's global random generators.
random_seed = 420
np.random.seed(random_seed)
random.seed(random_seed)

# Image directories, relative to the notebook's working directory.
train_path = '../input/train-jpg/'
test_path = '../input/test-jpg-v2/'

# Tables that list the image names to process (the training one also has tags).
train = pd.read_csv('../input/train_v2.csv')
test = pd.read_csv('../input/sample_submission_v2.csv')

Define a function for extracting features.

In [29]:
def extract_features(df, data_path):
    """Compute per-image colour statistics and a HOG descriptor.

    For every name in ``df.image_name`` the file ``data_path + name + '.jpg'``
    is read, rescaled by 0.5 and converted to grayscale. The mean, standard
    deviation, maximum, minimum, range (max - min), kurtosis and skewness are
    then computed for the first three channels (named r, g, b) and for the
    grayscale image (named y).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``image_name`` column. It is copied, not modified.
    data_path : str
        Prefix prepended to each image name (include the trailing separator).

    Returns
    -------
    im_features : pandas.DataFrame
        Copy of ``df`` with 28 extra columns named ``<channel>_<statistic>``,
        grouped by statistic in the order mean, std, max, min, range,
        kurtosis, skewness, and within each group in the order r, g, b, y.
    HOG : numpy.ndarray
        Array of shape ``(len(df), 512)``, one HOG descriptor per image.

    Raises
    ------
    ValueError
        If an image yields a HOG descriptor whose length is not 512.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` has been loaded.
    from scipy import stats as sp_stats

    im_features = df.copy()
    image_names = im_features.image_name.values
    N = len(image_names)

    channels = ('r', 'g', 'b', 'y')
    # Statistics measured directly on the flattened pixel values.
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('max', np.max),
        ('min', np.min),
        ('kurtosis', sp_stats.kurtosis),
        ('skewness', sp_stats.skew),
    )
    # One (N, 4) table per statistic; column c belongs to channels[c].
    measured = {name: np.zeros((N, len(channels))) for name, _ in reducers}

    # 512 = 8 orientations x (8 x 8) cells of 16 px, i.e. a 128 x 128 image
    # after rescaling -- assumes 256 x 256 inputs, TODO confirm.
    HOG = np.zeros((N, 512))

    for i, image_name in enumerate(tqdm(image_names, miniters=1000)):
        im = io.imread(data_path + image_name + '.jpg')
        # NOTE(review): newer scikit-image releases may also rescale the
        # channel axis unless told it is not spatial -- confirm against the
        # installed version.
        im = transform.rescale(im, 0.5)
        gray = color.rgb2gray(im)

        # The visualisation flag is left at its default (off). Spelling it
        # out as `visualise=` breaks on releases that renamed the keyword.
        HOG[i, :] = hog(gray, orientations=8, pixels_per_cell=(16, 16),
                        cells_per_block=(1, 1), block_norm='L2-Hys')

        planes = (
            im[:, :, 0].ravel(),
            im[:, :, 1].ravel(),
            im[:, :, 2].ravel(),
            gray.ravel(),
        )
        for name, reducer in reducers:
            for c, plane in enumerate(planes):
                measured[name][i, c] = reducer(plane)

    measured['range'] = measured['max'] - measured['min']

    # Column order is part of the contract: callers turn the frame into a
    # positional feature matrix.
    column_groups = ('mean', 'std', 'max', 'min', 'range', 'kurtosis', 'skewness')
    for name in column_groups:
        for c, channel in enumerate(channels):
            im_features[channel + '_' + name] = measured[name][:, c]

    return im_features, HOG

Start processing.

In [30]:
# Extract features (reads every image from disk, so this is the slow step)
print('Extracting train features')
train_features, train_HOG = extract_features(train, train_path)
print('Extracting test features')
test_features, test_HOG = extract_features(test, test_path)

# Prepare data: every column except the identifier and the tags is a feature
X = np.array(train_features.drop(['image_name', 'tags'], axis=1))
y_train = []


def flatten(l):
    """Flatten one level of nesting: a list of lists becomes one flat list."""
    return [item for sublist in l for item in sublist]


# Sort the unique tags: iteration order of a set of strings can differ between
# interpreter runs, which would silently change which index each tag gets.
labels = np.array(sorted(set(flatten([l.split(' ') for l in train_features['tags'].values]))))

# tag -> column index, and the reverse lookup
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

Extracting train features


  warn("The default mode, 'constant', will be changed to 'reflect' in "
100%|██████████| 40479/40479 [31:39<00:00, 21.31it/s] 


Extracting test features


100%|██████████| 61191/61191 [41:37<00:00, 27.23it/s]  


In [39]:
# Full training matrix: colour statistics followed column-wise by the HOG descriptors.
XH = np.concatenate([X, train_HOG], axis=1)

Transform the tags into per-class indicator vectors.

In [31]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every row to targets left over from a previous run.
y_train = []
# Derive the vector length from the label map instead of hard-coding it.
n_labels = len(label_map)

for tags in tqdm(train.tags.values, miniters=1000):
    # Multi-hot encoding: 1 at the index of every tag the image carries.
    targets = np.zeros(n_labels)
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    y_train.append(targets)

y = np.array(y_train, np.uint8)

print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

n_classes = y.shape[1]

# Test features use the same columns as X (identifier and tags removed).
X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))

100%|██████████| 40479/40479 [00:00<00:00, 256181.53it/s]


X.shape = (40479, 28)
y.shape = (40479, 17)


In [40]:
# Full test matrix, laid out like XH: colour statistics, then HOG descriptors.
XHtest = np.concatenate([X_test, test_HOG], axis=1)

Train and predict with XGBoost.

In [43]:
# One-vs-all: fit an independent binary classifier per tag and collect the
# predicted probability of the positive class for every test image.
classifier_kwargs = dict(
    max_depth=4, learning_rate=0.05, n_estimators=1000,
    silent=True, objective='binary:logistic', nthread=-1,
    gamma=0, min_child_weight=1, max_delta_step=0,
    subsample=0.7, colsample_bytree=1, colsample_bylevel=1,
    reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
    base_score=0.5, seed=random_seed, missing=None,
)

n_test_rows = XHtest.shape[0]
y_pred = np.zeros((n_test_rows, n_classes))

print('Training and making predictions')
for class_i in tqdm(range(n_classes), miniters=1):
    model = xgb.XGBClassifier(**classifier_kwargs)
    model.fit(XH, y[:, class_i])
    class_probabilities = model.predict_proba(XHtest)
    # Column 1 holds the probability of the positive class.
    y_pred[:, class_i] = class_probabilities[:, 1]

# A tag is emitted for an image when its probability exceeds 0.21.
preds = [' '.join(labels[row_probs > 0.21]) for row_probs in y_pred]

Training and making predictions


100%|██████████| 17/17 [2:42:19<00:00, 543.44s/it]  


In [33]:
# Disabled experiment: the whole cell is a single triple-quoted string literal,
# so none of the code inside it is executed. The text describes an xgb.cv run
# (eval_metric 'rmse', up to 1000 boosting rounds, early stopping after 200)
# on a DMatrix built from X and the first label column, y[:, 0].
# NOTE(review): the RMSE table shown under this cell looks like output left
# over from a run made while the code was still live -- confirm, then either
# re-enable the cell or remove it together with its output.
'''
xgb_params = {
    'n_trees': 500,
    'eta': 0.05,
    'max_depth': 4,
    'subsample': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'base_score': 0.1,
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X, y[:, 0])

num_boost_rounds = 1000
#thres = [0.07, 0.17, 0.2, 0.04, 0.23, 0.33, 0.24, 0.22, 0.1, 0.19, 0.23, 0.24, 0.12, 0.14, 0.25, 0.26, 0.16]

#0.1930
xgb.cv(xgb_params, dtrain, num_boost_round=num_boost_rounds,early_stopping_rounds=200)
'''

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
0,0.719944,0.000537,0.719827,0.000239
1,0.687942,0.000493,0.687648,0.000150
2,0.658814,0.000404,0.658439,0.000109
3,0.632006,0.000410,0.631538,0.000021
4,0.607208,0.000402,0.606595,0.000113
5,0.584132,0.000423,0.583390,0.000077
6,0.562631,0.000524,0.561719,0.000097
7,0.542660,0.000482,0.541553,0.000186
8,0.523843,0.000508,0.522582,0.000327
9,0.506304,0.000696,0.504898,0.000213


Save to file

In [44]:
# Assemble the submission table (one row per test image) and write it as CSV
# without the index column.
subm = pd.DataFrame(
    {'image_name': test_features.image_name.values, 'tags': preds},
    columns=['image_name', 'tags'],
)
subm.to_csv('submission5.csv', index=False)