In [1]:
import os, random

import numpy as np
import pandas as pd
import scipy
import scipy.stats
import xgboost as xgb
from PIL import Image
from sklearn.metrics import fbeta_score
from tqdm import tqdm



In [2]:
# Single seed shared by every source of randomness used in this notebook
# (it is also passed to the XGBoost models further down).
random_seed = 0

# Seed NumPy's legacy global generator and the stdlib generator.
np.random.seed(random_seed)
random.seed(random_seed)

In [3]:
# Root folder of the competition data. Every path below is derived from it, so
# pointing the notebook at another location is a one-line change.
DATA_DIR = '/data/amazon'

# Image folders. The trailing '' makes os.path.join append a path separator,
# because extract_features builds file names by plain string concatenation.
TRAIN_PATH = os.path.join(DATA_DIR, 'train-jpg', '')
TEST_PATH = os.path.join(DATA_DIR, 'test-jpg', '')

# Training table, and the sample submission that is used as the list of test images.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_v2.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission_v2.csv'))

In [4]:
def extract_features(df, data_path):
    """Compute per-image colour statistics for every image listed in ``df``.

    For each row, the file ``data_path + image_name + '.jpg'`` is opened and the
    first three channels of the decoded array are treated as r, g and b. Six
    statistics are computed per channel over all pixels: mean, std, max, min
    (NumPy defaults) plus kurtosis and skewness (``scipy.stats`` defaults).

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose an ``image_name`` column. The frame is copied, not modified.
    data_path : str
        Prefix that is concatenated directly with the image name, so it must
        already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 25 extra columns, in this order:
        ``{r,g,b}_mean, rgb_mean_mean, {r,g,b}_std, rgb_mean_std,
        {r,g,b}_max, rgb_mean_max, {r,g,b}_min, rgb_mean_min,
        {r,g,b}_range, {r,g,b}_kurtosis, {r,g,b}_skewness``.

    Notes
    -----
    An image that decodes to fewer than three array dimensions will fail on the
    channel slice; no mode conversion is attempted here.
    """
    im_features = df.copy()
    image_names = im_features.image_name.values
    n_images = len(image_names)

    channels = ('r', 'g', 'b')
    # (column suffix, reducer) pairs; each reducer maps a flat pixel vector to a scalar.
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('max', np.max),
        ('min', np.min),
        ('kurtosis', scipy.stats.kurtosis),
        ('skewness', scipy.stats.skew),
    )
    # stats[suffix][i, c] holds the statistic for image i and channel c.
    stats = {suffix: np.zeros((n_images, len(channels))) for suffix, _ in reducers}

    for i, image_name in enumerate(tqdm(image_names, miniters=1000)):
        # The context manager releases the file handle as soon as the pixels
        # have been copied into an array, instead of leaving it to the GC
        # while tens of thousands of files are processed.
        with Image.open(data_path + image_name + '.jpg') as im:
            pixels = np.array(im)[:, :, :3]

        for c in range(len(channels)):
            values = pixels[:, :, c].ravel()
            for suffix, reduce_fn in reducers:
                stats[suffix][i, c] = reduce_fn(values)

    def _add_channel_columns(suffix, per_channel):
        # Write one column per channel, named '<channel>_<suffix>'.
        for c, channel in enumerate(channels):
            im_features[channel + '_' + suffix] = per_channel[:, c]

    # These four statistics additionally get an across-channel average column.
    for suffix in ('mean', 'std', 'max', 'min'):
        per_channel = stats[suffix]
        _add_channel_columns(suffix, per_channel)
        im_features['rgb_mean_' + suffix] = (
            per_channel[:, 0] + per_channel[:, 1] + per_channel[:, 2]) / 3

    _add_channel_columns('range', stats['max'] - stats['min'])
    _add_channel_columns('kurtosis', stats['kurtosis'])
    _add_channel_columns('skewness', stats['skewness'])

    return im_features

In [5]:
# --- Feature extraction: one full pass over the JPEGs of each split ---
print('Extracting train features')
train_features = extract_features(df=train_df, data_path=TRAIN_PATH)
print('Extracting test features')
test_features = extract_features(df=test_df, data_path=TEST_PATH)

# --- Training inputs ---
# Per-image target vectors are appended to this list by a later cell.
y_train = []
# Everything except the identifier and the raw tag string is a model feature.
X = np.array(train_features.drop(['image_name', 'tags'], axis=1))

  0%|          | 0/40479 [00:00<?, ?it/s]

Extracting train features


100%|██████████| 40479/40479 [06:30<00:00, 103.62it/s]
  0%|          | 0/61191 [00:00<?, ?it/s]

Extracting test features


100%|██████████| 61191/61191 [13:28<00:00, 75.68it/s]


In [6]:
def flatten(l):
    """Return one flat list holding the items of every sub-list of ``l``."""
    return [item for sublist in l for item in sublist]


# Vocabulary of distinct tags found in the training table.
# sorted() pins the tag -> column assignment: iterating a bare set of strings
# can yield a different order on every interpreter start (string hash
# randomisation), which would silently change what each column of y / y_pred
# means from one kernel restart to the next.
labels = np.array(sorted(set(flatten(
    [tags.split(' ') for tags in train_features['tags'].values]))))

# Tag -> column index, and the inverse lookup.
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [8]:
# Build the multi-hot target vectors, one per training image.
# The list is reset here so that re-running this cell never appends a second
# copy of every row to a list left over from a previous execution.
y_train = []
for tags in tqdm(train_df.tags.values, miniters=1000):
    # One slot per known tag; sized from the vocabulary rather than a literal.
    targets = np.zeros(len(label_map))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    y_train.append(targets)

100%|██████████| 40479/40479 [00:00<00:00, 238269.05it/s]


In [9]:
# Stack the per-image target vectors into one compact 0/1 matrix.
y = np.array(y_train, dtype=np.uint8)

# Quick sanity print of both matrices' dimensions.
print('X.shape = {}'.format(X.shape))
print('y.shape = {}'.format(y.shape))

X.shape = (40479, 25)
y.shape = (40479, 17)


In [10]:
# Test design matrix: same column selection as was used to build X.
X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))

# One output column per tag.
n_classes = y.shape[1]

# Train and predict with one-vs-all strategy:
# y_pred[i, k] will hold the predicted probability of tag k for test image i.
y_pred = np.zeros((len(X_test), n_classes))

In [11]:
print('Training and making predictions')
# One independent binary classifier per tag (one-vs-all).
for class_i in tqdm(range(n_classes), miniters=1):
    model = xgb.XGBClassifier(
        # tree shape / boosting schedule
        max_depth=5,
        learning_rate=0.1,
        n_estimators=100,
        # runtime
        silent=True,
        objective='binary:logistic',
        nthread=-1,
        # split constraints
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        # row / column sampling
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=1,
        # regularisation and class balance
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        # initialisation
        base_score=0.5,
        seed=random_seed,
        missing=None,
    )
    # Fit on the 0/1 column of this tag only.
    model.fit(X, y[:, class_i])
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred[:, class_i] = model.predict_proba(X_test)[:, 1]


  0%|          | 0/17 [00:00<?, ?it/s]

Training and making predictions


100%|██████████| 17/17 [00:17<00:00,  1.06s/it]


In [12]:
# Turn each row of probabilities into a space-separated tag string, keeping
# every tag whose predicted probability exceeds 0.21.
preds = [
    ' '.join(tag for tag, prob in zip(labels, y_pred_row) if prob > 0.21)
    for y_pred_row in y_pred
]

In [13]:
# Two-column submission table: image identifier and its predicted tag string.
subm = pd.DataFrame(
    {'image_name': test_features.image_name.values, 'tags': preds},
    columns=['image_name', 'tags'],
)
# Written without the index so the file contains only the two named columns.
subm.to_csv('submission.csv', index=False)

In [None]:
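# NOTE(review): leftover hold-out validation sketch, kept for reference only.
# It is not runnable as written: x_valid / y_valid are not defined in any cell
# shown in this notebook, and the last line thresholds y_pred (the test-set
# predictions) rather than p_valid -- presumably p_valid was intended; confirm
# before reviving.
# p_valid = model.predict(x_valid, batch_size=128)
# print(y_valid)
# print(p_valid)
# print(fbeta_score(y_valid, y_pred > 0.2, beta=2, average='samples'))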