In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import time
import datetime
import pandas as pd

features = pd.read_csv('train/train_medium.csv', header = 0)

X = features.drop(['master_id','worked', 'obs_date'], axis=1)
y = features['worked']
print('{0} train samples loaded'.format(len(X)))

130847 train samples loaded


# Preprocessing
### Fill na

In [10]:
X = X.fillna(-999)

In [12]:
def preprocessing_fillna(df):
    df['days_since_last_dow_order'] = df['days_since_last_dow_order'].fillna(df['days_since_last_dow_order'].mean())
    df = df.fillna(0)
    return df
    
X = preprocessing_fillna(X)

### One hot coding

In [11]:
def preprocessing_one_hot_coding(df, col_name):
    ohe = OneHotEncoder(sparse=False)
    encoded_col = ohe.fit_transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df

X = preprocessing_one_hot_coding(X, 'dow')



# Hiperparameters Fitting

In [12]:
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 20, 30, 40, 50, 100, 200]

print()
print('Gradient boosting fitting:')

for t in trees:
    start_time = datetime.datetime.now()
    gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=t, verbose=False, random_state=241)
    scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
    time_to_fit = datetime.datetime.now() - start_time
    print('Trees {0}, Time to fit: {1}, ROC-AUC: {2}'.format(t, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Time to fit: 0:00:07.936481, ROC-AUC: 0.7892693529583155
Trees 20, Time to fit: 0:00:13.785848, ROC-AUC: 0.7929986515529099
Trees 30, Time to fit: 0:00:24.108282, ROC-AUC: 0.7947015381334652
Trees 40, Time to fit: 0:00:25.219615, ROC-AUC: 0.7962994581768762
Trees 50, Time to fit: 0:00:30.550247, ROC-AUC: 0.7974914174124279
Trees 100, Time to fit: 0:00:58.020148, ROC-AUC: 0.8004247721330635
Trees 200, Time to fit: 0:01:54.498115, ROC-AUC: 0.8021488558454898


KeyboardInterrupt: 

# Model Fitting

In [13]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, verbose=False, random_state=241)
final_gb_clf.fit(X, y)

coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features':names, 'importance':coeff})
print(df.sort('importance'))

                     features  importance
15                       dow4    0.000000
11                       dow0    0.002036
16                       dow5    0.008661
14                       dow3    0.012875
13                       dow2    0.021521
12                       dow1    0.022811
4                   orders_4w    0.024293
17                       dow6    0.026248
5                   orders_2m    0.030865
3                   orders_3w    0.037776
6                   orders_3m    0.050637
2                   orders_2w    0.057260
9                  dow_worked    0.076975
1                   orders_1w    0.097532
0                  all_orders    0.106569
10  days_since_last_dow_order    0.109026
8                  dow_orders    0.124275
7       days_since_last_order    0.190640




# Тестирование модели на отложенной выборке

In [14]:
test_data = pd.read_csv('test/test-1-10-jul.csv', header = 0)

X_test = test_data.drop(['master_id','worked', 'obs_date'], axis=1)
y_test = test_data['worked']

# Preprocessing
X_test = X_test.fillna(-999)
X_test = preprocessing_one_hot_coding(X_test, 'dow')

In [15]:
from sklearn.metrics import roc_auc_score
y_pred = final_gb_clf.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred))

0.804494384103


### Bins

In [29]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_test
df['y_pred'] = y_pred
df['bin'] = categories

df.to_csv('bins.csv')

# Prediction

In [15]:
X_test = pd.read_csv('test/data-2017-06-21.csv', index_col=[0,1])

# Preprocessing
X_test = preprocessing_fillna(X_test)
X_test = preprocessing_one_hot_coding(X_test, 'dow')

# Prediction
df = pd.DataFrame(index=X_test.index, columns=['worked'])
df['worked'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predictions/prediction-2017-06-21-nan.csv')
print('Output in YYYY-MM-DD.csv')

Output in YYYY-MM-DD.csv


In [28]:
print(df.iloc[df.index.get_level_values('master_id') == 121437])

                        worked
master_id date                
121437    2017-05-21  0.474165
          2017-05-22  0.004951
          2017-05-23  0.005757
          2017-05-17  0.005488
          2017-05-18  0.004869
          2017-05-19  0.155925
          2017-05-20  0.220224
