In [36]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import time
import datetime
import pandas as pd

# Read the training table; the first row of the CSV holds the column names.
features = pd.read_csv('train/train_medium.csv', header=0)

# Target: the 'worked' column.
y = features['worked']
# Features: every column except the identifier, the target and the observation date.
X = features.drop(['master_id', 'worked', 'obs_date'], axis=1)
print('{0} train samples loaded'.format(X.shape[0]))

130937 train samples loaded


# Preprocessing
### Fill na

In [37]:
def preprocessing_fillna(df, age_fill=None):
    """Return a copy of ``df`` with all missing values filled in.

    Missing ``age`` values are replaced by ``age_fill``; every other missing
    value is replaced by the sentinel ``-999``. The input frame is left
    untouched, so the call can safely be repeated on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Must contain an ``age`` column.
    age_fill : float, optional
        Value used for missing ages. Defaults to the mean of ``df['age']``.
        Pass the training mean when imputing test or prediction data so that
        all datasets are imputed with the same value.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index and no missing values.
    """
    if age_fill is None:
        age_fill = df['age'].mean()
    # assign() builds a new frame, so the caller's data is never modified.
    filled = df.assign(age=df['age'].fillna(age_fill))
    return filled.fillna(-999)
    
# Impute missing values in the training features.
X = preprocessing_fillna(X)

### One-hot encoding

In [38]:
def preprocessing_one_hot_coding(df, col_name, categories=None):
    """Replace column ``col_name`` with one-hot indicator columns.

    The indicator columns are appended after the remaining columns and are
    named ``col_name + str(i)``, where ``i`` is the position of the category
    in ``categories``. Indicators are floats (0.0 / 1.0).

    The encoding is done with numpy/pandas only, so it does not depend on the
    keyword arguments of a particular scikit-learn ``OneHotEncoder`` version.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the categorical column to encode.
    categories : sequence, optional
        Ordered list of the category values to encode. Defaults to the sorted
        unique values found in ``df[col_name]``. Pass the categories of the
        training data when encoding test or prediction data so that every
        dataset gets the same columns with the same meaning.

    Returns
    -------
    pandas.DataFrame
        New frame without ``col_name`` and with one indicator column per category.

    Raises
    ------
    ValueError
        If the column holds a value (including NaN) that is not in ``categories``.
    """
    values = df[col_name]
    if categories is None:
        categories = np.sort(values.dropna().unique())
    categories = list(categories)

    # codes[i] is the position of values[i] in `categories`, or -1 if absent.
    codes = pd.Categorical(values, categories=categories).codes
    if (codes < 0).any():
        raise ValueError(
            "column '{0}' contains values that are not in categories {1}".format(col_name, categories))

    # Row k of the identity matrix is the one-hot vector of category k.
    encoded_col = np.eye(len(categories))[codes]
    indicator_names = [col_name + str(i) for i in range(len(categories))]
    indicators = pd.DataFrame(encoded_col, columns=indicator_names, index=df.index)
    return pd.concat([df.drop([col_name], axis=1), indicators], axis=1)

# Replace the 'dow' column of the training features with one-hot indicator columns.
X = preprocessing_one_hot_coding(X, 'dow')



# Hyperparameter Fitting

In [19]:
# Cross-validation setup: shuffled 5-fold split with a fixed seed.
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
# Ensemble sizes (n_estimators) to compare.
trees = [10, 20, 30, 40, 50, 100, 200]

print()
print('Gradient boosting fitting:')

# For every ensemble size, report the wall-clock time of the whole
# cross-validation run and the mean ROC-AUC over the folds.
for n_trees in trees:
    started_at = datetime.datetime.now()
    gb_clf = GradientBoostingClassifier(
        learning_rate=0.2,
        n_estimators=n_trees,
        verbose=False,
        random_state=241,
    )
    scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
    elapsed = datetime.datetime.now() - started_at
    print('Trees {0}, Time to fit: {1}, ROC-AUC: {2}'.format(n_trees, elapsed, scores.mean()))


Gradient boosting fitting:
Trees 10, Time to fit: 0:00:08.851143, ROC-AUC: 0.7892693529583155
Trees 20, Time to fit: 0:00:16.653323, ROC-AUC: 0.7929986515529099
Trees 30, Time to fit: 0:00:19.167418, ROC-AUC: 0.7947015381334652
Trees 40, Time to fit: 0:00:27.623608, ROC-AUC: 0.7962994581768762


KeyboardInterrupt: 

# Model Fitting

In [39]:
# Fit the final model on the full training set with 50 trees.
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=50, verbose=False, random_state=241)
final_gb_clf.fit(X, y)

# Pair every feature name with its importance and list them in ascending order.
coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features': names, 'importance': coeff})
# DataFrame.sort() no longer exists in pandas; sort_values() is the supported call.
print(df.sort_values('importance'))

                     features  importance
18                    mol_flg    0.000000
27                        ekb    0.000000
17                    blr_flg    0.000000
24                      basic    0.000000
26                        spb    0.000000
34                       dow4    0.000000
15                    rus_flg    0.000609
16                    ukr_flg    0.004518
31                       dow1    0.005090
19                   asia_flg    0.005139
4                   orders_4w    0.006610
3                   orders_3w    0.007875
30                       dow0    0.010183
22           blacklists_count    0.010404
35                       dow5    0.011313
14                        age    0.013683
28                        lat    0.013976
33                       dow3    0.014901
6                   orders_3m    0.017075
12             busy_afternoon    0.017541
5                   orders_2m    0.021454
32                       dow2    0.021517
8                  dow_orders    0



# Testing the model on a hold-out sample

In [41]:
# Read the hold-out table; the first row of the CSV holds the column names.
test_data = pd.read_csv('test/test-1-10-jul.csv', header=0)

# Split into target and features, dropping the same columns as for the training data.
y_test = test_data['worked']
X_test = test_data.drop(['master_id', 'worked', 'obs_date'], axis=1)

# Preprocessing: fill missing values, then one-hot encode 'dow'.
X_test = preprocessing_one_hot_coding(preprocessing_fillna(X_test), 'dow')

In [42]:
# roc_auc_score is already imported in the notebook's import cell, so it is
# not imported again here.
# Take the second column of predict_proba as the score for each hold-out row.
y_pred = final_gb_clf.predict_proba(X_test)[:, 1]

# ROC-AUC of the final model on the hold-out data.
print(roc_auc_score(y_test, y_pred))

0.888613453454


### Bins

In [29]:
# Ten equal-width probability bins and their labels, in percent.
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

# pd.cut builds right-closed intervals, so without include_lowest=True a
# prediction of exactly 0.0 would fall outside the first bin and become NaN.
categories = pd.cut(y_pred, bins, labels=group_names, include_lowest=True)

# One row per hold-out sample: true label, predicted probability and its bin.
df = pd.DataFrame(
    {'y_true': y_test, 'y_pred': y_pred, 'bin': categories},
    columns=['y_true', 'y_pred', 'bin'],
)

df.to_csv('bins.csv')

# Prediction

In [15]:
# Read the rows to score; the first two CSV columns form the row index.
X_test = pd.read_csv('test/data-2017-06-21.csv', index_col=[0,1])

# Preprocessing: fill missing values, then one-hot encode 'dow'.
X_test = preprocessing_fillna(X_test)
X_test = preprocessing_one_hot_coding(X_test, 'dow')

# Prediction: second column of predict_proba, keyed by the input index.
df = pd.DataFrame({'worked': final_gb_clf.predict_proba(X_test)[:, 1]}, index=X_test.index)

# Keep the path in one place so the message below always names the file
# that was actually written.
output_path = 'predictions/prediction-2017-06-21-nan.csv'
df.to_csv(output_path)
print('Output in {0}'.format(output_path))

Output in YYYY-MM-DD.csv


In [28]:
# Show the rows of df whose 'master_id' index level equals 121437.
print(df[df.index.get_level_values('master_id') == 121437])

                        worked
master_id date                
121437    2017-05-21  0.474165
          2017-05-22  0.004951
          2017-05-23  0.005757
          2017-05-17  0.005488
          2017-05-18  0.004869
          2017-05-19  0.155925
          2017-05-20  0.220224
