In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import time
import datetime
import pandas as pd

# Training snapshots, listed in the order they are stacked into `features`.
train_files = [
    'train/nan-49.csv',
    'train/nan-42.csv',
    'train/nan-35.csv',
    'train/nan-28.csv',
    'train/nan-21.csv',
    'train/nan-14.csv',
    'train/nan-0.csv',
    'train/regions-2017-05-01.csv',
    'train/regions-2017-04-24.csv',
]

# A single concat replaces the chain of DataFrame.append calls: append is gone
# from current pandas and re-copied the accumulated frame on every call.
train_frames = [pd.read_csv(path, header=0) for path in train_files]
features = pd.concat(train_frames, ignore_index=True)

# 'master_id' is an identifier and 'worked' is the target; everything else is
# used as a model feature.
X = features.drop(['master_id', 'worked'], axis=1)
y = features['worked']
print('{0} train samples loaded'.format(len(X)))

64519 train samples loaded


# Preprocessing
### Fill na

In [12]:
def preprocessing_fillna(df, dow_order_fill_value=None):
    """Return a copy of ``df`` with all missing values imputed.

    The 'days_since_last_dow_order' column is filled with
    ``dow_order_fill_value``; every other missing value is filled with 0.
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'days_since_last_dow_order' column.
    dow_order_fill_value : scalar, optional
        Value used for missing 'days_since_last_dow_order' entries. When
        omitted, the mean of that column in ``df`` is used. Pass the value
        computed on the training data to impute other data sets consistently.

    Returns
    -------
    pandas.DataFrame
        New frame without missing values.

    Raises
    ------
    KeyError
        If ``df`` has no 'days_since_last_dow_order' column.
    """
    col = 'days_since_last_dow_order'
    if dow_order_fill_value is None:
        dow_order_fill_value = df[col].mean()

    # Work on a copy so the caller's frame is not modified as a side effect.
    filled = df.copy()
    filled[col] = filled[col].fillna(dow_order_fill_value)
    return filled.fillna(0)
    
# Impute missing values in the training features; X is rebound to the result.
X = preprocessing_fillna(X)

### One-hot encoding

In [13]:
def preprocessing_one_hot_coding(df, col_name, categories=None):
    """Replace column ``col_name`` with one 0.0/1.0 indicator column per category.

    Indicator columns are named ``col_name + str(i)``, where ``i`` is the
    position of the category in the (sorted) category list, and are appended
    after the remaining columns of ``df``. The input frame is not modified.

    The encoding is done with pandas/numpy only, so it does not depend on the
    keyword arguments of any particular scikit-learn ``OneHotEncoder`` version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Name of the categorical column to encode.
    categories : iterable, optional
        Full, ordered list of category values. When omitted, the sorted
        distinct non-null values found in ``df[col_name]`` are used, so two
        frames with different observed values get different columns. Pass the
        same list for training and test data to guarantee identical columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``col_name`` plus the float indicator columns. Rows
        whose value is missing get 0.0 in every indicator column.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    ValueError
        If ``categories`` is given and ``df[col_name]`` holds a non-null value
        that is not listed in it.
    """
    values = df[col_name]

    if categories is None:
        categories = sorted(values.dropna().unique())
    else:
        categories = list(categories)
        unknown = values[values.notnull() & ~values.isin(categories)]
        if len(unknown) > 0:
            raise ValueError(
                'Column {0!r} contains values outside the given categories: {1}'.format(
                    col_name, sorted(unknown.unique())))

    new_names = [col_name + str(i) for i in range(len(categories))]
    # Plain arrays (not Series) so the result is attached to df.index
    # positionally, which also works when the index has duplicate labels.
    indicators = {
        name: (values == category).values.astype(float)
        for name, category in zip(new_names, categories)
    }
    encoded = pd.DataFrame(indicators, index=df.index, columns=new_names)

    return pd.concat([df.drop([col_name], axis=1), encoded], axis=1)

# Swap the 'dow' column for its indicator columns. Not re-runnable as is: the
# returned frame no longer has a 'dow' column, so a second run raises KeyError.
X = preprocessing_one_hot_coding(X, 'dow')



# Hyperparameter Fitting

In [17]:
# Cross-validation setup. KFold is called with the signature of the
# sklearn.cross_validation module imported at the top of the notebook.
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)

# Ensemble sizes to compare.
trees = [10, 20, 30, 40, 50, 100, 200, 300]

print()
print('Gradient boosting fitting:')

for t in trees:
    # The clock starts before the estimator is built, so the reported time
    # covers construction plus the whole cross-validation run.
    start_time = datetime.datetime.now()
    gb_clf = GradientBoostingClassifier(
        learning_rate=0.2,
        n_estimators=t,
        verbose=False,
        random_state=241,
    )
    scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
    time_to_fit = datetime.datetime.now() - start_time

    mean_auc = scores.mean()
    print('Trees {0}, Time to fit: {1}, ROC-AUC: {2}'.format(t, time_to_fit, mean_auc))


Gradient boosting fitting:
Trees 10, Time to fit: 0:00:05.786110, ROC-AUC: 0.8449358302883923
Trees 20, Time to fit: 0:00:08.710379, ROC-AUC: 0.8513433607143608
Trees 30, Time to fit: 0:00:12.083420, ROC-AUC: 0.8537005372654004
Trees 40, Time to fit: 0:00:14.964632, ROC-AUC: 0.8549649029470672
Trees 50, Time to fit: 0:00:17.810534, ROC-AUC: 0.8557222271274421
Trees 100, Time to fit: 0:00:32.687355, ROC-AUC: 0.858499886876776
Trees 200, Time to fit: 0:01:09.840109, ROC-AUC: 0.8614241208655683
Trees 300, Time to fit: 0:01:44.775418, ROC-AUC: 0.8631823057486023


# Model Fitting

In [14]:
# Fit the final model on the full training set.
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=200, verbose=False, random_state=241)
final_gb_clf.fit(X, y)

# Pair every feature name with its importance and list them in ascending order.
coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features': names, 'importance': coeff})
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
print(df.sort_values('importance'))

                     features  importance
29                    new_flg    0.000000
26                    mol_flg    0.000000
27                        msk    0.000000
28                   mytarget    0.000000
42                        spb    0.000000
40                    rus_flg    0.000000
4                    asia_flg    0.000000
1                     admitad    0.000000
19                        ekb    0.000000
47                    ukr_flg    0.000385
15                   crm_hire    0.000393
23              iz_ruk_v_ruki    0.000514
37          rabota_i_zarplata    0.000531
22                   internet    0.000748
46                 tradeleads    0.000864
20                     google    0.001048
48                     vk_cpc    0.001120
56                       dow4    0.001120
44                 targetmail    0.001229
0                   actionpay    0.001325
38      rabota_ucheba_service    0.001498
14                cpaexchange    0.001532
51                    ydirect    0



# Testing the model on a hold-out set

In [57]:
# Hold-out snapshot that was not part of the training files.
test_data = pd.read_csv('train/nan-7.csv', header=0)

X_test = test_data.drop(['master_id', 'worked'], axis=1)
y_test = test_data['worked']

# Preprocessing (same steps as for the training features)
X_test = preprocessing_fillna(X_test)
X_test = preprocessing_one_hot_coding(X_test, 'dow')

# Put the columns in exactly the order of the training matrix X. A column the
# model was trained on but that is missing here raises KeyError instead of
# silently shifting the feature positions.
X_test = X_test[X.columns]

In [58]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is the probability of the model's second class
# (presumably worked == 1 -- confirm against final_gb_clf.classes_).
y_pred = final_gb_clf.predict_proba(X_test)[:, 1]

holdout_auc = roc_auc_score(y_test, y_pred)
print(holdout_auc)

0.877626541806


### Bins

In [29]:
# Ten equal-width probability buckets, labelled in percent.
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

# pd.cut builds right-closed intervals, so without include_lowest a predicted
# probability of exactly 0.0 would fall outside every bucket and become NaN.
categories = pd.cut(y_pred, bins, labels=group_names, include_lowest=True)

# One row per hold-out sample, indexed like y_test.
df = pd.DataFrame(
    {'y_true': y_test.values, 'y_pred': y_pred, 'bin': categories},
    index=y_test.index,
    columns=['y_true', 'y_pred', 'bin'],
)

df.to_csv('bins.csv')

# Prediction

In [15]:
# The snapshot date appears in both the input and the output file name; keep
# it in one place so the two cannot drift apart.
prediction_date = '2017-06-21'
input_path = 'test/data-{0}.csv'.format(prediction_date)
output_path = 'predictions/prediction-{0}-nan.csv'.format(prediction_date)

# The first two CSV columns form the row index and are not used as features.
X_test = pd.read_csv(input_path, index_col=[0, 1])

# Preprocessing (same steps as for the training features)
X_test = preprocessing_fillna(X_test)
X_test = preprocessing_one_hot_coding(X_test, 'dow')

# Put the columns in exactly the order of the training matrix X. A column the
# model was trained on but that is missing here raises KeyError instead of
# silently shifting the feature positions.
X_test = X_test[X.columns]

# Prediction: column 1 of predict_proba, one row per index entry of X_test.
df = pd.DataFrame({'worked': final_gb_clf.predict_proba(X_test)[:, 1]}, index=X_test.index)
df.to_csv(output_path)
# Report the real destination instead of the former 'YYYY-MM-DD' placeholder.
print('Output in {0}'.format(output_path))

Output in YYYY-MM-DD.csv


In [28]:
# Show the predicted rows of a single master, selected by the 'master_id'
# level of the index.
master_mask = df.index.get_level_values('master_id') == 121437
print(df.loc[master_mask])

                        worked
master_id date                
121437    2017-05-21  0.474165
          2017-05-22  0.004951
          2017-05-23  0.005757
          2017-05-17  0.005488
          2017-05-18  0.004869
          2017-05-19  0.155925
          2017-05-20  0.220224
