In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import time
import datetime
import pandas as pd

#features = read_csv('train/regions-2017-04-17.csv', header = 0)

data1 = pd.read_csv('train/regions-2017-03-06.csv', header = 0)
data2 = pd.read_csv('train/regions-2017-03-13.csv', header = 0)
data3 = pd.read_csv('train/regions-2017-03-20.csv', header = 0)
data4 = pd.read_csv('train/regions-2017-03-27.csv', header = 0)
data5 = pd.read_csv('train/regions-2017-04-03.csv', header = 0)
data6 = pd.read_csv('train/regions-2017-04-10.csv', header = 0)
data7 = pd.read_csv('train/regions-2017-04-17.csv', header = 0)
data8 = pd.read_csv('train/regions-2017-05-01.csv', header = 0)
data9 = pd.read_csv('train/regions-2017-04-24.csv', header = 0)

features = data1.append(data2, ignore_index=True)
features = features.append(data3, ignore_index=True)
features = features.append(data4, ignore_index=True)
features = features.append(data5, ignore_index=True)
features = features.append(data6, ignore_index=True)
features = features.append(data7, ignore_index=True)
features = features.append(data8, ignore_index=True)
features = features.append(data9, ignore_index=True)

#frames = [data1, data2]
#result = pd.concat(frames, ignore_index=True)


X = features.drop(['master_id','worked'], axis=1)
y = features['worked']
print('{0} train samples loaded'.format(len(X)))

60947 train samples loaded


In [2]:
ohe = OneHotEncoder(sparse=False)
encoded_dow = ohe.fit_transform(X['dow'].values.reshape(-1, 1))
tmp = pd.DataFrame(encoded_dow, columns=['dow_' + str(i) for i in range(encoded_dow.shape[1])])
X = pd.concat([X, tmp], axis = 1)
X = X.drop(['dow'], axis = 1)

In [65]:
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 20, 30, 40, 50, 100, 200, 300]

print()
print('Gradient boosting fitting:')

for t in trees:
    start_time = datetime.datetime.now()
    gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=t, verbose=False, random_state=241)
    scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
    time_to_fit = datetime.datetime.now() - start_time
    print('Trees {0}, Time to fit: {1}, ROC-AUC: {2}'.format(t, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Time to fit: 0:00:07.770303, ROC-AUC: 0.8183966764342119
Trees 20, Time to fit: 0:00:10.172802, ROC-AUC: 0.8276426880606657
Trees 30, Time to fit: 0:00:13.718650, ROC-AUC: 0.8305882008799808
Trees 40, Time to fit: 0:00:17.274143, ROC-AUC: 0.8328799253473728
Trees 50, Time to fit: 0:00:20.710310, ROC-AUC: 0.8343960374568372
Trees 100, Time to fit: 0:00:38.660456, ROC-AUC: 0.8391785034295873
Trees 200, Time to fit: 0:01:14.974125, ROC-AUC: 0.8439178100290425
Trees 300, Time to fit: 0:01:50.548335, ROC-AUC: 0.8463825606665141


In [4]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=200, verbose=False, random_state=241)
final_gb_clf.fit(X, y)

coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features':names, 'importance':coeff})
print(df.sort('importance'))

                     features  importance
38              iz_ruk_v_ruki    0.000000
20                    new_flg    0.000000
34                    admitad    0.000000
47                        ekb    0.000000
29                 tradeleads    0.000000
37      rabota_ucheba_service    0.000000
46                        spb    0.000518
10                    ukr_flg    0.000543
35                     vk_cpc    0.000554
41                  actionpay    0.000658
12                    mol_flg    0.000759
25                 targetmail    0.000808
22                    pro_flg    0.001090
9                     rus_flg    0.001090
52                      dow_4    0.001148
13                   asia_flg    0.001213
28                   mytarget    0.001265
49                      dow_1    0.001305
23                   crm_hire    0.001485
36                     yandex    0.002233
33          rabota_i_zarplata    0.002313
31                   internet    0.002355
21                    spe_flg    0



# Тестирование модели на отложенной выборке

In [73]:
test_data = read_csv('train/regions-2017-04-24.csv', header = 0)

X_test = test_data.drop(['master_id','worked'], axis=1)
y_test = test_data['worked']


ohe = OneHotEncoder(sparse=False)
encoded_dow = ohe.fit_transform(X_test['dow'].values.reshape(-1, 1))
tmp = pd.DataFrame(encoded_dow, columns=['dow_' + str(i) for i in range(encoded_dow.shape[1])])
X_test = pd.concat([X_test, tmp], axis = 1)
X_test = X_test.drop(['dow'], axis = 1)

In [74]:
from sklearn.metrics import roc_auc_score
y_pred = final_gb_clf.predict_proba(X_test)[:,1]

print(roc_auc_score(y_test, y_pred))

0.847328402226


In [48]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

#print(categories)

df = DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_test
df['y_pred'] = y_pred
df['bin'] = categories
#df_work['categories'] = pd.cut(df_work['will_cancel'], bins, labels=group_names)

df.to_csv('bins.csv')
#print(len(df_work))

[90-100, 80-90, 90-100, 80-90, 70-80, ..., 10-20, 10-20, 50-60, 60-70, 10-20]
Length: 6119
Categories (10, object): [0-10 < 10-20 < 20-30 < 30-40 ... 60-70 < 70-80 < 80-90 < 90-100]


# Предсказание

In [7]:
X_test = pd.read_csv('test/data-2017-05-29.csv', index_col=[0,1])

ohe = OneHotEncoder(sparse=False)
encoded_dow = ohe.fit_transform(X_test['dow'].values.reshape(-1, 1))
tmp = pd.DataFrame(encoded_dow, columns=['dow_' + str(i) for i in range(encoded_dow.shape[1])], index = X_test.index)

X_test = pd.concat([X_test, tmp], axis = 1)
X_test = X_test.drop(['dow'], axis = 1)

df = pd.DataFrame(index=X_test.index, columns=['worked'])
df['worked'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predictions/prediction-2017-05-29.csv')
print('Output in YYYY-MM-DD.csv')

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [101]:
X_test = read_csv('test/data-2017-05-16-nans.csv', index_col=[0,1])
tmp = X_test['days_since_last_dow_order'].fillna(X_test['days_since_last_dow_order'].max())
X_test['days_since_last_dow_order'] = tmp

print(X_test.head())

X_test = X_test.fillna(-1)


ohe = OneHotEncoder(sparse=False)
encoded_dow = ohe.fit_transform(X_test['dow'].values.reshape(-1, 1))
tmp = pd.DataFrame(encoded_dow, columns=['dow_' + str(i) for i in range(encoded_dow.shape[1])], index = X_test.index)
X_test = pd.concat([X_test, tmp], axis = 1)
X_test = X_test.drop(['dow'], axis = 1)

print(X_test.head())

                      all_orders  days_since_last_dow_order  dow  dow_worked  \
master_id date                                                                 
9532      2017-05-21           1                      521.0    0         NaN   
          2017-05-22           1                        9.0    1         1.0   
          2017-05-23           1                      521.0    2         NaN   
          2017-05-17           1                      521.0    3         NaN   
          2017-05-18           1                      521.0    4         NaN   

                      total_dow  dow_worked_prc  age  rating  \
master_id date                                                 
9532      2017-05-21          2             NaN   46     0.0   
          2017-05-22          3        0.333333   46     0.0   
          2017-05-23          3             NaN   46     0.0   
          2017-05-17          2             NaN   46     0.0   
          2017-05-18          2             NaN   46   

In [102]:
df = DataFrame(index=X_test.index, columns=['worked'])
df['worked'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predictions/prediction-2017-05-16-nas.csv')
print('Output in YYYY-MM-DD.csv')

Output in YYYY-MM-DD.csv


In [103]:
#df[index[0] == 125779 ]
print(df.iloc[df.index.get_level_values('master_id') == 121437])
#print(df.where(df.index[]==125779))
#df.head(10)

                        worked
master_id date                
121437    2017-05-21  0.430620
          2017-05-22  0.878362
          2017-05-23  0.010991
          2017-05-17  0.827917
          2017-05-18  0.847496
          2017-05-19  0.290889
          2017-05-20  0.326732
