In [1]:
from pandas import read_csv, DataFrame
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
import numpy as np
import time
import datetime
import pandas as pd

features = read_csv('train/3train.csv', header = 0)
X = features.drop(['order_id','canceled'], axis=1)
y = features['canceled']
print('{0} train samples loaded'.format(len(X)))

27949 train samples loaded


# Parameters Fitting

In [2]:
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 20, 30, 40, 50, 100, 200, 300]

print()
print('Gradient boosting fitting:')

for t in trees:
    start_time = datetime.datetime.now()
    gb_clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=t, verbose=False, random_state=241)
    scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
    time_to_fit = datetime.datetime.now() - start_time
    print('Trees {0}, Time to fit: {1}, ROC-AUC: {2}'.format(t, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Time to fit: 0:00:01.709924, ROC-AUC: 0.8695068534367942
Trees 20, Time to fit: 0:00:03.352428, ROC-AUC: 0.8771347673025971
Trees 30, Time to fit: 0:00:04.197216, ROC-AUC: 0.8807485293279574
Trees 40, Time to fit: 0:00:05.201099, ROC-AUC: 0.8841870519608067
Trees 50, Time to fit: 0:00:06.533552, ROC-AUC: 0.8866767653693735
Trees 100, Time to fit: 0:00:12.779614, ROC-AUC: 0.892327143128616
Trees 200, Time to fit: 0:00:24.689046, ROC-AUC: 0.8952788689035687
Trees 300, Time to fit: 0:00:37.020866, ROC-AUC: 0.8959723303550309


# Train Final Model

In [3]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, verbose=False, random_state=241)
final_gb_clf.fit(X, y)

coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = DataFrame({'features':names, 'importance':coeff})
print(df.sort('importance'))

                       features  importance
33                 evening_slot    0.000000
27                microwave_flg    0.000000
11                      ios_flg    0.000000
20             refrigerator_flg    0.000000
15             subscription_flg    0.000000
32                     day_slot    0.000000
24         kitchen_cabinets_flg    0.000000
25                tableware_flg    0.000274
26                balconies_flg    0.000571
6                      cash_flg    0.000609
31                 morning_slot    0.000730
21                     oven_flg    0.001353
10                      web_flg    0.001689
22                  windows_fls    0.001696
5                     bathrooms    0.002154
23                  ironing_flg    0.003603
30                  weekend_flg    0.004353
14                  has_comment    0.006217
4                         rooms    0.006284
13       has_active_credit_card    0.010454
28            keys_delivery_flg    0.012140
1                 credits_cents 



# Тест на отложенной выборке

In [4]:
features = read_csv('test/test-old-16-19-may.csv', header = 0)

X_test = features.drop(['order_id','canceled'], axis=1)

y_true = features['canceled']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


print(roc_auc_score(y_true, y_pred))

0.874464243861


### Bins

In [7]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

df = DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_true
df['y_pred'] = y_pred
df['bin'] = categories

df.to_csv('bins_old_model.csv')

# Prediction

In [4]:
X_test = read_csv('predict/old_2017-06-30.csv', index_col=[0])

df = DataFrame(index=X_test.index, columns=['will_cancel'])
df['will_cancel'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predict/predict-old-2017-06-30.csv')
print('Output in YYYY-MM-DD.csv')

Output in YYYY-MM-DD.csv


In [16]:
import pandas as pd
df_work = df[df['will_cancel']>=0.5]
bins = [0.5, 0.65, 0.80, 1.0]
group_names = ['green', 'yellow', 'red']

categories = pd.cut(df_work['will_cancel'], bins, labels=group_names)
df_work['categories'] = pd.cut(df_work['will_cancel'], bins, labels=group_names)

df_work.to_csv('predictions/prediction-2017-05-02.csv')
print(len(df_work))

230


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
