In [2]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from six.moves import cPickle as pickle
import numpy as np
import time
import datetime
import pandas as pd

features = pd.read_csv('train/train_small.csv', header = 0)
X = features.drop(['order_id','cancel', 'dow'], axis=1)
y = features['cancel']
print('{0} train samples loaded'.format(len(X)))

#print(X.head())

51761 train samples loaded


# Preprocessing

In [3]:
#def fill_na_zero(df, col_name):
#    df[col_name] = df[col_name].fillna(df[col_name].mean())
#    df = df.fillna(0)
#    #df = df.fillna(df.mean())
#    return df

def fill_na_mean(df, col_names):
    for c in col_names:
        df[c] = df[c].fillna(df[c].mean())
    return df

def le_fit_transform(df, col_name):
    le = LabelEncoder()
    le.fit(df[col_name])
    df[col_name] = le.transform(df[col_name])
    return df, le

def le_transform(df, col_name, le):
    df[col_name] = le.transform(df[col_name])
    return df

def ohc_fit_transform(df, col_name):
    ohe = OneHotEncoder(sparse=False)
    ohe.fit(df[col_name].values.reshape(-1, 1))
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    #encoded_col = ohe.fit_transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df, ohe

def ohc_transform(df, col_name, ohe):
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df

def exponentiation(df, cols):
    for c in cols:
        df[c + '_sq'] = df[c]**2
        df[c + '_sqrt'] = np.sqrt(df[c])
    return df
        

 
X = fill_na_mean(X, ['shifts_num', 'dow_paid_share'])
X = X.fillna(-999)


#X, le_type = le_fit_transform(X, 'type')
#X, ohc_type = ohc_fit_transform(X, 'type')

#X, le_start_hour = le_fit_transform(X, 'start_hour')
#X, ohc_le_start_hour = ohc_fit_transform(X, 'start_hour')

#X, le_creation_mean = le_fit_transform(X, 'creation_mean')
#X, ohc_creation_mean = ohc_fit_transform(X, 'creation_mean')

#X, le_payment_type = le_fit_transform(X, 'payment_type')
#X, ohc_payment_type = ohc_fit_transform(X, 'payment_type')

# X = exponentiation(X, ['paid_all','canceled_all_nt','paid_16d', 'paid_30d', 'paid_60d', 'canceled_16d_nt','canceled_30d_nt', 'canceled_60d_nt', 'dow_paid', 'dow_canceled'])

In [3]:
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 20, 30, 50, 100]
depths = [3, 5, 6, 7]
rates = [0.001, 0.002, 0.1, 0.2, 0.5, 1]


print()
print('Gradient boosting fitting:')

for t in trees:
    for d in depths:
        for r in rates:
            start_time = datetime.datetime.now()
            gb_clf = GradientBoostingClassifier(learning_rate=r, n_estimators=t, verbose=False, random_state=241, max_depth = d)
            scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
            time_to_fit = datetime.datetime.now() - start_time
            print('Trees {0}, Depth {1}, Learning Rate {2}, Time to fit: {3}, ROC-AUC: {4}'.format(t, d, r, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Depth 3, Learning Rate 0.001, Time to fit: 0:00:04.122871, ROC-AUC: 0.7271299792686674
Trees 10, Depth 3, Learning Rate 0.002, Time to fit: 0:00:03.866373, ROC-AUC: 0.7278862766565259
Trees 10, Depth 3, Learning Rate 0.1, Time to fit: 0:00:03.839212, ROC-AUC: 0.7729221063398933
Trees 10, Depth 3, Learning Rate 0.2, Time to fit: 0:00:03.871731, ROC-AUC: 0.7863959759201512
Trees 10, Depth 3, Learning Rate 0.5, Time to fit: 0:00:03.712033, ROC-AUC: 0.7967224708334385
Trees 10, Depth 3, Learning Rate 1, Time to fit: 0:00:03.668042, ROC-AUC: 0.7986422978851025
Trees 10, Depth 5, Learning Rate 0.001, Time to fit: 0:00:09.143151, ROC-AUC: 0.7672000683942325


KeyboardInterrupt: 

# Model Training

In [5]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=70, max_depth=5, verbose=False, random_state=241)
final_gb_clf.fit(X, y)


coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features':names, 'importance':coeff})
print(df.sort_values('importance', ascending = False))

                             features  importance
13               creation_start_hours    0.128751
27         days_since_last_paid_order    0.074691
11                                lat    0.067812
30                     dow_paid_share    0.064652
12                                lng    0.064252
16              canceled_all_nt_share    0.056208
23                         shifts_num    0.048922
14                           paid_all    0.043403
25      days_since_last_cancel_all_nt    0.037691
0                         total_cents    0.036303
1                      subtotal_cents    0.034955
7                     additionals_num    0.030780
2                            discount    0.028856
19                           paid_60d    0.028477
31                 dow_all_paid_share    0.028439
3                        discount_prc    0.024146
29                       dow_canceled    0.020385
26  days_since_last_cancel_onetime_nt    0.019831
15                    canceled_all_nt    0.017963


In [207]:
le_creation_mean.classes_

array(['admin', 'auto', 'crm', 'ios', 'reactivation', 'web_new_flow'], dtype=object)

### Dump model to pickle

In [7]:
#file_name = 'cancellations_model.pickle'
#with open(file_name, 'wb') as f:
#    pickle.dump(final_gb_clf, f, pickle.HIGHEST_PROTOCOL)

In [6]:
pickle_file = 'cancellations_model.pickle'
try:
    f = open(pickle_file, 'wb')
    save = {
        'model': final_gb_clf,
        #'ohc_dow': ohc_dow,
        #'le_type': le_type,
        #'ohc_type': ohc_type,
        #'le_creation_mean': le_creation_mean,
        #'ohc_creation_mean': ohc_creation_mean,
        #'le_payment_type': le_payment_type,
        #'ohc_payment_type': ohc_payment_type,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Тест на отложенной выборке

In [7]:
features = pd.read_csv('test/test-20-29-jun.csv', header = 0)

X_test = features.drop(['order_id','cancel', 'dow'], axis=1)


# X_test = fill_na(X_test, 'shifts_num')

X_test = fill_na_mean(X_test, ['shifts_num', 'dow_paid_share'])

X_test = X_test.fillna(-999)




#X_test = ohc_transform(X_test, 'dow', ohc_dow)

#X_test = le_transform(X_test, 'type', le_type)
#X_test = ohc_transform(X_test, 'type', ohc_type)

#X_test = le_transform(X_test, 'creation_mean', le_creation_mean)
#X_test = ohc_transform(X_test, 'creation_mean', ohc_creation_mean)

#X_test = le_transform(X_test, 'payment_type', le_payment_type)
#X_test = ohc_transform(X_test, 'payment_type', ohc_payment_type)

# X_test = exponentiation(
#    X_test, ['paid_all','canceled_all','paid_16d','canceled_16d', 'paid_30d','canceled_30d', 'paid_60d','canceled_60d'])


y_true = features['cancel']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


print(roc_auc_score(y_true, y_pred))

0.826187226577


### Bins

In [58]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_true
df['y_pred'] = y_pred
df['bin'] = categories

df.to_csv('bins_nt_model-2017-06-29.csv')

# Prediction

In [218]:
features = pd.read_csv('predict/2017-07-01.csv', header = 0, index_col='order_id')
#X_test = read_csv('test/test-2017.csv', index_col=[0,1,2,3])




# Preprocessing
X_test = features

X_test = features.drop(['dow'], axis=1)
X_test = fill_na_mean(X_test, ['shifts_num', 'dow_paid_share'])
X_test = X_test.fillna(0)



#y_true = features['cancel']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


#print(roc_auc_score(y_true, y_pred))
print(y_pred)


df = pd.DataFrame(index=X_test.index, columns=['will_cancel'])
df['will_cancel'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predict/predict-2017-07-01.csv')
print('DONE')

[ 0.47702018  0.50489465  0.47570555 ...,  0.5580376   0.10706268
  0.17947012]
DONE
