In [34]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from six.moves import cPickle as pickle
import numpy as np
import time
import datetime
import pandas as pd

features = pd.read_csv('train/aidata.csv', header = 0, dtype={'segments': np.str})
X = features.drop(['order_id','cancel', 'dow', 'segments'], axis=1)
y = features['cancel']
print('{0} train samples loaded'.format(len(X)))

print(features.dtypes)

32788 train samples loaded
order_id                               int64
total_cents                            int64
subtotal_cents                         int64
discount                               int64
discount_prc                         float64
cash_flg                               int64
auto_flg                               int64
dow                                    int64
hot                                    int64
additionals_num                        int64
day_slot_flg                           int64
has_comment                            int64
subscription_flg                       int64
lat                                  float64
lng                                  float64
creation_start_hours                   int64
cancel                                 int64
paid_all                               int64
canceled_all_nt                        int64
canceled_all_nt_share                float64
paid_16d                               int64
paid_30d                    

In [18]:
segments = []

for i in range(len(y)):
    segments_tmp = str(X['segments'][i]).split('/')
    for s in segments_tmp:
        if s not in segments:
            segments.append(s)

#print(segments)

dic_df = pd.DataFrame(columns=segments, index=X.index)

In [21]:
for i in range(len(y)):
    segments_tmp = str(X['segments'][i]).split('/')
    for s in segments_tmp:
        try:
            dic_df[s][i] = 1
        except KeyError:
            pass
        
print(dic_df.shape)

(32788, 1221)


In [22]:
#X = X.merge(aidata, how='left', on = 'tracking_id')

X = pd.concat([X, dic_df], axis=1)

print(X.shape)

(32788, 1254)


In [26]:
X = X.drop(['segments'], axis=1)

In [27]:
X.shape

(32788, 1253)

# Preprocessing

In [35]:
#def fill_na_zero(df, col_name):
#    df[col_name] = df[col_name].fillna(df[col_name].mean())
#    df = df.fillna(0)
#    #df = df.fillna(df.mean())
#    return df

def fill_na_mean(df, col_names):
    for c in col_names:
        df[c] = df[c].fillna(df[c].mean())
    return df

def le_fit_transform(df, col_name):
    le = LabelEncoder()
    le.fit(df[col_name])
    df[col_name] = le.transform(df[col_name])
    return df, le

def le_transform(df, col_name, le):
    df[col_name] = le.transform(df[col_name])
    return df

def ohc_fit_transform(df, col_name):
    ohe = OneHotEncoder(sparse=False)
    ohe.fit(df[col_name].values.reshape(-1, 1))
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    #encoded_col = ohe.fit_transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df, ohe

def ohc_transform(df, col_name, ohe):
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df

def exponentiation(df, cols):
    for c in cols:
        df[c + '_sq'] = df[c]**2
        df[c + '_sqrt'] = np.sqrt(df[c])
    return df
        

 
X = fill_na_mean(X, ['shifts_num', 'dow_paid_share'])
X = X.fillna(0)


#X, le_type = le_fit_transform(X, 'type')
#X, ohc_type = ohc_fit_transform(X, 'type')

#X, le_start_hour = le_fit_transform(X, 'start_hour')
#X, ohc_le_start_hour = ohc_fit_transform(X, 'start_hour')

#X, le_creation_mean = le_fit_transform(X, 'creation_mean')
#X, ohc_creation_mean = ohc_fit_transform(X, 'creation_mean')

#X, le_payment_type = le_fit_transform(X, 'payment_type')
#X, ohc_payment_type = ohc_fit_transform(X, 'payment_type')

# X = exponentiation(X, ['paid_all','canceled_all_nt','paid_16d', 'paid_30d', 'paid_60d', 'canceled_16d_nt','canceled_30d_nt', 'canceled_60d_nt', 'dow_paid', 'dow_canceled'])

In [32]:
tmp = X.sum(axis=0)[X.sum(axis=0) >100]
# print(tmp)
#for t in tmp:
#    if t == 0:
#        print(t)
del_list = []
for c in X.columns:
    if X[c].sum(axis=0) < 100:
        del_list.append(c)
        
#for c in X.columns:
#    if X[c].sum(axis=0) > 800:
#        del_list.append(c)
 
print(len(del_list))
X = X.drop(del_list, axis=1)

686


In [36]:
folds = 3
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 20]
depths = [8, 12, 16, 20]
rates = [0.2]


print()
print('Gradient boosting fitting:')

for t in trees:
    for d in depths:
        for r in rates:
            start_time = datetime.datetime.now()
            gb_clf = GradientBoostingClassifier(learning_rate=r, n_estimators=t, verbose=False, random_state=241, max_depth = d)
            scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
            time_to_fit = datetime.datetime.now() - start_time
            print('Trees {0}, Depth {1}, Learning Rate {2}, Time to fit: {3}, ROC-AUC: {4}'.format(t, d, r, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Depth 8, Learning Rate 0.2, Time to fit: 0:00:08.536340, ROC-AUC: 0.8313103294854534
Trees 10, Depth 12, Learning Rate 0.2, Time to fit: 0:00:33.581750, ROC-AUC: 0.8363150981108888
Trees 10, Depth 16, Learning Rate 0.2, Time to fit: 0:01:40.018537, ROC-AUC: 0.8248618428293403
Trees 10, Depth 20, Learning Rate 0.2, Time to fit: 0:01:40.499153, ROC-AUC: 0.7964070930406068
Trees 20, Depth 8, Learning Rate 0.2, Time to fit: 0:00:17.752863, ROC-AUC: 0.8393904115914351
Trees 20, Depth 12, Learning Rate 0.2, Time to fit: 0:01:10.039264, ROC-AUC: 0.8482884635838007
Trees 20, Depth 16, Learning Rate 0.2, Time to fit: 0:02:56.001169, ROC-AUC: 0.8383835024019454
Trees 20, Depth 20, Learning Rate 0.2, Time to fit: 0:04:43.019465, ROC-AUC: 0.812052091834126


# Model Training

In [77]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=70, max_depth=5, verbose=False, random_state=241)
final_gb_clf.fit(X, y)


coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features':names, 'importance':coeff})
print(df.sort('importance'))

                             features  importance
6                                 hot    0.002019
20                    canceled_16d_nt    0.005088
17                           paid_16d    0.005613
8                        day_slot_flg    0.008072
18                           paid_30d    0.008320
4                            cash_flg    0.009337
5                            auto_flg    0.010809
24                canceled_onetime_nt    0.011134
9                         has_comment    0.011813
10                   subscription_flg    0.012483
29                       dow_canceled    0.013205
21                    canceled_30d_nt    0.015207
22                    canceled_60d_nt    0.015371
7                     additionals_num    0.015371
15                    canceled_all_nt    0.016124
28                           dow_paid    0.016677
3                        discount_prc    0.020242
31                 dow_all_paid_share    0.022859
19                           paid_60d    0.025745




In [73]:
df.to_csv('features_imp.csv')

In [207]:
le_creation_mean.classes_

array(['admin', 'auto', 'crm', 'ios', 'reactivation', 'web_new_flow'], dtype=object)

### Dump model to pickle

In [7]:
#file_name = 'cancellations_model.pickle'
#with open(file_name, 'wb') as f:
#    pickle.dump(final_gb_clf, f, pickle.HIGHEST_PROTOCOL)

In [4]:
pickle_file = 'cancellations_model.pickle'
try:
    f = open(pickle_file, 'wb')
    save = {
        'model': final_gb_clf,
        #'ohc_dow': ohc_dow,
        #'le_type': le_type,
        #'ohc_type': ohc_type,
        #'le_creation_mean': le_creation_mean,
        #'ohc_creation_mean': ohc_creation_mean,
        #'le_payment_type': le_payment_type,
        #'ohc_payment_type': ohc_payment_type,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Тест на отложенной выборке

In [7]:
features = pd.read_csv('test/test-20-29-jun.csv', header = 0)

X_test = features.drop(['order_id','cancel', 'dow'], axis=1)


# X_test = fill_na(X_test, 'shifts_num')

X_test = fill_na_mean(X_test, ['shifts_num', 'dow_paid_share'])

X_test = X_test.fillna(-999)




#X_test = ohc_transform(X_test, 'dow', ohc_dow)

#X_test = le_transform(X_test, 'type', le_type)
#X_test = ohc_transform(X_test, 'type', ohc_type)

#X_test = le_transform(X_test, 'creation_mean', le_creation_mean)
#X_test = ohc_transform(X_test, 'creation_mean', ohc_creation_mean)

#X_test = le_transform(X_test, 'payment_type', le_payment_type)
#X_test = ohc_transform(X_test, 'payment_type', ohc_payment_type)

# X_test = exponentiation(
#    X_test, ['paid_all','canceled_all','paid_16d','canceled_16d', 'paid_30d','canceled_30d', 'paid_60d','canceled_60d'])


y_true = features['cancel']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


print(roc_auc_score(y_true, y_pred))

0.826187226577


### Bins

In [58]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_true
df['y_pred'] = y_pred
df['bin'] = categories

df.to_csv('bins_nt_model-2017-06-29.csv')

# Prediction

In [218]:
features = pd.read_csv('predict/2017-07-01.csv', header = 0, index_col='order_id')
#X_test = read_csv('test/test-2017.csv', index_col=[0,1,2,3])




# Preprocessing
X_test = features

X_test = features.drop(['dow'], axis=1)
X_test = fill_na_mean(X_test, ['shifts_num', 'dow_paid_share'])
X_test = X_test.fillna(0)



#y_true = features['cancel']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


#print(roc_auc_score(y_true, y_pred))
print(y_pred)


df = pd.DataFrame(index=X_test.index, columns=['will_cancel'])
df['will_cancel'] = final_gb_clf.predict_proba(X_test)[:,1]
df.to_csv('predict/predict-2017-07-01.csv')
print('DONE')

[ 0.47702018  0.50489465  0.47570555 ...,  0.5580376   0.10706268
  0.17947012]
DONE
