In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import time
import datetime
import pandas as pd

features = pd.read_csv('train/train_big.csv', header = 0)
X = features.drop(['order_id','cancel'], axis=1)
y = features['cancel']
print('{0} train samples loaded'.format(len(X)))

#print(X.head())

374200 train samples loaded


# Preprocessing

In [5]:
def fill_na(df, col_name):
    df[col_name] = df[col_name].fillna(df[col_name].mean())
    df = df.fillna(0)
    return df

def le_fit_transform(df, col_name):
    le = LabelEncoder()
    le.fit(df[col_name])
    df[col_name] = le.transform(df[col_name])
    return df, le

def le_transform(df, col_name, le):
    df[col_name] = le.transform(df[col_name])
    return df

def ohc_fit_transform(df, col_name):
    ohe = OneHotEncoder(sparse=False)
    ohe.fit(df[col_name].values.reshape(-1, 1))
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    #encoded_col = ohe.fit_transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df, ohe

def ohc_transform(df, col_name, ohe):
    encoded_col = ohe.transform(df[col_name].values.reshape(-1, 1))
    tmp = pd.DataFrame(encoded_col, columns=[col_name + str(i) for i in range(encoded_col.shape[1])], index = df.index)
    df = pd.concat([df, tmp], axis = 1)
    df = df.drop([col_name], axis = 1)
    return df

def exponentiation(df, cols):
    for c in cols:
        df[c + '_sq'] = df[c]**2
        df[c + '_sqrt'] = np.sqrt(df[c])
    return df
        

 
X = fill_na(X, 'shifts_num')

X, ohc_dow = ohc_fit_transform(X, 'dow')

X, le_type = le_fit_transform(X, 'type')
X, ohc_type = ohc_fit_transform(X, 'type')

X, le_creation_mean = le_fit_transform(X, 'creation_mean')
X, ohc_creation_mean = ohc_fit_transform(X, 'creation_mean')

X, le_payment_type = le_fit_transform(X, 'payment_type')
X, ohc_payment_type = ohc_fit_transform(X, 'payment_type')

X = exponentiation(X, ['paid_all','canceled_all','paid_16d','canceled_16d', 'paid_30d','canceled_30d', 'paid_60d','canceled_60d'])

In [6]:
folds = 5
kf = KFold(len(X), folds, shuffle=True, random_state=42)
trees = [10, 50, 100]
depths = [3, 4]
rates = [0.2]


print()
print('Gradient boosting fitting:')

for t in trees:
    for d in depths:
        for r in rates:
            start_time = datetime.datetime.now()
            gb_clf = GradientBoostingClassifier(learning_rate=r, n_estimators=t, verbose=False, random_state=241, max_depth = d)
            scores = cross_val_score(gb_clf, X=X, y=y, scoring='roc_auc', cv=kf)
            time_to_fit = datetime.datetime.now() - start_time
            print('Trees {0}, Depth {1}, Learning Rate {2}, Time to fit: {3}, ROC-AUC: {4}'.format(t, d, r, time_to_fit, scores.mean()))


Gradient boosting fitting:
Trees 10, Depth 3, Learning Rate 0.2, Time to fit: 0:01:39.963544, ROC-AUC: 0.860007446230256
Trees 10, Depth 4, Learning Rate 0.2, Time to fit: 0:02:22.858603, ROC-AUC: 0.8800338839699634
Trees 50, Depth 3, Learning Rate 0.2, Time to fit: 0:06:03.632265, ROC-AUC: 0.9032229368724007
Trees 50, Depth 4, Learning Rate 0.2, Time to fit: 0:10:04.550437, ROC-AUC: 0.9093175906363371
Trees 100, Depth 3, Learning Rate 0.2, Time to fit: 0:12:39.006230, ROC-AUC: 0.9085114273831738
Trees 100, Depth 4, Learning Rate 0.2, Time to fit: 0:18:38.343109, ROC-AUC: 0.912996632246162


# Model Training

In [7]:
final_gb_clf = GradientBoostingClassifier(learning_rate=0.2, n_estimators=100, verbose=False, random_state=241, max_depth=4)
final_gb_clf.fit(X, y)


coeff = final_gb_clf.feature_importances_
names = X.columns.values
df = pd.DataFrame({'features':names, 'importance':coeff})
print(df.sort('importance'))

                features  importance
17            shower_flg    0.000000
53                 type2    0.000000
58        creation_mean4    0.000000
19               cat_flg    0.000000
61        creation_mean7    0.000000
21          wardrobe_flg    0.000000
47                  dow3    0.000000
77     canceled_30d_sqrt    0.000000
11  kitchen_cabinets_flg    0.000130
52                 type1    0.000172
46                  dow2    0.000352
25          evening_slot    0.000360
49                  dow5    0.000572
38          canceled_30d    0.000576
72       canceled_16d_sq    0.000576
20            lustre_flg    0.000717
13         balconies_flg    0.000798
81     canceled_60d_sqrt    0.000829
10           ironing_flg    0.000829
48                  dow4    0.000837
45                  dow1    0.000860
18          bedlinen_flg    0.001089
62        creation_mean8    0.001102
16       consumables_flg    0.001163
35          canceled_16d    0.001164
44                  dow0    0.001390
5



# Тест на отложенной выборке

In [8]:
features = pd.read_csv('test/test-new-16-19-may.csv', header = 0)

X_test = features.drop(['order_id','cancel'], axis=1)


X_test = fill_na(X_test, 'shifts_num')

X_test = ohc_transform(X_test, 'dow', ohc_dow)

X_test = le_transform(X_test, 'type', le_type)
X_test = ohc_transform(X_test, 'type', ohc_type)

X_test = le_transform(X_test, 'creation_mean', le_creation_mean)
X_test = ohc_transform(X_test, 'creation_mean', ohc_creation_mean)

X_test = le_transform(X_test, 'payment_type', le_payment_type)
X_test = ohc_transform(X_test, 'payment_type', ohc_payment_type)

X_test = exponentiation(
    X_test, ['paid_all','canceled_all','paid_16d','canceled_16d', 'paid_30d','canceled_30d', 'paid_60d','canceled_60d'])


y_true = features['cancel']
y_pred = final_gb_clf.predict_proba(X_test)[:,1]


print(roc_auc_score(y_true, y_pred))

0.853840089084


### Bins

In [34]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = y_true
df['y_pred'] = y_pred
df['bin'] = categories

df.to_csv('bins_new_model.csv')

# Prediction

In [15]:
X_test = read_csv('test/test-2017.csv', index_col=[0,1,2,3])

df = DataFrame(index=X_test.index, columns=['will_cancel'])
df['will_cancel'] = final_gb_clf.predict_proba(X_test)[:,1]
# df.to_csv('predictions/prediction-2017-04-07-2.csv')
print('Output in YYYY-MM-DD.csv')

Output in YYYY-MM-DD.csv


In [16]:
import pandas as pd
df_work = df[df['will_cancel']>=0.5]
bins = [0.5, 0.65, 0.80, 1.0]
group_names = ['green', 'yellow', 'red']

categories = pd.cut(df_work['will_cancel'], bins, labels=group_names)
df_work['categories'] = pd.cut(df_work['will_cancel'], bins, labels=group_names)

df_work.to_csv('predictions/prediction-2017-05-02.csv')
print(len(df_work))

230


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
