In [1]:
import os
import pandas as pd
import numpy as np
import time
from random import choice
import gc
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from contextlib import contextmanager
from sklearn.model_selection import StratifiedShuffleSplit
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import f1_score # one more metric to evaluate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Preparation

In [2]:
# Show which files are available in the weekappv3 input directory.
os.listdir('../input/weekappv3')

['df7_p_feav3.csv', 'df14_p_feav3.csv', 'df7_feav3.csv', 'df14_feav3.csv']

In [3]:
# Sample submission: provides the user ids that the final output is built around.
sub = pd.read_csv('../input/predict-in-app-purchase/sample_submission_2.csv')
submission = sub.loc[:, ['user_id_hash']]

# Feature tables the models are fitted on (7-day and 14-day targets).
df7 = pd.read_csv('../input/weekappv3/df7_feav3.csv')
df14 = pd.read_csv('../input/weekappv3/df14_feav3.csv')

# Feature tables the fitted models are later asked to score.
df7_p = pd.read_csv('../input/weekappv3/df7_p_feav3.csv')
df14_p = pd.read_csv('../input/weekappv3/df14_p_feav3.csv')

In [4]:
# Replace every missing value with the sentinel -1 in all four feature tables.
df7, df14, df7_p, df14_p = (
    frame.fillna(-1) for frame in (df7, df14, df7_p, df14_p)
)

In [5]:
def fea_importance(tree,fea):
    """Tabulate a fitted model's feature importances, most important first.

    Parameters
    ----------
    tree : object exposing ``feature_importances_``
        The fitted estimator whose importances are read.
    fea : sequence
        Feature names, one per entry of ``tree.feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance``
        in descending order.
    """
    ranking = pd.DataFrame(
        {"feature": fea, "importance": tree.feature_importances_}
    )
    return ranking.sort_values(by='importance', ascending=False)
    
def threshold_search(y_true, y_proba):
    """Return the best F1 score reachable by thresholding ``y_proba``.

    Cut-offs 0.00, 0.01, ..., 0.99 are tried in turn; a sample is predicted
    positive when its probability is strictly greater than the cut-off.
    Only the best score is returned, not the cut-off that produced it.
    If no cut-off scores above zero, 0 is returned.
    """
    top_f1 = 0
    for step in range(100):
        cutoff = step * 0.01
        candidate = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if candidate > top_f1:
            top_f1 = candidate
    return top_f1

def _take_rows(data, positions):
    """Return the rows of ``data`` at the given integer positions.

    pandas objects are indexed with ``.iloc``; anything else is indexed directly.
    """
    return data.iloc[positions] if hasattr(data, "iloc") else data[positions]


def Model(model,trainX,trainy,nsplits=5):
    """Evaluate ``model`` over stratified shuffle splits and print mean AUC / F1.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``
        Refitted from scratch on the training part of every split.
    trainX : pandas.DataFrame (or array-like)
        Feature matrix.
    trainy : pandas.Series (or array-like)
        Binary target aligned row-for-row with ``trainX``.
    nsplits : int, default 5
        Number of shuffle splits. The previous version overwrote this
        argument with a hard-coded 5; it is now honoured.

    Returns
    -------
    The same ``model`` object. It is left fitted on the training part of
    the LAST split only; it is not refitted on the full data.
    """
    splitter = StratifiedShuffleSplit(n_splits=nsplits,random_state=420)

    started = time.time()
    all_auc = []
    f_score = []
    for train, test in splitter.split(trainX, trainy):
        # The splitter yields integer positions, so rows are selected by
        # position; label-based ``.loc`` is only correct on a default index.
        X_train, X_test = _take_rows(trainX, train), _take_rows(trainX, test)
        y_train, y_test = _take_rows(trainy, train), _take_rows(trainy, test)
        model.fit(X_train, y_train)
        # Probability of the positive class for the held-out rows.
        score = model.predict_proba(X_test)[:, 1]
        all_auc.append(roc_auc_score(y_test, score))
        f_score.append(threshold_search(y_test, score))
    print("AUC and F1",(np.mean(all_auc), np.mean(f_score)))
    print(f"Time use:{time.time()-started:.3f}s")
    return model

# 7-day Part

Two small parts:
- use all features
- use top20 features

In [6]:
# Columns kept out of the 7-day training matrix.
drop_col7 = [
    # feature columns that are excluded
    'con_act_day_count_max_89',
    'time_gap_min_9',
    'time_gap_min_89',
    'is96',
    'con_act_max_89',
    'con_act_day_count_min_9',
    'con_act_day_count_total_9',
    'next_pch_time_min_rec_mon',
    'con_act_day_count_max_9',
    'con_act_day_count_min_09',
    'con_act_day_count_max_09',
    'con_act_day_count_mean_9',
    'con_act_day_count_total_09',
    'con_act_max_9',
    'time_gap_max_9',
    'con_act_day_count_total_89',
    'con_act_day_count_std_89',
    'con_act_day_count_mean_89',
    'is67',
    'con_act_day_count_std_9',
    'con_act_day_count_std_09',
] + [
    # target columns and user identifiers
    'user_purchase_binary_14_days',
    'user_purchase_binary_7_days',
    'user_id_le',
    'user_id_hash',
]

In [7]:
# Hyper-parameters below were tuned beforehand.
# LightGBM settings for the 7-day target.
param1 = {'n_estimators': 100, 'learning_rate': 0.1,'reg_lambda': 20,'is_unbalance': True, 'boosting_type': 'gbdt', 'max_depth': 5, 'feature_fraction': 0.7, 'objective': 'binary', 'metric': 'auc'}
# Random-forest settings. 'max_features' is 'sqrt' instead of the former 'auto':
# for classifiers the two select the same number of features, but 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
param1_r = {'n_estimators': 200, 'class_weight': 'balanced', 'min_samples_split': 20, 'n_jobs': -1, 'min_samples_leaf': 4, 'criterion': 'entropy', 'max_features': 'sqrt', 'max_depth': 14}

# Every column of df7 that is not in the drop list becomes a model feature.
feats = [c for c in df7.columns if c not in drop_col7]
X_7 = df7[feats]

# Strip '>' / '<' from two column names (done for XGBoost, per the original note).
X_7 = X_7.rename(columns={'7buy_mean_tr>.8_tr':"7buy_mean_tr8_tr",'7length<3_tr':"7length3_tr"})
y_7 = df7['user_purchase_binary_7_days']

LGB1 = lgb.LGBMClassifier(**param1)
RF1 = RandomForestClassifier(**param1_r)
LR1 = LogisticRegression(class_weight='balanced')
XGB1 = XGBClassifier(eval_metric='auc')

# Only XGBoost and the random forest are evaluated and kept for the 7-day target.
xgb1 = Model(XGB1,X_7,y_7,nsplits=5)
rf1 = Model(RF1,X_7,y_7,nsplits=5)
# lr1 = Model(LR1,X_7,y_7,nsplits=5) # really slow
# lgb1 = Model(LGB1,X_7,y_7,nsplits=5) # perform bad


AUC and F1 (0.9906990645273226, 0.5592126362889578)
Time use:839.170s
AUC and F1 (0.9897716516882262, 0.5372940751949701)
Time use:662.639s


In [8]:
# np.array(fea_importance(lgb1,feats)['feature'][:20])
# print('Plotting feature importances...')
# ax = lgb.plot_importance(lgb1, max_num_features=40,figsize=(15,10))
# plt.show()
# np.array(list(set(fea_importance(rf1,feats)[-30:]['feature']).intersection(set(fea_importance(lgb1,feats)[-30:]['feature']))))

# 14-day Part

In [9]:
# Columns kept out of the 14-day training matrix.
drop_col14 = [
    # feature columns that are excluded
    'con_pch_day_count_std_rec_mon',
    'con_act_day_count_min_78',
    'con_act_day_count_total_78',
    'con_pch_day_count_total_rec_mon',
    'time_gap_max_8',
    'time_gap_min_8',
    'is96',
    'next_pch_time_min_rec_mon',
    'con_act_day_count_mean_8',
    'con_act_day_count_max_8',
    'con_act_max_8',
    'is67',
    'con_act_day_count_mean_78',
    'con_act_day_count_std_8',
    'con_act_day_count_min_08',
    'user_eve_day_count_78',
    'con_act_day_count_min_8',
    'time_gap_std_8',
    'con_act_day_count_std_78',
    'time_gap_mean_8',
] + [
    # target columns and user identifiers
    'user_purchase_binary_14_days',
    'user_purchase_binary_7_days',
    'user_id_le',
    'user_id_hash',
]

In [10]:
# Earlier gbdt configuration, kept for reference:
# param2 = {'n_estimators': 100, 'learning_rate': 0.1,'reg_lambda': 20,'is_unbalance': True, 'boosting_type': 'gbdt', 'max_depth': 7, 'feature_fraction': 0.7, 'objective': 'binary', 'metric': 'auc'}

# Hyper-parameters below were tuned beforehand.
# LightGBM (dart) settings for the 14-day target.
# NOTE(review): 'bagging_fraction' is set without 'bagging_freq'; confirm against the
# LightGBM docs whether bagging is actually active with this configuration.
param2 = {'n_estimators': 150, 'bagging_fraction': 0.9, 'learning_rate': 0.1, 'is_unbalance': True, 'max_bin': 25, 'boosting_type': 'dart', 'max_depth': 5, 'feature_fraction': 0.8, 'lambda_l1': 40, 'objective': 'binary', 'metric': 'auc'}
# Random-forest settings. 'max_features' is 'sqrt' instead of the former 'auto':
# for classifiers the two select the same number of features, but 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
param2_r = {'n_estimators': 140, 'max_features': 'sqrt', 'n_jobs': -1, 'max_depth': 15, 'min_samples_leaf': 4, 'class_weight': 'balanced', 'min_samples_split': 8}

# Every column of df14 that is not in the drop list becomes a model feature.
feats2 = [c for c in df14.columns if c not in drop_col14]
X_14 = df14[feats2]
# Strip '>' / '<' from two column names, mirroring the 7-day preparation.
X_14 = X_14.rename(columns={'14buy_mean_tr>.8_tr':"14buy_mean_tr8_tr",'14length<3_tr':"14length3_tr"})
y_14 = df14['user_purchase_binary_14_days']

LGB2 = lgb.LGBMClassifier(**param2)
RF2 = RandomForestClassifier(**param2_r)
LR2 = LogisticRegression(class_weight='balanced')
XGB2 = XGBClassifier(eval_metric='auc')

# XGBoost, LightGBM and the random forest are evaluated and kept for the 14-day target.
xgb2 = Model(XGB2,X_14,y_14,nsplits=5)
lgb2 = Model(LGB2,X_14,y_14,nsplits=5)
rf2 = Model(RF2,X_14,y_14,nsplits=5)
# lr2 = Model(LR2,X_14,y_14,nsplits=5)
# fea_importance(rf2,rf_f20_2)

AUC and F1 (0.9930259154268327, 0.6479112807352301)
Time use:839.585s
AUC and F1 (0.9936688345454693, 0.6525635744578292)
Time use:321.664s
AUC and F1 (0.9916809216265332, 0.5991233159946361)
Time use:537.967s


In [11]:
## use this to get top20
# fea_importance(lgb2,feats2)

In [12]:
# print('Plotting feature importances...')
# ax = lgb.plot_importance(lgb2, max_num_features=40,figsize=(15,10))
# plt.show()

# Prediction
- 7-day part
- 14-day part

# 7-day

In [13]:
# Columns kept out of the 7-day prediction matrix.
drop_col7_p = [
    # feature columns that are excluded
    'con_act_day_count_max_910',
    'time_gap_min_10',
    'time_gap_min_910',
    'is96',
    'con_act_max_910',
    'con_act_day_count_min_10',
    'con_act_day_count_total_10',
    'next_pch_time_min_rec_mon',
    'con_act_day_count_max_10',
    'con_act_day_count_min_010',
    'con_act_day_count_max_010',
    'con_act_day_count_mean_10',
    'con_act_day_count_total_010',
    'con_act_max_10',
    'time_gap_max_10',
    'con_act_day_count_total_910',
    'con_act_day_count_std_910',
    'con_act_day_count_mean_910',
    'is67',
    'con_act_day_count_std_10',
    'con_act_day_count_std_010',
] + [
    # target columns and user identifiers
    'user_purchase_binary_14_days',
    'user_purchase_binary_7_days',
    'user_id_le',
    'user_id_hash',
]

In [14]:
# Build the 7-day prediction matrix: every df7_p column not in the drop list.
X_7_p = df7_p[[c for c in df7_p.columns if c not in drop_col7_p]]
# Fix: rename X_7_p itself. The previous code renamed X_7 (the training matrix),
# which threw away the line above and made the models score their training rows.
X_7_p = X_7_p.rename(columns={'7buy_mean>.8_p':'7buy_mean8_p','7length<3_p':'7length3_p'})
# The fitted models expect as many features as they were trained on.
assert X_7_p.shape[1] == X_7.shape[1], (
    f"7-day prediction matrix has {X_7_p.shape[1]} columns, training matrix has {X_7.shape[1]}"
)
# NOTE(review): prediction column names (e.g. '7buy_mean8_p') differ from the training
# names (e.g. '7buy_mean_tr8_tr'). If an estimator rejects the mismatch at predict time,
# align the names with the training columns only after confirming the column order matches.

In [15]:
# Score the 7-day prediction matrix with XGBoost and the random forest
# (the logistic-regression and LightGBM models are not used for this target).
pred_xgb7 = xgb1.predict_proba(X_7_p)[:, 1]
pred_rf7 = rf1.predict_proba(X_7_p)[:, 1]
# pred_lr7 = lr1.predict_proba(X_7_p)[:, 1]

# Equal-weight blend of the two positive-class probabilities.
# pred =  1/3*pred_rf7+1/3*pred_xgb7+1/3*pred_lr7
pred = 0.5 * pred_xgb7 + 0.5 * pred_rf7

# Store the blend on df7_p and keep just the id and the prediction.
df7_p['user_purchase_binary_7_days'] = pred
pred_df = df7_p.loc[:, ['user_id_hash', 'user_purchase_binary_7_days']]

# 14-day

In [17]:
# Columns kept out of the 14-day prediction matrix.
drop_col14_p = [
    # feature columns that are excluded
    'con_pch_day_count_std_rec_mon',
    'con_act_day_count_min_910',
    'con_act_day_count_total_910',
    'con_pch_day_count_total_rec_mon',
    'time_gap_max_10',
    'time_gap_min_10',
    'is96',
    'next_pch_time_min_rec_mon',
    'con_act_day_count_mean_10',
    'con_act_day_count_max_10',
    'con_act_max_10',
    'is67',
    'con_act_day_count_mean_910',
    'con_act_day_count_std_10',
    'con_act_day_count_min_010',
    'user_eve_day_count_910',
    'con_act_day_count_min_10',
    'time_gap_std_10',
    'con_act_day_count_std_910',
    'time_gap_mean_10',
] + [
    # target columns and user identifiers
    'user_purchase_binary_14_days',
    'user_purchase_binary_7_days',
    'user_id_le',
    'user_id_hash',
]

In [18]:
# Build the 14-day prediction matrix: every df14_p column not in the drop list.
X_14_p = df14_p[[c for c in df14_p.columns if c not in drop_col14_p]]
# Fix: rename X_14_p itself. The previous code renamed X_14 (the training matrix),
# which threw away the line above and made the models score their training rows.
X_14_p = X_14_p.rename(columns={'14buy_mean>.8_p':'14buy_mean8_p','14length<3_p':'14length3_p'})
# The fitted models expect as many features as they were trained on.
assert X_14_p.shape[1] == X_14.shape[1], (
    f"14-day prediction matrix has {X_14_p.shape[1]} columns, training matrix has {X_14.shape[1]}"
)
# NOTE(review): prediction column names (e.g. '14buy_mean8_p') differ from the training
# names (e.g. '14buy_mean_tr8_tr'). If an estimator rejects the mismatch at predict time,
# align the names with the training columns only after confirming the column order matches.

In [19]:
# Positive-class probabilities from the three 14-day models.
pred_xgb14 = xgb2.predict_proba(X_14_p)[:, 1]
pred_rf14 = rf2.predict_proba(X_14_p)[:, 1]
pred_lg14 = lgb2.predict_proba(X_14_p)[:, 1]

# Equal-weight blend of LightGBM, random forest and XGBoost.
pred2 = (1/3)*pred_lg14 + (1/3)*pred_rf14 + (1/3)*pred_xgb14

# Store the blend on df14_p and keep just the id and the prediction.
df14_p['user_purchase_binary_14_days'] = pred2
pred_df2 = df14_p.loc[:, ['user_id_hash', 'user_purchase_binary_14_days']]

In [21]:
# Attach the 7-day and then the 14-day predictions to the submission ids.
# Left joins keep every id from the sample submission, matched or not.
submission = (
    submission
    .merge(pred_df, on=['user_id_hash'], how='left')
    .merge(pred_df2, on=['user_id_hash'], how='left')
)

In [22]:
def impute_mean(df):
    """Fill missing values with 0 in every column except ``user_id_hash``.

    Despite the name, no mean is computed: gaps are replaced by the constant 0.
    The frame is modified in place and the same object is returned.
    """
    for column in df.columns:
        if column in ['user_id_hash']:
            # The id column is left untouched.
            continue
        df[column] = df[column].fillna(0)
    return df

# Ids with no matching prediction come out of the left merges as NaN;
# set those to 0, then write the submission file without the index column.
submission = impute_mean(submission)
submission.to_csv('submission.csv',index=False)