In [334]:
import numpy as np
import pandas as pd

from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [263]:
dfoff = pd.read_csv("./midterm_data/train_offline.csv")

#Read test data
dftest = pd.read_csv("./midterm_data/test_offline.csv")
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(df.shape)
print(dftest.shape)



(746969, 14)
(306313, 6)


In [264]:
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [265]:
dftest.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0
5,2747744,6901,1097.0,50:10,,20160606.0
6,196342,1579,10698.0,20:1,1.0,20160606.0
7,253750,6901,2366.0,30:5,0.0,20160518.0
8,343660,4663,11002.0,150:20,,20160528.0
9,1113008,3621,2705.0,20:5,0.0,20160524.0


In [266]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""

def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = df.apply(label, axis=1)
dfoff["label"].value_counts()

0.0    710665
1.0     36304
Name: label, dtype: int64

In [269]:
dfoff.loc[np.isnan(dfoff["label"]), 'label'] = -1

In [33]:
#處理日期
def time_processing(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d") 

In [270]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset



In [271]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf



['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [65]:
dfoff['Date_received'] = pd.to_datetime(df['Date_received'], format = "%Y%m%d") 
dfoff['Date'] = pd.to_datetime(df['Date'], format = "%Y%m%d") 

In [454]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['price_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_rate'] = 1 - df['price_rate']
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    df['received_coupons']= df.groupby(['User_id', 'Merchant_id']).cumcount()+1
    
    df['Merchant_id_Hash'] = df['Merchant_id'].map(lambda x:hash(x) % 10)
    
    df['Distance_double'] = df['Distance'] **2
    df['discount_man_log'] =  np.log(df['discount_man'])
    df['Distance_log'] = np.log(df['Distance']) 

    
    # convert distance
    #df.loc[df.Distance.isna(), "Distance"] = 99
    df['Distance'].fillna(df['Distance'].mean())
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)



In [388]:
dfoff.loc[dfoff.Distance==99, "Distance"] = np.nan

In [443]:
#收到的第幾張coupon
#dfoff['received_coupons']= dfoff.groupby(['User_id', 'Merchant_id']).cumcount()+1

#總共送過幾張？

#mechant_id做hash


dfoff['Distance_double'] = dfoff['Distance'] **2
dfoff['Distance_log'] = np.log(dfoff['Distance']) 
dfoff['discount_man_log'] =  np.log(dfoff['discount_man'])

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [398]:
dfoff.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'price_rate', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'received_coupons',
       'Merchant_id_Hash', 'Distance_double', 'discount_man_log'],
      dtype='object')

In [301]:
df.shape

(746969, 14)

In [441]:
dfoff.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_7,price_rate,discount_rate,discount_man,discount_jian,discount_type,received_coupons,Merchant_id_Hash,Distance_double,discount_man_log
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0.0,3.0,0,...,0,0.95,0.05,20,1,1,1,2,0.0,2.995732
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0.0,6.0,0,...,0,0.95,0.05,20,1,1,2,2,0.0,2.995732
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0.0,5.0,0,...,0,0.9,0.1,200,20,1,1,1,0.0,5.298317
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0.0,5.0,0,...,0,0.9,0.1,200,20,1,1,1,1.0,5.298317
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0.0,5.0,0,...,0,0.5,0.5,10,5,1,1,1,4.0,2.302585
6,73611,2099,12034.0,100:10,3.73904,20160207.0,,0.0,7.0,0,...,1,0.9,0.1,100,10,1,1,9,13.980423,4.60517
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0.0,4.0,0,...,0,0.85,0.15,200,30,1,1,9,100.0,5.298317
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0.0,6.0,0,...,0,0.9,0.1,200,20,1,1,3,100.0,5.298317
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0.0,2.0,0,...,0,0.9,0.1,200,20,1,1,1,4.0,5.298317
11,253750,8390,7531.0,20:5,0.0,20160327.0,,0.0,7.0,0,...,1,0.75,0.25,20,5,1,1,0,0.0,2.995732


In [404]:
#切資料
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
dfoff = dfoff[dfoff['label'] != -1].copy()
#df["is_train"] = df["Date_received"].apply(split_train_valid)
train_size = 667753
#train = df[df["is_train"]]
#valid = df[~df["is_train"]]
train = dfoff[:667753]
valid = dfoff[667753:]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32304.0
Valid size: 79216, #positive: 4000.0


In [405]:
train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_7,price_rate,discount_rate,discount_man,discount_jian,discount_type,received_coupons,Merchant_id_Hash,Distance_double,discount_man_log
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0.0,3.0,0,...,0,0.95,0.05,20,1,1,1,2,0.0,2.995732
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0.0,6.0,0,...,0,0.95,0.05,20,1,1,2,2,0.0,2.995732
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0.0,5.0,0,...,0,0.9,0.1,200,20,1,1,1,0.0,5.298317
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0.0,5.0,0,...,0,0.9,0.1,200,20,1,1,1,1.0,5.298317
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0.0,5.0,0,...,0,0.5,0.5,10,5,1,1,1,4.0,2.302585


In [312]:
train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'price_rate', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'received_coupons',
       'Merchant_id_Hash'],
      dtype='object')

In [462]:
#要使用哪些features
original_features = ['Merchant_id_Hash',
                     'discount_rate', 
                     'price_rate',
                     'discount_type', 
                     'discount_man', 
                     'discount_jian', 
                     'Distance', 
                     #'Distance_double', 
                     'Distance_log', 
                     'discount_man_log',
                     'weekday_type',
                     'received_coupons']  + weekdaycols

In [463]:
predictors = original_features

In [464]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
x_train, x_test, y_train, y_test = train_test_split(dfoff[predictors], dfoff['label'], test_size=0.1, random_state=42)



In [437]:
gbmodel = GradientBoostingClassifier(n_estimators=700, max_depth=7,
                               random_state=4)

In [439]:
gbmodel.fit(x_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
y_test_pred = gbmodel.predict_proba(x_test)

In [None]:
auc_score = roc_auc_score(y_true=y_test, y_score=y_test_pred[:,1])
acc = accuracy_score(y_true=y_test, y_pred=y_test_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))


In [None]:
gbmodel.save_model("gb.model")

In [359]:
import xgboost as xgb

In [366]:
# 設定要訓練的超參數組合
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(xgbmodel, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型



Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 18.9min finished


In [367]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.888699 using {'max_depth': 5, 'n_estimators': 300}


In [459]:
xgbmodel = xgb.XGBClassifier(n_estimators = 300, max_depth=6)
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2]
n_estimators = [100, 200, 300]
param_grid = []

In [460]:
xgbmodel.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=None, n_estimators=300,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [461]:
y_test_pred = xgbmodel.predict_proba(x_test)

auc_score = roc_auc_score(y_true=y_test, y_score=y_test_pred[:,1])
acc = accuracy_score(y_true=y_test, y_pred=y_test_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))


Validation AUC: 0.873, Accuracy: 0.954


In [433]:
x_train.head()

Unnamed: 0,Merchant_id_Hash,discount_rate,price_rate,discount_type,discount_man,discount_jian,Distance,Distance_double,discount_man_log,weekday_type,received_coupons,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
903261,1,0.166667,0.833333,1,30,5,0.0,0.0,3.401197,0,2,0,0,0,0,0,0,1
730280,0,0.066667,0.933333,1,150,10,1.0,1.0,5.010635,0,1,0,0,0,1,0,0,0
815017,7,0.1,0.9,1,100,10,1.0,1.0,4.60517,0,1,0,0,0,0,1,0,0
391429,0,0.1,0.9,1,100,10,10.0,100.0,4.60517,0,1,1,0,0,0,0,0,0
1029119,9,0.3,0.7,1,100,30,8.0,64.0,4.60517,0,1,0,0,1,0,0,0,0


In [337]:
# 設定要訓練的超參數組合
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 24.8min finished


In [338]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.886936 using {'max_depth': 5, 'n_estimators': 300}


In [316]:
model.fit(train[predictors], train['label'])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=4, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [296]:
y_valid_pred = model.predict_proba(valid[predictors])

In [297]:
#AUC CALCULATION

from sklearn.metrics import roc_auc_score, accuracy_score

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))


Validation AUC: 0.870, Accuracy: 0.950


In [465]:
#輸出結果
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = xgbmodel.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 25)
(306313, 19)


In [466]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [467]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("./midterm_answer6_XGB.csv", header=["uid", "label"], index=False) # submission format
out.head()



Unnamed: 0,uid,label
0,1000020_2705_20160519,0.141017
1,1000020_8192_20160513,0.079318
2,1000065_1455_20160527,0.109224
3,1000085_8067_20160513,0.076233
4,1000086_2418_20160613,0.059725
