In [None]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [None]:
DATA_ROOT = "../Data/Midterm_Exam/"

dftrain = pd.read_csv(os.path.join(DATA_ROOT,"train_offline.csv"))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)

In [7]:
print(dftrain.shape)
print(dftest.shape)

(1160742, 8)
(306313, 6)


In [4]:
dftrain.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [5]:
#
# Creat target label
#
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1       # 在 n=15 天內有使用優惠券
2) buy with coupon but out of 15 days ==> 0             # 超過 n=15 天沒有使用優惠券
3) buy without coupon ==> -1 (we don't care)            # 都沒有使用優惠券，忽略這些資料
"""

def labeling(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dftrain["label"] = dftrain.apply(labeling, axis=1)
dftrain.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0


In [6]:
dftrain["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [8]:
#
# Generate features - weekday acquired coupon   取得是星期幾拿到優惠券
#

def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1    # add one to make it from 0~6 -> 1~7

dftrain['weekday'] = dftrain['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dftrain['weekday_type'] = dftrain['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to train-set
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 )   # apply to test-set

In [9]:
dftrain.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0


In [10]:
dftest.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekday_type
0,1439408,4663,11002.0,150:20,1.0,20160528.0,6,0
1,1439408,2632,8591.0,20:1,0.0,20160613.0,1,0
2,1439408,2632,8591.0,20:1,0.0,20160516.0,1,0
3,2029232,450,1532.0,30:5,0.0,20160530.0,1,0
4,2029232,6459,12737.0,20:1,0.0,20160519.0,4,0


In [11]:
# 產生星期的欄位名稱
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dftrain['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftrain[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [12]:
#
# Generate features - coupon discount and distance
#

def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

    
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

    
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

    
def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

    
def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df


dftrain = processData(dftrain)
dftest = processData(dftest)

In [13]:
dftrain.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1


In [14]:
#
# Naive model
#

def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    

df = dftrain[dftrain['label'] != -1].copy()      # 排除掉非目標資料
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]                      #驗證資料集，將訓練資料排除

train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [18]:
print('Train:', dftrain.shape)
print('Test: ', dftest.shape)

Train: (1160742, 21)
Test:  (306313, 19)


In [15]:
# 建立特性欄位
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols

print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [16]:
predictors = original_feature
print('predictors columns:', predictors)


predictors columns: ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [53]:
# 碼表物件

import time
import math

class StopWatch:
    def __init__(self):
        self.start()
    def start(self):
        self._startTime = time.time()
        self._stopTime = None
    def stop(self):
        self._stopTime = time.time()
    def getStartTime(self):
        return self._startTime
    def elapsed(self):
        if self._stopTime is None:
            self.stop()
        diff= self._stopTime - self._startTime
        hours, rem = divmod(diff, 3600)
        minutes, seconds = divmod(rem, 60)
        return "{:0>2}:{:0>2}:{:06.3f}".format(int(hours),int(minutes),seconds)
    

In [None]:
def check_model(data, predictors):
    '''
    測試模型，並取得最佳參數組合
    '''
    classifier = lambda: SGDClassifier(
        loss='modified_huber', #'log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        tol=1e-3,               #default 0.001
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.0001, 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1, 0.15, 0.5]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    
    grid_search = grid_search.fit(data[predictors].astype('float'), data['label'].astype('float'))
    
    return grid_search

In [None]:
sw = StopWatch()

sw.start()

model = check_model(train, predictors)

sw.stop()
print('time elapsed:', sw.elapsed())


Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
print(model)

In [56]:
y_valid_pred = model.predict_proba(valid[predictors].astype('float'))
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [57]:
from sklearn.metrics import roc_auc_score, accuracy_score

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))

print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.741, Accuracy: 0.952


In [59]:
targetset = dftest.copy()
print(targetset.shape)

targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors].astype('float'))
test1 = testset.copy()

test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 19)
(306313, 15)


In [60]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [61]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]

out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.090585
1,1000020_8192_20160513,0.086388
2,1000065_1455_20160527,0.068559
3,1000085_8067_20160513,0.070725
4,1000086_2418_20160613,0.051928


In [62]:
# 將結果存為 CSV 檔
out.to_csv("midterm_exam_01.csv", header=["uid", "label"], index=False) # submission format