In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [2]:
raw_train = pd.read_csv('ml100marathon-02-01/train_offline.csv')
raw_test = pd.read_csv('ml100marathon-02-01/test_offline.csv')
raw_test = raw_test[~raw_test.Coupon_id.isna()]
raw_test.reset_index(drop=True, inplace=True)
print(raw_train.shape)
print(raw_test.shape)
raw_train.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

raw_train["label"] = raw_train.apply(label, axis=1)
raw_train["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

raw_train['weekday'] = raw_train['Date_received'].apply(getWeekday)
raw_test['weekday'] = raw_test['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
raw_train['weekday_type'] = raw_train['weekday'].apply(lambda x : 1 if x >= 6 else 0 ) # apply to trainset
raw_test['weekday_type'] = raw_test['weekday'].apply(lambda x : 1 if x >= 6 else 0 ) # apply to testset

In [5]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(raw_train['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
raw_train[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(raw_test['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
raw_test[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [6]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    df['distance_class'] = df['Distance'].apply(lambda x : 1 if x <= 3 else 0 )
    return df

raw_train = processData(raw_train)
raw_test = processData(raw_test)

In [7]:
# total coupon_counts

# if there is a coupon id, count one
def coupon_counts(row):
    if (np.isnan(row)):
        return 0
    else:
        return 1
    
raw_train['coupon_counts'] = raw_train['Coupon_id'].apply(coupon_counts)
raw_test['coupon_counts'] = raw_test['Coupon_id'].apply(coupon_counts)

# caculate each customer's total coupon received
cc = raw_train[['User_id','coupon_counts']].groupby('User_id',as_index=False).sum()
cc.columns = ['User_id', 'total_coupon_received']
raw_train = raw_train.merge(cc, on = 'User_id')
# print(raw_train.head())

cc = raw_test[['User_id','coupon_counts']].groupby('User_id',as_index=False).sum()
cc.columns = ['User_id', 'total_coupon_received']
raw_test = raw_test.merge(cc, on = 'User_id')
# print(raw_test.head())

In [8]:
raw_train.corr()['label']

User_id                 -0.000863
Merchant_id             -0.006524
Coupon_id                0.014643
Distance                 0.222471
Date_received            0.090294
Date                     0.011166
label                    1.000000
weekday                 -0.016272
weekday_type             0.371477
weekday_1                0.234004
weekday_2                0.190574
weekday_3                0.193491
weekday_4                0.209967
weekday_5                0.244123
weekday_6                0.225822
weekday_7                0.264702
discount_rate           -0.078597
discount_man             0.506418
discount_jian            0.526072
discount_type            0.893634
distance_class          -0.301633
coupon_counts            0.945769
total_coupon_received    0.154368
Name: label, dtype: float64

In [9]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = raw_train[raw_train['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [10]:
original_feature = ['discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday_type',
                    'total_coupon_received',
                    'distance_class'] + weekdaycols
print(len(original_feature),original_feature)

14 ['discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday_type', 'total_coupon_received', 'distance_class', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [11]:
predictors = original_feature
print(predictors)

['discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday_type', 'total_coupon_received', 'distance_class', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [28]:
train[predictors].head()

Unnamed: 0,discount_type,discount_man,discount_jian,Distance,weekday_type,total_coupon_received,distance_class,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1,20,1,0.0,0,2,1,0,0,1,0,0,0,0
1,1,20,1,0.0,1,2,1,0,0,0,0,0,1,0
2,1,200,20,1.0,0,1,1,0,0,0,0,1,0,0
3,1,10,5,2.0,0,1,1,0,0,0,0,1,0,0
4,1,100,10,99.0,1,1,0,0,0,0,0,0,0,1


In [31]:
## try logistic regression

mm = MinMaxScaler()
mm_target = ['discount_man','discount_jian','Distance']

train_mm = train[predictors].copy()
valid_mm = valid[predictors].copy()

train_mm[mm_target] = mm.fit_transform(train_mm[mm_target])
valid_mm[mm_target] = mm.fit_transform(valid_mm[mm_target])

lr = LogisticRegression()

lr.fit(train_mm,train['label'])

y_valid_pred = lr.predict(valid_mm)

from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred)
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.507, Accuracy: 0.952


In [32]:
## try Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier()

GBC.fit(train_mm,train['label'])

y_valid_pred_GBC = GBC.predict(valid_mm)

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred_GBC)
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred_GBC)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.515, Accuracy: 0.952


In [14]:
## try SGDClassifier

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

SGD = check_model(train, predictors)

y_valid_pred_SGD = SGD.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred_SGD[:, 1]

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred_SGD[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred_SGD.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.1min finished


Validation AUC: 0.789, Accuracy: 0.952


In [15]:
targetset = raw_test.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = SGD.predict_proba(raw_test[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("try.csv", header=["uid", "label"], index=False) # submission format
out.head()

(306313, 22)
(306313, 15)
(306313, 4)


Unnamed: 0,uid,label
0,1000020_2705_20160519,0.11957
1,1000020_8192_20160513,0.102173
2,1000065_1455_20160527,0.091905
3,1000085_8067_20160513,0.086528
4,1000086_2418_20160613,0.076079


In [16]:
## try Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)

rf.fit(train[predictors],train['label'])

y_valid_pred_rf = rf.predict(valid[predictors])

auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred_rf)
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred_rf)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.513, Accuracy: 0.953


In [22]:
##### try: blending

### logistic Regression
y_test_pred_lr = lr.predict(raw_test[predictors])
test2 = testset.copy()
test2['pred_prob'] = y_test_pred_lr
print(test2.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test2["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

out_lr = output.groupby("uid", as_index=False).mean()
out_lr["pred_prob"].describe()

### Gradient Boosting
y_test_pred_GBC = GBC.predict(raw_test[predictors])
test3 = testset.copy()
test3['pred_prob'] = y_test_pred_GBC
print(test3.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test3["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

out_GBC = output.groupby("uid", as_index=False).mean()
out_GBC["pred_prob"].describe()

### RandomForestClassifier
y_test_pred_rf = rf.predict(raw_test[predictors])
test4 = testset.copy()
test4['pred_prob'] = y_test_pred_rf
print(test4.shape)

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test4["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

out_rf = output.groupby("uid", as_index=False).mean()
out_rf["pred_prob"].describe()

(306313, 15)
(306313, 4)
(306313, 15)
(306313, 4)
(306313, 15)
(306313, 4)


count    304096.000000
mean          0.001874
std           0.043254
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: pred_prob, dtype: float64

In [26]:
blending = out.copy()
blending['label'] = blending['label']*0.7 + out_rf["pred_prob"]*0.1 + out_lr["pred_prob"]*0.1 + out_GBC["pred_prob"]*0.1
blending.to_csv("blending.csv", header=["uid", "label"], index=False)