In [75]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingClassifier

DATA_ROOT = "./data/"
df_train = pd.read_csv(DATA_ROOT + 'train_offline.csv')
dftest = pd.read_csv(DATA_ROOT + 'test_offline.csv')

# dftest = dftest[~dftest.Coupon_id.isna()]
# dftest.reset_index(drop=True, inplace=True)
print(df_train.shape)
print(dftest.shape)
df_train.head(20)

(1160742, 7)
(594142, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [76]:
def target_label(row):
    if np.isnan(row['Coupon_id']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
    
df_train["label"] = df_train.apply(target_label, axis=1)
df_train["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [77]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
    


def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

df_train = processData(df_train)
dftest = processData(dftest)

In [78]:
def countWent(df):
    WentTimes = df[['User_id','Merchant_id']]
    WentTimes['temp'] = 1
    WentTimes = WentTimes.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
    WentTimes.columns = ['User_id','Merchant_id','WentTimes']

    df = pd.merge(df, WentTimes , on = ['User_id','Merchant_id'], how = 'left')
    return df

# df_train.
df_train = countWent(df_train)
dftest = countWent(dftest)    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [79]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,discount_rate,discount_man,discount_jian,discount_type,WentTimes
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0,3
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,0.95,20,1,1,3
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,0.95,20,1,1,3
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,0.9,200,20,1,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,0.9,200,20,1,1


In [80]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = df_train[df_train['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))
valid.head()

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,discount_rate,discount_man,discount_jian,discount_type,WentTimes,is_train
0,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,0.9,200,20,1,1,False
1,163606,1569,5054.0,200:30,10.0,20160421.0,,0,0.85,200,30,1,1,False
2,4061024,3381,7610.0,200:20,10.0,20160426.0,,0,0.9,200,20,1,2,False
3,106443,450,3732.0,30:5,99.0,20160429.0,,0,0.833333,30,5,1,1,False
4,114747,1569,5054.0,200:30,9.0,20160426.0,,0,0.85,200,30,1,2,False


In [85]:
uid = train['User_id']
cid = train['Coupon_id']
predictors = ['discount_rate',
              'discount_type',
              'discount_man', 
              'discount_jian',
              'WentTimes',
              'Distance']
print(len(predictors),predictors)
# df.drop("User_id","Coupon_id","Merchant_id",axis=1)
def check_model(data, predictors):
    
#     classifier = lambda: SGDClassifier(
#         loss='log', 
#         penalty='elasticnet', 
#         fit_intercept=True, 
#         max_iter=100, 
#         shuffle=True, 
#         n_jobs=1,
#         class_weight=None)
    classifier = GradientBoostingClassifier(random_state=7)


    
    parameters = {
        'n_estimators': [25],
        'max_depth': [3],
        'learning_rate': [0.2,0.1]
    }
    # min_samples_leaf : [1 ,5, 15, 30, 60, 100]

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        classifier, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])
    
    return grid_search



6 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'WentTimes', 'Distance']


In [86]:
model = check_model(train, predictors)
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (model.best_score_, model.best_params_))

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.2min finished


Best Accuracy: 0.952491 using {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 25}


In [87]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (model.best_score_, model.best_params_))

Best Accuracy: 0.952491 using {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 25}


In [88]:
#驗證集預測
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [89]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.856, Accuracy: 0.953


In [90]:































targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)


(594142, 11)
(306313, 7)


In [91]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("submit_gbc.csv", header=["uid", "label"], index=False) # submission format
out.head()

(306313, 4)


Unnamed: 0,uid,label
0,1000020_2705_20160519,0.037376
1,1000020_8192_20160513,0.037376
2,1000065_1455_20160527,0.113353
3,1000085_8067_20160513,0.061449
4,1000086_2418_20160613,0.020341


In [10]:
df_train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,0.9,200,20,1
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,0.5,10,5,1
6,73611,2099,12034.0,100:10,99.0,20160207.0,,0,0.9,100,10,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,0.85,200,30,1
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,0.9,200,20,1
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,0.9,200,20,1
