In [None]:
import os, sys, pickle
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
dfoff = pd.read_csv('../input/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('../input/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('../input/ccf_online_stage1_train.csv')

dfoff = dfoff.fillna('null')  
dftest = dftest.fillna('null')
dfon = dfon.fillna('null')

In [6]:
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0
def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0
def processData(df):
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    print(df['discount_rate'].unique())
    
    # convert distance
    df['distance'] = df['Distance'].replace('null', -1).astype(int)
    print(df['distance'].unique())
    return df


dfoff = processData(dfoff)
dftest = processData(dftest)

[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[ 0  1 -1  2 10  4  7  9  3  5  6  8]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]
[ 1 -1  5  2  0 10  3  6  7  4  9  8]


In [8]:
def getWeekday(row):
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )


In [9]:
# change weekday to one-hot encoding 
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)
tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [10]:
def label(row):
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff['label'] = dfoff.apply(label, axis = 1)

In [None]:
# data split
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print(train['label'].value_counts())
print(valid['label'].value_counts())
# feature
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)







In [13]:
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())

TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
tem = dfoff[dfoff['Date'] != 'null']
tem = tem[tem['Date'] < 20160516]

tem_2 = dfoff[dfoff['Date'] == 'null']
tem_2 = tem_2[tem_2['Date_received'] < 20160516]

feature = pd.concat([tem, tem_2], axis=0)

In [None]:
# model1
predictors = original_feature
print(predictors)
def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)
    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])
    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }
    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search


if not os.path.isfile('1_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [10]:
# valid predict
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(2)


# avgAUC calculation
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1] 
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_rate,discount_man,discount_jian,...,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label,pred_prob
1,1439408,4663,11002,150:20,1,20160500.0,,0.866667,150,20,...,1,0,0,0,0,0,1,0,0,0.019472
4,1439408,2632,8591,20:1,0,20160600.0,,0.95,20,1,...,0,1,0,0,0,0,0,0,0,0.101713


0.5323444694516165


In [12]:
def clf_train(X_train, y_train, X_val, y_val):
    categorical_feature = ['性别', '入院科室']
    clf = lgb.LGBMClassifier(n_estimators=10000,
                                   learning_rate=0.06,
                                   max_depth=5,
                                   num_leaves=30,
                                   objective='binary',
                                   subsample=0.8,
                                   sub_feature=0.8,
#                                    class_weight='balanced',  #设置样本平衡；好像不要会更好
                                   )
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc',
            early_stopping_rounds=100, verbose = 5000,
#             categorical_feature = categorical_feature    #onehot之后就不需要了
            )
    feat_impo = sorted(zip(X_train.columns, clf.feature_importances_), key=lambda x: x[1], reverse=True)
    
    return clf, feat_impo 
def clf_predict(clf, X_val):
    y_pred = clf.predict_proba(X_val, num_iteration=clf.best_iteration)
    
    return y_pred
def clf_evaluate(df_val, y_pred_):
    y_pred_ = y_pred_[:, 1]  #因为sklearn输出每个类别的概率，手动选择1类
    
    df_val_auc = df_val[['Coupon_id', 'label']]
    df_val_auc['pred_prob'] = y_pred_

    # 计算平均AUC
    vg = df_val_auc.groupby(['Coupon_id'])
    aucs = []
    for i in vg:   #这个“i”是分好组子集
        df_tem = i[1]

        if len(df_tem['label'].unique()) != 2:
            continue

        fpr, tpr, thresholds = roc_curve(df_tem['label'], df_tem['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    print('平均auc为：', np.average(aucs))


start_time = time()

predictors = ['discount_rate','discount_man', 'discount_jian','distance', 'weekday_type'] + weekdaycols

clf, feat_impo = clf_train(train[predictors], train['label'], valid[predictors], valid['label'])
y_pred_ = clf_predict(clf, valid[predictors])
clf_evaluate(valid, y_pred_)

print('训练预测的时间为:', int(time() - start_time))

Train until valid scores didn't improve in 100 rounds.
Early stopping, best iteration is:
[24]	valid_0's auc: 0.648326


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


平均auc为： 0.5529419866221547
训练预测的时间为: 10


#### 可能用到的代码保存

In [16]:
valid1 = valid_.copy()
valid1['pred_prob'] = y_pred_[:, 1]
valid1.head(2)



# avgAUC calculation
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1]
    
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_rate,discount_man,discount_jian,...,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label,pred_prob
1,1439408,4663,11002,150:20,1,20160500.0,,0.866667,150,20,...,1,0,0,0,0,0,1,0,0,0.130769
4,1439408,2632,8591,20:1,0,20160600.0,,0.95,20,1,...,0,1,0,0,0,0,0,0,0,0.230289


0.5529419866221547
