建模过程：训练数据-特征提取-建立模型-预测-结果


In [2]:
import os, sys, pickle

import numpy as np
import pandas as pd

from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


In [4]:
#读取三个表格的数据：线上训练集、线下训练集、线下测试集
dfoff = pd.read_csv('ccf_offline_stage1_train.csv',keep_default_na = False)#必须设置keep_default_na=Falese，否则读取为NaN
dfon = pd.read_csv('ccf_online_stage1_train.csv',keep_default_na = False)
dftest = pd.read_csv('ccf_offline_stage1_test_revised.csv',keep_default_na = False)

In [5]:
#查看数据前6行
dfoff.head(10)
#dfon.head(6)
#dftest.head(6)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0,,20160217.0
1,1439408,4663,11002.0,150:20,1,20160528.0,
2,1439408,2632,8591.0,20:1,0,20160217.0,
3,1439408,2632,1078.0,20:1,0,20160319.0,
4,1439408,2632,8591.0,20:1,0,20160613.0,
5,1439408,2632,,,0,,20160516.0
6,1439408,2632,8591.0,20:1,0,20160516.0,20160613.0
7,1832624,3381,7610.0,200:20,0,20160429.0,
8,2029232,3381,11951.0,200:20,1,20160129.0,
9,2029232,450,1532.0,30:5,0,20160530.0,


In [6]:
shape_dfoff = dfoff.shape
print(shape_dfoff)
#做描述统计
print('有优惠券，购买商品: %d' % dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0])
print('有优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] == 'null')].shape[0])
print('无优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] != 'null')].shape[0])
print('无优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] == 'null')].shape[0])


(1754884, 7)
有优惠券，购买商品: 75382
有优惠卷，未购商品：977900
无优惠卷，购买商品：701602
无优惠卷，未购商品：0


In [7]:
## 特征提取：折扣、距离
## 首先考虑折扣
print('Discount_rate 类型：\n',dfoff['Discount_rate'].unique())
#可以提取4中特征：打折类型、折扣率、满多少、减多少

Discount_rate 类型：
 ['null' '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30'
 '50:20' '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1'
 '200:10' '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50'
 '0.75' '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100'
 '150:5']


In [15]:
# 打折类型的转换
def getDiscountType(row):#自动一行一行读取
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0
#折扣率转换：没打折-1.0，满减-计算，折扣率-不变
def convertRate(row):
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
#满多少的转换
def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0
#满多少的转换
def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        #print(int(rows[1]))
        return int(rows[1])
    else:
        return 0
#应用定义的函数
def processData(df):
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    print('折扣率类型 : \n',df['discount_rate'].unique())
    return df


In [16]:
dfoff = processData(dfoff)
dftest = processData(dftest)
dfoff.head(10)

折扣率类型 : 
 [1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
折扣率类型 : 
 [0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,discount_jian
0,1439408,2632,,,0,,20160217.0,,1.0,0,0
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,20
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,1
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,1
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,1
5,1439408,2632,,,0,,20160516.0,,1.0,0,0
6,1439408,2632,8591.0,20:1,0,20160516.0,20160613.0,1.0,0.95,20,1
7,1832624,3381,7610.0,200:20,0,20160429.0,,1.0,0.9,200,20
8,2029232,3381,11951.0,200:20,1,20160129.0,,1.0,0.9,200,20
9,2029232,450,1532.0,30:5,0,20160530.0,,1.0,0.833333,30,5


In [17]:
## 第二个特征：距离
print('Distance 类型：\n',dfoff['Distance'].unique())

distance 类型：
 ['0' '1' 'null' '2' '10' '4' '7' '9' '3' '5' '6' '8']


In [20]:
# 将distance里的null转化为-1
dfoff['distance'] = dfoff['Distance'].replace('null', -1).astype(int)
print(dfoff['distance'].unique())
dftest['distance'] = dftest['Distance'].replace('null', -1).astype(int)
print(dftest['distance'].unique())
dfoff.head(20)


[ 0  1 -1  2 10  4  7  9  3  5  6  8]
[ 1 -1  5  2  0 10  3  6  7  4  9  8]


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,discount_jian,distance
0,1439408,2632,,,0.0,,20160217.0,,1.0,0,0,0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,,1.0,0.866667,150,20,1
2,1439408,2632,8591.0,20:1,0.0,20160217.0,,1.0,0.95,20,1,0
3,1439408,2632,1078.0,20:1,0.0,20160319.0,,1.0,0.95,20,1,0
4,1439408,2632,8591.0,20:1,0.0,20160613.0,,1.0,0.95,20,1,0
5,1439408,2632,,,0.0,,20160516.0,,1.0,0,0,0
6,1439408,2632,8591.0,20:1,0.0,20160516.0,20160613.0,1.0,0.95,20,1,0
7,1832624,3381,7610.0,200:20,0.0,20160429.0,,1.0,0.9,200,20,0
8,2029232,3381,11951.0,200:20,1.0,20160129.0,,1.0,0.9,200,20,1
9,2029232,450,1532.0,30:5,0.0,20160530.0,,1.0,0.833333,30,5,0


In [44]:
## 第三个特征：领券日期
#返回周几
def getWeekday(row):
    if row == "null":
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1
dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

#将周六、日定义为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6,7] else 0 )

#将周几进行编码
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]#输出格式为 weekday_1
#print(weekdaycols)
tmpdf = pd.get_dummies(dfoff['weekday'].replace('null',np.nan))#将weekady这一列转化为哑变量，np.nan用所有都是0来代替
#print(tmpdf.head(6))
tmpdf.columns = weekdaycols #建立一个索引关系
#print(tmpdf.columns)
dfoff[weekdaycols] = tmpdf

#test 中周几的编码
tmpdf = pd.get_dummies(dftest['weekday'].replace('null',np.nan))#将weekady这一列转化为哑变量，np.nan用所有都是0来代替
#print(tmpdf.head(6))
tmpdf.columns = weekdaycols #建立一个索引关系
#print(tmpdf.columns)
dftest[weekdaycols] = tmpdf

dfoff.head(10)
#dftest.head(6)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,distance,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0,,20160217.0,,1.0,0,...,0,,0,0,0,0,0,0,0,0
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,...,1,6.0,1,0,0,0,0,0,1,0
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,...,0,3.0,0,0,0,1,0,0,0,0
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,...,0,6.0,1,0,0,0,0,0,1,0
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,...,0,1.0,0,1,0,0,0,0,0,0
5,1439408,2632,,,0,,20160516.0,,1.0,0,...,0,,0,0,0,0,0,0,0,0
6,1439408,2632,8591.0,20:1,0,20160516.0,20160613.0,1.0,0.95,20,...,0,1.0,0,1,0,0,0,0,0,0
7,1832624,3381,7610.0,200:20,0,20160429.0,,1.0,0.9,200,...,0,5.0,0,0,0,0,0,1,0,0
8,2029232,3381,11951.0,200:20,1,20160129.0,,1.0,0.9,200,...,1,5.0,0,0,0,0,0,1,0,0
9,2029232,450,1532.0,30:5,0,20160530.0,,1.0,0.833333,30,...,0,1.0,0,1,0,0,0,0,0,0


In [54]:
## 对样本进行标注
def label(row):
    if row['Date_received'] == 'null':
        return -1
    elif row['Date'] != 'null': 
        td = pd.to_datetime(row['Date'], format = '%Y%m%d') - pd.to_datetime(row['Date_received'], format = '%Y%m%d')
        if td <= pd.Timedelta(15,'D'):#时间上的差异为15，按天计算
            return 1
        else:
            return 0
    else:
        return 0
        
dfoff['label'] = dfoff.apply(label,axis=1)    
dfoff.head(10)       

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label
0,1439408,2632,,,0,,20160217.0,,1.0,0,...,,0,0,0,0,0,0,0,0,-1
1,1439408,4663,11002.0,150:20,1,20160528.0,,1.0,0.866667,150,...,6.0,1,0,0,0,0,0,1,0,0
2,1439408,2632,8591.0,20:1,0,20160217.0,,1.0,0.95,20,...,3.0,0,0,0,1,0,0,0,0,0
3,1439408,2632,1078.0,20:1,0,20160319.0,,1.0,0.95,20,...,6.0,1,0,0,0,0,0,1,0,0
4,1439408,2632,8591.0,20:1,0,20160613.0,,1.0,0.95,20,...,1.0,0,1,0,0,0,0,0,0,0
5,1439408,2632,,,0,,20160516.0,,1.0,0,...,,0,0,0,0,0,0,0,0,-1
6,1439408,2632,8591.0,20:1,0,20160516.0,20160613.0,1.0,0.95,20,...,1.0,0,1,0,0,0,0,0,0,0
7,1832624,3381,7610.0,200:20,0,20160429.0,,1.0,0.9,200,...,5.0,0,0,0,0,0,1,0,0,0
8,2029232,3381,11951.0,200:20,1,20160129.0,,1.0,0.9,200,...,5.0,0,0,0,0,0,1,0,0,0
9,2029232,450,1532.0,30:5,0,20160530.0,,1.0,0.833333,30,...,1.0,0,1,0,0,0,0,0,0,0


In [57]:
print(dfoff['label'].value_counts())

 0    988887
-1    701602
 1     64395
Name: label, dtype: int64


In [58]:
#划分训练集和验证集
df = dfoff[dfoff['label'] != -1].copy()#筛选出标签为0和1的用户
train = df[(df['Date_received'] < '20160516')].copy()#小于20160516的训练集
valid = df[(df['Date_received'] >= '20160516') & (df['Date_received'] <= '20160615')].copy()
print('Train Set: \n', train['label'].value_counts())
print('Valid Set: \n', valid['label'].value_counts())


Train Set: 
 0    759172
1     41524
Name: label, dtype: int64
Valid Set: 
 0    229715
1     22871
Name: label, dtype: int64


In [85]:

#特征数量
original_feature=['discount_rate','discount_type','discount_man','discount_jian','distance','weekday','weekday_type']+weekdaycols
print ('共有特征：',len(original_feature),'个')
print(original_feature)

#建立模型-logistic回归(随机梯度下降法更新模型)
def check_model(data, predictors):
    classifier=lambda:SGDClassifier(
        loss='log',#logistic回归函数和损失函数
        penalty='elasticnet',#正则化项
        fit_intercept=True,#有截距项
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None)#设置不同类别的权重，没有给出是，默认都为1
    #使用sklearn 的 pipeline,将整个过程流程化，首先标准化数据、之后分类
    model=Pipeline(steps=[
        ('ss',StandardScaler()),
        ('en',classifier())   
        ])
    #可选择的参数
    parameters={
       'en__alpha':[0.001,0.01,0.1],
        'en__l1_ratio':[0.001,0.01,0.1]
        }
    #确定折叠次数
    folder=StratifiedKFold(n_splits=3,shuffle=True)  
    grid_search=GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search=grid_search.fit(data[predictors],data['label'])  
    return grid_search


共有特征： 14 个
['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [86]:
#训练模型
predictors = original_feature
model = check_model(train,predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   48.6s finished


In [87]:
# 模型验证
y_valid_pred = model.predict_proba(valid[predictors])#调用训练好的模型，输入验证集数据
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(20)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label,pred_prob
1,1439408,4663,11002,150:20,1,20160528,,1,0.866667,150,...,1,0,0,0,0,0,1,0,0,0.019429
4,1439408,2632,8591,20:1,0,20160613,,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.101248
6,1439408,2632,8591,20:1,0,20160516,20160613.0,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.101248
9,2029232,450,1532,30:5,0,20160530,,1,0.833333,30,...,0,1,0,0,0,0,0,0,0,0.098011
10,2029232,6459,12737,20:1,0,20160519,,1,0.95,20,...,0,0,0,0,1,0,0,0,0,0.130609


In [99]:

#fpr是错误的正类类别，tpr是正确的正类类别
aucs = []
fpr, tpr, thresholds = roc_curve(valid1['label'], valid1['pred_prob'], pos_label=1)
aucs.append(auc(fpr, tpr))
#计算auc的值
print(np.average(aucs))


0.6226297566068204


In [101]:
y_test_pred=model.predict_proba(dftest[predictors])
dftest1=dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['Probability']=y_test_pred[:,1]
#将生成的csv保存到本地
dftest1.to_csv('E:/yanjiu/class/tianchi/submit2.csv',index=False,header=False)#index-行名 header-列名
dftest1.head(5)


Unnamed: 0,User_id,Coupon_id,Date_received,Probability
0,4129537,9983,20160712,0.10682
1,6949378,3429,20160706,0.149929
2,2166529,6928,20160727,0.005259
3,2166529,1808,20160727,0.017902
4,6172162,6500,20160708,0.063804


In [50]:
import datetime
datetime.datetime.now()
datetime.datetime.now().weekday()
date(2020,4,11).weekday()
pd.to_datetime('20200410')-pd.to_datetime('20200409')
pd.Timedelta(15)

Timedelta('0 days 00:00:00.000000')