In [2]:
#!/usr/bin/env python
# coding=utf-8
# -------- import basic package --------
import pdb
import math
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import datetime as dt
import warnings
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

import washer.utils.path.pathHandle as ph
import washer.utils.store as store
from washer.sample.feature import FeatSampler
from washer.repairer.isolatedRepairer import GeneralRepairer

from washer.demo.tianchi.mobile.system.utils.preProcess import *
from washer.demo.tianchi.mobile.system.utils.selectFeature import *

PATH_OFFLINE = "F:/codeGit/my project/python/dataset/tianchi/offline/"
PATH_ONLINE = "F:/codeGit/my project/python/dataset/tianchi/online/"
PATH_OF_DATAOUT = "F:/codeGit/my project/python/dataset/tianchi/submission/"
PATH_MODEL = "F:/codeGit/my project/python/dataset/tianchi/model/"



In [3]:
def getOffsetDate(date, offset):
    if offset > 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(offset)).strftime('%Y-%m-%d')
    elif offset < 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(offset * -1)).strftime('%Y-%m-%d')    
    return date_next

In [4]:
def updateParasList(parasList, para, k):
    if len(parasList) < k:
        parasList.append(para)
        minIdx = -1
    else:
        minIdx = Series(parasList).idxmin()
        if para > parasList[minIdx]:
            parasList[minIdx] = para
    return parasList, minIdx

In [5]:
def updateKModelList(kModelList, idx, model, k):
    if len(kModelList) < k:
        kModelList.append(model)
        kModelList[idx] = model
    return kModelList

In [6]:
def trainWithMutliTime(model, df, nTime, k):
    print("trainig model...")
    parasList = []
    kModelList = []

    for i in range(nTime):
        df_train, df_cv = selectSamplesByBagging(df, 0.3, 1)
        X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
        X_train, y_train, le1 = transform_dataset(df_train.drop(['user_id', 'item_id', 'item_category'], axis = 1))
        
        paras = []
        model_new = copy.deepcopy(model)
        y_pred = model_new.fit(X_train, y_train).predict(X_test)
        pred_labels = le.inverse_transform(y_pred)
        F1, P ,R = calc_F1(y_test, pred_labels)
        paras.append(F1)
        paras.append(P)
        paras.append(R)
        parasList, index = updateParasList(parasList, paras[0], k)
        kModelList = updateKModelList(kModelList, index, model_new, k)
        # print 'mean: F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(paras[0], paras[1], paras[2])
    return kModelList, parasList

In [7]:
def getSampleSetWithComplementSet(df, nSamlpe):
    colsName = df.columns
    df = pd.DataFrame(df.values, index = range(len(df.index)))
    df.columns = colsName
    
    df_sample = df.sample(nSamlpe)
    list_comple = Series(df.index.isin(df_sample.index)).apply(lambda x: not x)
    df_comple = df[list_comple]
    return df_sample, df_comple

In [8]:
def selectSamplesByBagging(df, ratioOnPos, ratioP2N):
    colsName = df.columns
    df = pd.DataFrame(df.values, index = range(len(df.index)))
    df.columns = colsName
    
    df_pos = df[df.label == 1]
    list_pos = Series(df.index.isin(df_pos.index))
    list_neg = list_pos.apply(lambda x: not x)
    df_neg = df[list_neg]
 
    len_pos = len(df_pos)  
    len_bag = int(ratioOnPos * len_pos)

    df_bag_pos, df_left_pos = getSampleSetWithComplementSet(df_pos, len_bag)
    df_bag_neg, df_left_neg = getSampleSetWithComplementSet(df_neg, (int(len_bag / ratioP2N)))
                                                            
    df_bag = pd.concat([df_pos.sample(len_bag), df_neg.sample(int(len_bag / ratioP2N))], axis = 0)
    df_left = pd.concat([df_left_pos, df_left_neg], axis = 0)
    return df_bag, df_left

In [9]:
def calc_F1(y ,y_pred):

	y = DataFrame(y, columns = ['label'])
	y_pred = DataFrame(y_pred, columns = ['label'])

	pos_yp = y_pred[y_pred.label == 1]
	neg_yp = y[y_pred.label == 0]
	pos_y = y[y.label == 1]
	neg_y = y[y.label == 0]
	
	## calc TP and TN
	len_yp = len(y_pred); len_y = len(y)
	
	TP = sum(pos_yp.index.isin(pos_y.index))
	FP = len(pos_yp) - TP
	TN = sum(neg_yp.index.isin(neg_y.index))
	FN = len(neg_yp) - TN

	## calc R, P, F1
	if (len(pos_yp) == 0 or len(pos_y) == 0 or TP == 0):
		P = 0; R = 0; F1 = 0
	else:
		R = 1.0 * TP / len(pos_yp) * 100
		P = 1.0 * TP / len(pos_y) * 100
		F1 = (2.0 * R * P) / (R + P)
		# the two following formulation are also right
	# R = 1.0 * TP / (TP + FP) * 100
	# P = 1.0 * TP / (TP + FN) * 100 
	return F1, P, R

In [10]:
## 生成训练集
date = '2014-11-22'
df_train = DataFrame()
while date != '2014-12-08':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_easySet = df[df.last_cart_h > 0]
    df_train = pd.concat([df_train, df_easySet], axis = 0)
    date = getOffsetDate(date, 1)
print(df_train.iloc[:10])

PATH_OF_OFFLINE = 'F:/codeGit/my project/python/dataset/tianchi/offline/'
# df_train.to_csv(PATH_OF_OFFLINE + 'validTrainSet' + .csv', mode = 'w', index = False)
df_train = pd.read_csv(PATH_OF_OFFLINE + 'validTrainSet.csv')

KeyboardInterrupt: 

In [26]:
## 训练多个随机森林模型
model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
models_RF, paraList_RF = trainWithMutliTime(model, df_train, 10, 6)
print(paraList_RF)
## 训练多个gbdt模型
model = GradientBoostingClassifier(max_depth=6)
models_GBDT, paraList_GBDT = trainWithMutliTime(model, df_train, 10, 6)
print(paraList_GBDT)
models_RF.extend(models_GBDT)
models = models_RF

trainig model...
[8.83977900552486, 8.717203831425408, 8.818454459553838, 8.75292118121946, 8.669575786558273, 8.949853366334574]
trainig model...
[8.543804061006796, 8.529735071955143, 8.531786279362246, 8.636205235430374, 8.629379476401331, 8.634670617965556]


In [24]:
## 生成测试集
date = '2014-12-08'
df_test = DataFrame()
while date != '2014-12-11':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_test = pd.concat([df_test, df], axis = 0)
    date = getOffsetDate(date, 1)
print(df)

date = '2014-12-16'
while date != '2014-12-19':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_test = pd.concat([df_test, df], axis = 0)
    date = getOffsetDate(date, 1)
print(df)
df_test.to_csv(PATH_OF_OFFLINE + 'validTestSet' + '.csv', mode = 'w', index = False)
df_test = df_test.drop(['user_id', 'item_id', 'item_category'], axis = 1)

In [54]:
def trainStackingModel(models, stackModel, X, y): 
    X_layer1 = []
    for i in range(len(models)):
        X_layer1.append(models[i].predict(X).tolist())
    X_layer1 = np.array(X_layer1).T
    stackModel = stackModel.fit(X_layer1, y)
    return stackModel

In [56]:
def predictStackingModel(models, stackModel, X):
    X_layer1 = []
    for i in range(len(models)):
        X_layer1.append(models[i].predict(X))
    X_layer1 = np.array(X_layer1).T        
    y = stackModel.predict(X_layer1)
    return y

In [61]:
## 单模型测试

df_test_1 = df_test[df_test.last_cart_h > 0]
df_test_2 = df_test[df_test.last_cart_h <= 0]

X_test_1, y_test_1, le = transform_dataset(df_test_1)
X_test_2, y_test_2, le = transform_dataset(df_test_2)
model = models[0]
y_pred_1 = model.predict(X_test_1)

y_test = np.zeros(len(y_test_1) + len(y_test_2))
y_pred = np.zeros(len(y_test_1) + len(y_test_2))

y_test[: len(y_test_1)] = y_test_1
y_test[len(y_test_1):] = y_test_2

y_pred[: len(y_pred_1)] = y_pred_1
y_pred[len(y_pred_1):] = 0

F1, P, R = calc_F1(y_test, y_pred)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

F1/P/R 7.43%/23.26%/4.42%



In [63]:
## 直接单模型进行stacking
df_bag, df_cv = selectSamplesByBagging(df_train, 0.6, 1)
X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
X_train, y_train, le1 = transform_dataset(df_bag.drop(['user_id', 'item_id', 'item_category'], axis = 1))

stackModel = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
stackModel = trainStackingModel(models, stackModel, X_train, y_train)
y_pred = predictStackingModel(models, stackModel, X_test)
pred_labels = le.inverse_transform(y_pred)
F1, P ,R = calc_F1(y_test, pred_labels)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

df_test_1 = df_test[df_test.last_cart_h > 0]
df_test_2 = df_test[df_test.last_cart_h <= 0]

X_test_1, y_test_1, le = transform_dataset(df_test_1)
X_test_2, y_test_2, le = transform_dataset(df_test_2)
y_pred_1 = predictStackingModel(models, stackModel, X_test_1)

y_test = np.zeros(len(y_test_1) + len(y_test_2))
y_pred = np.zeros(len(y_test_1) + len(y_test_2))

y_test[: len(y_test_1)] = y_test_1
y_test[len(y_test_1):] = y_test_2

y_pred[: len(y_pred_1)] = y_pred_1
y_pred[len(y_pred_1):] = 0

F1, P, R = calc_F1(y_test, y_pred)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

(9146L, 12L)
F1/P/R 5.56%/88.62%/2.87%

F1/P/R 7.34%/22.70%/4.37%



如上对单模型进行stacking的结果还比不上单模型直接预测的结果，这说明了stacking并不能有效减少方差。
- 若使用类平衡化的训练集进行训练，训练结果的cv与test集的结果如下：

F1/P/R 5.56%/88.62%/2.87%  
F1/P/R 7.34%/22.70%/4.37%  

- 若使用类不平衡的训练集进行训练，训练结果的cv与test集的结果如下：

F1/P/R 5.51%/4.46%/7.21%  
F1/P/R 0.30%/0.17%/1.27%  

In [90]:
def baggingModels(models, X):
    y_pred = 0
    nModel = len(models)
    decs = nModel - 1
    for i in range(nModel):
        y_pred = y_pred + models[i].predict(X)
    y_pred = DataFrame(y_pred, columns = ['label'])
    # y_pred.label = y_pred.label.apply(lambda x: 1 if x > decs else 0)
    return y_pred.label

In [80]:
## 训练多个随机森林模型
model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
models_RF, paraList_RF = trainWithMutliTime(model, df_train, 40, 40)
print(paraList_RF)
## 训练多个gbdt模型
model = GradientBoostingClassifier(max_depth=6)
models_GBDT, paraList_GBDT = trainWithMutliTime(model, df_train, 40, 40)
print(paraList_GBDT)


trainig model...
[8.641329497213446, 8.586366775180032, 8.923449682636653, 8.799493255521748, 8.598659923395916, 8.254351197656387, 8.421669679058182, 8.667421517249672, 8.694850304309515, 8.816912214492334, 8.715393578976977, 8.426737410583165, 8.910704172172927, 8.687780964585748, 8.56263666676561, 8.482033010836295, 8.609917804360292, 8.855336494468013, 8.375670943482879, 8.749795517749062, 8.402051439953045, 8.64275987290059, 8.74073924337218, 8.766836614314585, 8.648703506368296, 8.518591383041512, 8.8845241702101, 8.665677024697878, 8.531784322050253, 8.356851828663347, 8.695305332854693, 8.507644961164575, 8.85290825985545, 8.58985518714885, 8.989946839307333, 8.856925782916852, 8.372589781244637, 8.746149573296975, 8.542590099165595, 8.646730253473411]
trainig model...
[8.539765991443739, 8.35131614276196, 8.333497469027593, 8.517324738114423, 8.468554777159051, 8.356726408982942, 8.644654520817674, 8.724903433931832, 8.807133714841807, 8.200812813770023, 8.510850785471034, 8.2

In [91]:
## 使用bagging模型再使用stacking模型
df_bag, df_cv = selectSamplesByBagging(df_train, 0.6, 1)
print sum(df_cv.label), len(df_cv)
X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
X_train, y_train, le1 = transform_dataset(df_bag.drop(['user_id', 'item_id', 'item_category'], axis = 1))

models = models_RF[:40]
models.extend(models_GBDT[:40])
X_layer1 = []
nStackBaseModel = 8
nBagModel = int(len(models) / nStackBaseModel)

df_test_1 = df_test[df_test.last_cart_h > 0]
X_test_1, y_test_1, le = transform_dataset(df_test_1)

y_pred_layer1 = []
for i in range(nStackBaseModel):
    X_layer1.append(baggingModels(models[i * nBagModel : (i+1) * nBagModel], X_train).tolist())
    y_pred = baggingModels(models[i * nBagModel : (i+1) * nBagModel], X_test_1)
    test(df_test, y_pred)
    
    y_pred_layer1.append(y_pred)
    # pred_labels = le.inverse_transform(y_pred)
    # F1, P ,R = calc_F1(y_test, pred_labels)
    # print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)
    
X_layer1 = np.array(X_layer1).T
stackModel = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
stackModel = stackModel.fit(X_layer1, y_train)

## 测试
X_layer1 = np.array(y_pred_layer1).T
y_pred = stackModel.predict(X_layer1)
test(df_test, y_pred)



3050.0 292181
80 10
10
10
F1/P/R 1.23%/1.47%/1.06%

10
10
F1/P/R 1.65%/1.93%/1.44%

10
10
F1/P/R 1.45%/1.69%/1.27%

10
10
F1/P/R 1.33%/1.57%/1.16%

10
10
F1/P/R 1.11%/1.49%/0.89%

10
10
F1/P/R 1.00%/1.32%/0.80%

10
10
F1/P/R 1.25%/1.69%/1.00%

10
10
F1/P/R 1.44%/1.96%/1.14%

F1/P/R 7.26%/22.31%/4.33%



In [82]:
def test(df_test, y_pred_1):
    df_test_1 = df_test[df_test.last_cart_h > 0]
    df_test_2 = df_test[df_test.last_cart_h <= 0]

    X_test_1, y_test_1, le = transform_dataset(df_test_1)
    # y_pred_1 = baggingModels(models, X_test_1)

    X_test_2, y_test_2, le = transform_dataset(df_test_2)

    y_test = np.zeros(len(y_test_1) + len(y_test_2))
    y_pred = np.zeros(len(y_test_1) + len(y_test_2))

    y_test[: len(y_test_1)] = y_test_1
    y_test[len(y_test_1):] = y_test_2

    y_pred[: len(y_pred_1)] = y_pred_1
    y_pred[len(y_pred_1):] = 0

    F1, P, R = calc_F1(y_test, y_pred)
    print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

In [92]:
## 提交
df_sub = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + '2014-12-19_' + '4days.csv')
itemSet = pd.read_csv('F:/codeGit/dataset/tianchi_mobile/tianchi_fresh_comp_train_item.csv', usecols = ['item_id'])
itemSet = itemSet['item_id'].apply(lambda x: str(x))
df_sub = df_sub[df_sub['item_id'].isin(itemSet)]

df_sub = df_sub[df_sub.last_cart_h > 0]
uidList = df_sub[['user_id', 'item_id']]
print len(df_sub)
X_sub = df_sub.drop(['user_id', 'item_id', 'item_category'], axis = 1).values

models = models_RF[:40]
models.extend(models_GBDT[:40])
nStackBaseModel = 8
nBagModel = int(len(models) / nStackBaseModel)
y_pred_layer1 = []
for i in range(nStackBaseModel):
    y_pred_layer1.append(baggingModels(models[i * nBagModel : (i+1) * nBagModel], X_sub).tolist())
X_layer1 = np.array(y_pred_layer1).T
y_sub_pred = stackModel.predict(X_layer1)

predList = pd.DataFrame(y_sub_pred, index = uidList.index, columns = ['label'])
print predList
uidList['label'] = predList
print uidList
uidList = uidList[uidList['label'] == 1]

uidList[['user_id', 'item_id']].to_csv(PATH_OF_DATAOUT + 'tianchi_mobile_recommendation_predict.csv', mode = 'w', index = False)

1831
10
10
10
10
10
10
10
10
         label
1207         1
1215         1
1223         0
2905         0
3000         1
3018         1
3024         1
3039         1
3042         1
5295         1
5828         0
5868         0
10346        0
10359        0
10379        0
10389        0
10396        0
10399        1
10407        0
12093        0
12453        0
13015        0
13026        0
13790        0
13875        1
17076        0
17621        0
17721        1
17734        0
17743        1
...        ...
1209604      0
1209631      1
1209653      0
1209654      0
1209663      0
1209674      0
1209679      0
1209680      0
1209699      0
1209741      1
1209751      0
1212805      0
1212809      0
1212826      0
1212847      0
1212918      1
1212931      0
1212953      1
1212956      1
1212958      0
1212987      1
1213001      1
1213005      0
1213007      0
1215015      1
1215177      0
1215215      0
1215217      0
1215440      1
1215566      0

[1831 rows x 1 columns]
           user_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
