In [105]:
#!/usr/bin/env python
# coding=utf-8
# -------- import basic package --------
import pdb
import math
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import datetime as dt
import warnings
import copy
import random
import cPickle as pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

import washer.utils.path.pathHandle as ph
import washer.utils.store as store
from washer.sample.feature import FeatSampler
from washer.repairer.isolatedRepairer import GeneralRepairer

from washer.demo.tianchi.mobile.system.utils.preProcess import *
from washer.demo.tianchi.mobile.system.utils.selectFeature import *

PATH_OFFLINE = "F:/codeGit/my project/python/dataset/tianchi/offline/"
PATH_ONLINE = "F:/codeGit/my project/python/dataset/tianchi/online/"
PATH_OF_DATAOUT = "F:/codeGit/my project/python/dataset/tianchi/submission/"
PATH_MODEL = "F:/codeGit/my project/python/dataset/tianchi/model/gbdt_rf/"

In [68]:
def getOffsetDate(date, offset):
    if offset > 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(offset)).strftime('%Y-%m-%d')
    elif offset < 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(offset * -1)).strftime('%Y-%m-%d')    
    return date_next

In [69]:
def updateParasList(parasList, para, k):
    if len(parasList) < k:
        parasList.append(para)
        minIdx = -1
    else:
        minIdx = Series(parasList).idxmin()
        if para > parasList[minIdx]:
            parasList[minIdx] = para
    return parasList, minIdx

In [70]:
def updateKModelList(kModelList, idx, model, k):
    if len(kModelList) < k:
        kModelList.append(model)
        kModelList[idx] = model
    return kModelList

In [71]:
def trainWithMutliTime(model, df, nTime, k):
    print("trainig model...")
    parasList = []
    kModelList = []

    for i in range(nTime):
        df_train, df_cv = selectSamplesByBagging(df, 0.3, 1)
        X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
        X_train, y_train, le1 = transform_dataset(df_train.drop(['user_id', 'item_id', 'item_category'], axis = 1))
        
        paras = []
        model_new = copy.deepcopy(model)
        y_pred = model_new.fit(X_train, y_train).predict(X_test)
        pred_labels = le.inverse_transform(y_pred)
        F1, P ,R = calc_F1(y_test, pred_labels)
        paras.append(F1)
        paras.append(P)
        paras.append(R)
        parasList, index = updateParasList(parasList, paras[0], k)
        kModelList = updateKModelList(kModelList, index, model_new, k)
        # print 'mean: F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(paras[0], paras[1], paras[2])
    return kModelList, parasList

In [72]:
def trainWithLowVariance(model, df, df_test, nTime, k):
    print("trainig model...")
    parasList = []
    kModelList = []
    X_test_1, y_test_1, le = transform_dataset(df_test.drop(['user_id', 'item_id', 'item_category'], axis = 1))
    for i in range(nTime):
        df_train, df_cv = selectSamplesByBagging(df, 0.3, 1)
        X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
        X_train, y_train, le1 = transform_dataset(df_train.drop(['user_id', 'item_id', 'item_category'], axis = 1))

        
        paras = []
        model_new = copy.deepcopy(model)
        y_pred = model_new.fit(X_train, y_train).predict(X_test)
        pred_labels = le.inverse_transform(y_pred)
        F1, P ,R = calc_F1(y_test, pred_labels)
        
        y_pred = model_new.predict(X_test_1)
        pred_labels = le.inverse_transform(y_pred)
        F1_1, P_1 ,R_1 = calc_F1(y_test_1, pred_labels)
        paras.append(F1 + F1_1)
        paras.append(P + P_1)
        paras.append(R + R_1)
        parasList, index = updateParasList(parasList, paras[0], k)
        kModelList = updateKModelList(kModelList, index, model_new, k)
        # print 'mean: F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(paras[0], paras[1], paras[2])
    return kModelList, parasList

In [73]:
def getSampleSetWithComplementSet(df, nSamlpe):
    colsName = df.columns
    df = pd.DataFrame(df.values, index = range(len(df.index)))
    df.columns = colsName
    
    df_sample = df.sample(nSamlpe)
    list_comple = Series(df.index.isin(df_sample.index)).apply(lambda x: not x)
    df_comple = df[list_comple]
    return df_sample, df_comple

In [74]:
def selectSamplesByBagging(df, ratioOnPos, ratioP2N):
    colsName = df.columns
    df = pd.DataFrame(df.values, index = range(len(df.index)))
    df.columns = colsName
    
    df_pos = df[df.label == 1]
    list_pos = Series(df.index.isin(df_pos.index))
    list_neg = list_pos.apply(lambda x: not x)
    df_neg = df[list_neg]
 
    len_pos = len(df_pos)  
    len_bag = int(ratioOnPos * len_pos)

    df_bag_pos, df_left_pos = getSampleSetWithComplementSet(df_pos, len_bag)
    df_bag_neg, df_left_neg = getSampleSetWithComplementSet(df_neg, (int(len_bag / ratioP2N)))
                                                            
    df_bag = pd.concat([df_pos.sample(len_bag), df_neg.sample(int(len_bag / ratioP2N))], axis = 0)
    df_left = pd.concat([df_left_pos, df_left_neg], axis = 0)
    return df_bag, df_left

In [75]:
def calc_F1(y ,y_pred):

	y = DataFrame(y, columns = ['label'])
	y_pred = DataFrame(y_pred, columns = ['label'])

	pos_yp = y_pred[y_pred.label == 1]
	neg_yp = y[y_pred.label == 0]
	pos_y = y[y.label == 1]
	neg_y = y[y.label == 0]
	
	## calc TP and TN
	len_yp = len(y_pred); len_y = len(y)
	
	TP = sum(pos_yp.index.isin(pos_y.index))
	FP = len(pos_yp) - TP
	TN = sum(neg_yp.index.isin(neg_y.index))
	FN = len(neg_yp) - TN

	## calc R, P, F1
	if (len(pos_yp) == 0 or len(pos_y) == 0 or TP == 0):
		P = 0; R = 0; F1 = 0
	else:
		R = 1.0 * TP / len(pos_yp) * 100
		P = 1.0 * TP / len(pos_y) * 100
		F1 = (2.0 * R * P) / (R + P)
		# the two following formulation are also right
	# R = 1.0 * TP / (TP + FP) * 100
	# P = 1.0 * TP / (TP + FN) * 100 
	return F1, P, R

In [None]:
## 生成训练集
date = '2014-11-22'
df_train = DataFrame()
while date != '2014-12-08':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_easySet = df[df.last_cart_h > 0]
    df_train = pd.concat([df_train, df_easySet], axis = 0)
    date = getOffsetDate(date, 1)
print(df_train.iloc[:10])

In [11]:
PATH_OF_OFFLINE = 'F:/codeGit/my project/python/dataset/tianchi/offline/'
# df_train.to_csv(PATH_OF_OFFLINE + 'validTrainSet' + .csv', mode = 'w', index = False)
df_train = pd.read_csv(PATH_OF_OFFLINE + 'validTrainSet.csv')

## 生成测试集
date = '2014-12-08'
df_test = DataFrame()
while date != '2014-12-11':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_easySet = df[df.last_cart_h > 0]
    df_test = pd.concat([df_test, df_easySet], axis = 0)
    date = getOffsetDate(date, 1)
print(df_test)


           user_id    item_id  item_category  uid_behav_cnt  uid_buy_cnt  \
0              492     254885           6344             11          1.0   
5              492  169720786           6344             17          2.0   
314         131694  308748219           1863              4          0.0   
316         131694  379716378           1863              4          0.0   
319         131694  386870271           1863              6          0.0   
379         137907  272049110          10392              3          0.0   
443         160959   75812179           5800              5          0.0   
444         160959  156277648           5800              4          0.0   
456         160959  352492634           5800              9          0.0   
457         160959  382552889           5800              5          0.0   
546         173019  139048812           1385              3          0.0   
661         186960   10938912           1723              2          0.0   
664         

In [84]:
def trainGBDTandRF(df_train, df_test):
   
    models = []
    parasList = []
    while(len(models) < 30):
        max_length = random.randint(4,12)
        model = GradientBoostingClassifier(max_depth = max_length)
        models_GBDT, paraList_GBDT = trainWithLowVariance(model, df_train, df_test, 10, 3)
        if Series(paraList_GBDT).mean() > Series(parasList).mean() or len(parasList) == 0:
            parasList.extend(paraList_GBDT)
            models.extend(models_GBDT)
            #saveModelToPickle(PATH_MODEL + 'models.pkl', PATH_MODEL + 'paras.pkl', models, parasList)
        print 'now %d models training finish' %len(models)
    
    parasList_1 = []
    while(len(models) < 60):
        n_estimators = random.randint(100,200)
        model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=4, random_state=2)
        models_RF, paraList_RF = trainWithLowVariance(model, df_train, df_test, 10, 3)
        parasList_1.extend(paraList_RF)
        models.extend(models_RF)
        #saveModelToPickle(PATH_MODEL + 'models.pkl', PATH_MODEL + 'paras.pkl', models, parasList_1)
        print 'now %d models training finish' %len(models)
    return models

In [100]:
def baggingModels(models, X, decs):
    y_pred = 0
    nModel = len(models)
    # decs = nModel - 10
    for i in range(nModel):
        y_pred = y_pred + models[i].predict(X)
    y_pred = DataFrame(y_pred, columns = ['label'])
    y_pred.label = y_pred.label.apply(lambda x: 1 if x > decs else 0)
    return y_pred.label

In [107]:
def saveModelToPickle(path_model, path_para, models, paras):
    for i in range(len(models)):
        path = path_model[:-4] + '_' + str(i) + '.pkl'
        with open(path, 'w') as f: 
            pickle.dump(models[i], f)
    with open(path_para, 'w') as f: 
        pickle.dump(paras, f)

def readModelFromPickle(path):
    with open(path, 'r') as f:
        model = pickle.load(f)
    return model
def readParasFromPickle(path):
    with open(path, 'r') as f:
        paras = pickle.load(f)
    return paras

In [89]:
models = trainGBDTandRF(df_train, df_test)
saveModelToPickle(PATH_MODEL + 'models.pkl', PATH_MODEL + 'paras.pkl', models, [1])

trainig model...
now 3 models training finish
trainig model...
now 6 models training finish
trainig model...
now 6 models training finish
trainig model...
now 9 models training finish
trainig model...
now 12 models training finish
trainig model...
now 15 models training finish
trainig model...
now 15 models training finish
trainig model...
now 15 models training finish
trainig model...
now 18 models training finish
trainig model...
now 18 models training finish
trainig model...
now 18 models training finish
trainig model...
now 18 models training finish
trainig model...
now 18 models training finish
trainig model...
now 18 models training finish
trainig model...
now 21 models training finish
trainig model...
now 24 models training finish
trainig model...
now 24 models training finish
trainig model...
now 24 models training finish
trainig model...
now 24 models training finish
trainig model...
now 24 models training finish
trainig model...
now 24 models training finish
trainig model...


NameError: global name 'path_model' is not defined

In [108]:
saveModelToPickle(PATH_MODEL + 'models.pkl', PATH_MODEL + 'paras.pkl', models, [1])

In [39]:
date = '2014-12-18'
df_test_final = DataFrame()
while date != '2014-12-19':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_test_final = pd.concat([df_test_final, df], axis = 0)
    date = getOffsetDate(date, 1)

df_test_final.to_csv(PATH_OF_OFFLINE + 'validTestSet' + '.csv', mode = 'w', index = False)
df_test_final = df_test_final.drop(['user_id', 'item_id', 'item_category'], axis = 1)
print(df_test_final)

         uid_behav_cnt  uid_buy_cnt  user_buy_cnt  user_cnt  item_buy_cnt  \
0                    2          0.0           1.0        32           0.0   
1                    2          0.0           1.0        32           0.0   
2                    3          0.0           1.0        32           0.0   
3                    3          0.0           1.0        32           0.0   
4                    2          0.0           1.0        32           0.0   
5                    2          0.0           1.0        32           0.0   
6                   12          1.0           1.0        32           1.0   
7                    3          0.0           1.0        32           0.0   
8                    3          0.0           1.0        32           0.0   
9                    2          0.0           0.0        81           0.0   
10                   1          0.0           0.0        81           0.0   
11                   2          0.0           0.0        81           0.0   

In [109]:
def testForBestDesc(models, df_test):

    for i in range(10):
        decs = i + 50
        df_test_1 = df_test[df_test.last_cart_h > 0]
        df_test_2 = df_test[df_test.last_cart_h <= 0]
        # df_test_1 = df_test_1.drop(['user_id', 'item_id', 'item_category'], axis = 1)
        # df['label'].apply(lambda x: int(x))
        X_test_1, y_test_1, le = transform_dataset(df_test_1)
        y_pred_1 = baggingModels(models, X_test_1, decs)

        X_test_2, y_test_2, le = transform_dataset(df_test_2)

        y_test = np.zeros(len(y_test_1) + len(y_test_2))
        y_pred = np.zeros(len(y_test_1) + len(y_test_2))

        y_test[: len(y_test_1)] = y_test_1
        y_test[len(y_test_1):] = y_test_2

        y_pred[: len(y_pred_1)] = y_pred_1
        y_pred[len(y_pred_1):] = 0

        F1, P, R = calc_F1(y_test, y_pred)
        print decs
        print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

In [104]:
testForBestDesc(models, df_test_final)

29
F1/P/R 9.70%/23.44%/6.12%

30
F1/P/R 9.83%/23.37%/6.22%

31
F1/P/R 9.76%/22.77%/6.21%

32
F1/P/R 9.79%/22.44%/6.26%

33
F1/P/R 9.89%/22.24%/6.36%

34
F1/P/R 10.05%/22.17%/6.50%

35
F1/P/R 10.19%/22.04%/6.63%

36
F1/P/R 10.26%/21.70%/6.72%

37
F1/P/R 10.24%/21.24%/6.75%

38
F1/P/R 10.32%/20.91%/6.85%

39
F1/P/R 10.29%/20.44%/6.88%

40
F1/P/R 10.38%/20.24%/6.98%

41
F1/P/R 10.55%/20.17%/7.14%

42
F1/P/R 10.62%/19.84%/7.25%

43
F1/P/R 10.72%/19.51%/7.39%

44
F1/P/R 10.75%/19.11%/7.48%

45
F1/P/R 10.93%/18.97%/7.67%

46
F1/P/R 11.20%/18.97%/7.95%

47
F1/P/R 10.95%/18.04%/7.86%

48
F1/P/R 11.06%/17.64%/8.05%

49
F1/P/R 10.94%/16.91%/8.08%

50
F1/P/R 10.89%/16.25%/8.19%

51
F1/P/R 11.07%/15.85%/8.51%

52
F1/P/R 11.05%/15.18%/8.69%

53
F1/P/R 10.91%/14.31%/8.82%

54
F1/P/R 10.99%/13.58%/9.23%

55
F1/P/R 10.55%/12.25%/9.26%

56
F1/P/R 10.36%/11.25%/9.60%

57
F1/P/R 10.07%/9.92%/10.23%

58
F1/P/R 8.99%/7.86%/10.52%



In [111]:
date = '2014-12-17'
df_test_final1 = DataFrame()
while date != '2014-12-18':
    df = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')
    df_test_final1 = pd.concat([df_test_final1, df], axis = 0)
    date = getOffsetDate(date, 1)

df_test_final1.to_csv(PATH_OF_OFFLINE + 'validTestSet' + '.csv', mode = 'w', index = False)
df_test_final1 = df_test_final1.drop(['user_id', 'item_id', 'item_category'], axis = 1)
print(df_test_final1)

         uid_behav_cnt  uid_buy_cnt  user_buy_cnt  user_cnt  item_buy_cnt  \
0                    2          0.0           1.0        32           0.0   
1                    2          0.0           1.0        32           0.0   
2                    2          0.0           1.0        32           0.0   
3                    3          0.0           1.0        32           0.0   
4                    3          0.0           1.0        32           0.0   
5                    2          0.0           1.0        32           0.0   
6                    2          0.0           1.0        32           0.0   
7                    8          1.0           1.0        32           1.0   
8                    3          0.0           1.0        32           0.0   
9                    3          0.0           1.0        32           0.0   
10                   2          0.0           1.0        32           0.0   
11                   2          0.0           0.0        61           0.0   

In [112]:
testForBestDesc(models, df_test_final1)

F1/P/R 12.47%/17.53%/9.67%

50
F1/P/R 12.47%/17.53%/9.67%

F1/P/R 12.36%/16.74%/9.79%

51
F1/P/R 12.36%/16.74%/9.79%

F1/P/R 12.27%/15.96%/9.97%

52
F1/P/R 12.27%/15.96%/9.97%

F1/P/R 12.14%/14.98%/10.20%

53
F1/P/R 12.14%/14.98%/10.20%

F1/P/R 12.17%/14.32%/10.58%

54
F1/P/R 12.17%/14.32%/10.58%

F1/P/R 11.88%/13.21%/10.79%

55
F1/P/R 11.88%/13.21%/10.79%

F1/P/R 11.55%/12.03%/11.10%

56
F1/P/R 11.55%/12.03%/11.10%

F1/P/R 10.91%/10.40%/11.47%

57
F1/P/R 10.91%/10.40%/11.47%

F1/P/R 10.46%/8.96%/12.56%

58
F1/P/R 10.46%/8.96%/12.56%

F1/P/R 8.30%/6.02%/13.37%

59
F1/P/R 8.30%/6.02%/13.37%



In [117]:
## 提交
df_sub = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + '2014-12-19_' + '4days.csv')
itemSet = pd.read_csv('F:/codeGit/dataset/tianchi_mobile/tianchi_fresh_comp_train_item.csv', usecols = ['item_id'])
itemSet = itemSet['item_id'].apply(lambda x: str(x))
df_sub = df_sub[df_sub['item_id'].isin(itemSet)]

df_sub = df_sub[df_sub.last_cart_h > 0]
uidList = df_sub[['user_id', 'item_id']]
print len(df_sub)
X_sub = df_sub.drop(['user_id', 'item_id', 'item_category'], axis = 1).values
y_sub_pred = baggingModels(models, X_sub, 55).values
predList = pd.DataFrame(y_sub_pred, index = uidList.index, columns = ['label'])
print predList
uidList['label'] = predList
print uidList
uidList = uidList[uidList['label'] == 1]

uidList[['user_id', 'item_id']].to_csv(PATH_OF_DATAOUT + 'tianchi_mobile_recommendation_predict.csv', mode = 'w', index = False)

1831
         label
1207         1
1215         1
1223         0
2905         0
3000         0
3018         1
3024         0
3039         0
3042         0
5295         1
5828         0
5868         0
10346        0
10359        0
10379        0
10389        0
10396        0
10399        0
10407        0
12093        0
12453        0
13015        0
13026        0
13790        1
13875        1
17076        0
17621        0
17721        0
17734        0
17743        0
...        ...
1209604      0
1209631      0
1209653      0
1209654      0
1209663      0
1209674      0
1209679      0
1209680      0
1209699      0
1209741      0
1209751      0
1212805      0
1212809      0
1212826      0
1212847      0
1212918      0
1212931      0
1212953      0
1212956      0
1212958      0
1212987      0
1213001      1
1213005      0
1213007      0
1215015      1
1215177      0
1215215      0
1215217      0
1215440      0
1215566      0

[1831 rows x 1 columns]
           user_id    item_id  label
120

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
