In [8]:
#!/usr/bin/env python
# coding=utf-8
# -------- import basic package --------
import pdb
import math
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import datetime as dt
import warnings
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

import washer.utils.path.pathHandle as ph
import washer.utils.store as store
from washer.sample.feature import FeatSampler
from washer.repairer.isolatedRepairer import GeneralRepairer

from washer.demo.tianchi.mobile.system.utils.preProcess import *
from washer.demo.tianchi.mobile.system.utils.selectFeature import *

PATH_OFFLINE = "F:/codeGit/my project/python/dataset/tianchi/offline/"
PATH_ONLINE = "F:/codeGit/my project/python/dataset/tianchi/online/"
PATH_OF_DATAOUT = "F:/codeGit/my project/python/dataset/tianchi/submission/"
PATH_MODEL = "F:/codeGit/my project/python/dataset/tianchi/model/"

In [4]:
def getOffsetDate(date, offset):
    if offset > 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') + dt.timedelta(offset)).strftime('%Y-%m-%d')
    elif offset < 0:
        date_next = (dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(offset * -1)).strftime('%Y-%m-%d')    
    return date_next

In [2]:
def processDataSet(df, isPolyFeatNeed = False):
	df = sampleBalance(df)
	df = preprocess(df, normlized = True)
	return df

In [3]:
def trainModel(X, y, model, saveMode = False):
	model = trainWithKfold(X, y, model, le)
	if saveMode == True:
		store.saveModelToPickle(PATH_MODEL + 'model_mnb_layer1.pkl', model)
	return model

In [89]:
def trainWithKfold(model, df_train, df_cv):
	print("trainig model...")
	nModel = 5
	X_test, y_test, le = transform_dataset(df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1))
	X_train, y_train, le1 = transform_dataset(df_train.drop(['user_id', 'item_id', 'item_category'], axis = 1))
	paras = [[] for i in range(3)]
	models = []
	for i in range(nModel):
		model_new = copy.deepcopy(model)
		y_pred = model_new.fit(X_train, y_train).predict(X_test)
		models.append(model_new)
		pred_labels = le.inverse_transform(y_pred)
		F1, P ,R = calc_F1(y_test, pred_labels)
		
		paras[0].append(F1)
		paras[1].append(P)
		paras[2].append(R)
	print 'mean: F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(sum(paras[0]) / nModel, sum(paras[1]) / nModel, sum(paras[2]) / nModel)
	return models[getMaxIndex(paras[0])]

In [84]:
def selectSamplesByBagging(df, ratioOnPos, ratioP2N):
    colsName = df.columns
    df = pd.DataFrame(df.values, index = range(len(df.index)))
    df.columns = colsName
    
    df_pos = df[df.label == 1]
    list_pos = Series(df.index.isin(df_pos.index))
    list_neg = list_pos.apply(lambda x: not x)
    df_neg = df[list_neg]
 
    len_pos = len(df_pos)  
    len_bag = int(ratioOnPos * len_pos)
    df_bag = pd.concat([df_pos.sample(len_bag), df_neg.sample(int(len_bag / ratioP2N))], axis = 0)
    return df_bag

In [92]:
def calc_F1(y ,y_pred):

	y = DataFrame(y, columns = ['label'])
	y_pred = DataFrame(y_pred, columns = ['label'])

	pos_yp = y_pred[y_pred.label == 1]
	neg_yp = y[y_pred.label == 0]
	pos_y = y[y.label == 1]
	neg_y = y[y.label == 0]
	
	## calc TP and TN
	len_yp = len(y_pred); len_y = len(y)
	
	TP = sum(pos_yp.index.isin(pos_y.index))
	FP = len(pos_yp) - TP
	TN = sum(neg_yp.index.isin(neg_y.index))
	FN = len(neg_yp) - TN

	## calc R, P, F1
	if (len(pos_yp) == 0 or len(pos_y) == 0 or TP == 0):
		P = 0; R = 0; F1 = 0
	else:
		R = 1.0 * TP / len(pos_yp) * 100
		P = 1.0 * TP / len(pos_y) * 100
		F1 = (2.0 * R * P) / (R + P)
		# the two following formulation are also right
	# R = 1.0 * TP / (TP + FP) * 100
	# P = 1.0 * TP / (TP + FN) * 100 
	return F1, P, R

In [73]:
date = '2014-11-22'
df = DataFrame()
for i in range(5):
    df = pd.concat([df, pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')], axis = 0)
    date = getOffsetDate(date, 1)
print(df)

           user_id    item_id  item_category  uid_behav_cnt  uid_buy_cnt  \
0              492   32957845         8877.0            1.0          0.0   
1              492   59758671         9363.0            2.0          0.0   
2              492   76093985        10544.0            1.0          0.0   
3              492  110036513        12919.0            2.0          0.0   
4              492  136443352         4676.0            2.0          0.0   
5              492  137575077        11311.0            1.0          0.0   
6              492  168653170         4676.0            1.0          0.0   
7              492  176404510          280.0            1.0          0.0   
8              492  178412255          280.0            2.0          0.0   
9              492  199638541         6000.0            3.0          0.0   
10             492  204458711         8877.0            3.0          0.0   
11             492  213642652         7094.0            2.0          0.0   
12          

In [138]:
## 分割数据集为易预测集与难预测集
df_easySet = df[df_train.last_cart_h > 0]
list_easySet = Series(df_train.index.isin(df_easySet.index))
# list_hardSet = list_easySet.apply(lambda x: not x)
# df_hardSet = df[list_hardSet]



In [146]:
## 先训练易预测集
df_package = divideDataset(df_easySet, ratioList = [7, 3])
df_train = df_package[0]
df_cv = df_package[1]
df_cv = df_cv.drop(['user_id', 'item_id', 'item_category'], axis = 1)

# 训练单模型
df_bag = selectSamplesByBagging(df_train, 0.3, 1)
df_bag_test = selectSamplesByBagging(df_train, 0.3, 0.1)
print df_bag.iloc[:5]

# 随机森林
model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
model = trainWithKfold(model, df_bag, df_bag_test)

# 测试
X_test, y_test, le = transform_dataset(df_cv)
y_pred = model.predict(X_test)
F1, P, R = calc_F1(y_test, y_pred)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

           user_id      item_id  item_category  uid_behav_cnt  uid_buy_cnt  \
9304   140338604.0  289867639.0         6321.0            5.0          0.0   
62364   43616207.0   91982010.0        13230.0           13.0          0.0   
21711   23008465.0  317274106.0        14079.0            3.0          1.0   
32627  129312831.0  208397828.0         4963.0            2.0          0.0   
44490   47067239.0   98280254.0         6054.0            6.0          0.0   

       user_buy_cnt  user_cnt  item_buy_cnt  item_cnt  category_buy_cnt  \
9304            3.0     207.0           0.0       5.0              10.0   
62364           0.0     253.0           0.0      13.0             202.0   
21711           0.0      77.0           0.0       9.0             129.0   
32627          15.0     440.0           0.0       2.0              11.0   
44490           2.0     247.0           0.0       6.0              81.0   

       ...    item_buy_ratio  category_buy_ratio  item_rank  user_rank  \
9304  

In [148]:
## 训练多模型
models = []
nModel = 10
for i in range(nModel):
    df_bag = selectSamplesByBagging(df_train, 0.3, 1)
    y_pred = selectSamplesByBagging(df_train, 0.3, 0.1)
    print sum(y_pred.label), len(y_pred)
    model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=24)
    model = trainWithKfold(model, df_bag, df_bag_test)
    models.append(model)

478.0 5258
trainig model...
mean: F1/P/R 34.05%/81.59%/21.51%

478.0 5258
trainig model...
mean: F1/P/R 33.20%/79.08%/21.01%

478.0 5258
trainig model...
mean: F1/P/R 33.67%/80.33%/21.30%

478.0 5258
trainig model...
mean: F1/P/R 32.83%/79.92%/20.66%

478.0 5258
trainig model...
mean: F1/P/R 34.19%/80.75%/21.69%

478.0 5258
trainig model...
mean: F1/P/R 32.46%/81.80%/20.25%

478.0 5258
trainig model...
mean: F1/P/R 32.45%/86.82%/19.95%

478.0 5258
trainig model...
mean: F1/P/R 33.22%/81.59%/20.86%

478.0 5258
trainig model...
mean: F1/P/R 31.77%/79.29%/19.86%

478.0 5258
trainig model...
mean: F1/P/R 35.72%/81.38%/22.88%



In [163]:
# 测试bagging的效果
X_test_easy, y_test_easy, le = transform_dataset(df_cv)
# bagging实现esembel models
def baggingModels(models, X):
    y_pred = 0
    decs = nModel - 1
    for i in range(nModel):
        y_pred = y_pred + models[i].predict(X)
    y_pred = DataFrame(y_pred, columns = ['label'])
    y_pred.label = y_pred.label.apply(lambda x: 1 if x > decs else 0)
    return y_pred.label

print sum(y_test_easy), len(X_test_easy)
y_pred = baggingModels(models, X_test_easy)
F1, P, R = calc_F1(y_test_easy, y_pred)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

722 26903
F1/P/R 20.70%/42.94%/13.64%



In [139]:
## 查看难预测集
df_package = divideDataset(df_hardSet, ratioList = [7, 3])
df_train_hard = df_package[0]
df_cv_hard = df_package[1]
df_cv_hard = df_cv_hard.drop(['user_id', 'item_id', 'item_category'], axis = 1)
print sum(df_cv_hard.label), len(df_cv_hard)

1373.0 1555633
trainig model...
mean: F1/P/R 46.36%/86.16%/31.71%

F1/P/R 0.79%/80.26%/0.40%



很明显难测试集中的正样本比例还不到0.1%， 对提升精度并没有太大的作用，故将难数据集的预测结果全部设置为0

In [164]:
date = '2014-12-15'
df_test = DataFrame()
for i in range(2):
    df_test = pd.concat([df_test, pd.read_csv(PATH_OFFLINE + 'statisFeat_' + date + '_' + '4days.csv')], axis = 0)
    date = getOffsetDate(date, 1)

F1/P/R 4.38%/13.03%/2.63%



In [167]:
df_test_1 = df_test[df_test.last_cart_h > 0]
df_test_2 = df_test[df_test.last_cart_h <= 0]
# df_test_1 = df_test_1.drop(['user_id', 'item_id', 'item_category'], axis = 1)
# df['label'].apply(lambda x: int(x))
X_test_1, y_test_1, le = transform_dataset(df_test_1)
y_pred_1 = baggingModels(models, X_test_1)

X_test_2, y_test_2, le = transform_dataset(df_test_2)

y_test = np.zeros(len(y_test_1) + len(y_test_2))
y_pred = np.zeros(len(y_test_1) + len(y_test_2))

y_test[: len(y_test_1)] = y_test_1
y_test[len(y_test_1):] = y_test_2

y_pred[: len(y_pred_1)] = y_pred_1
y_pred[len(y_pred_1):] = 0

F1, P, R = calc_F1(y_test, y_pred)
print 'F1/P/R %.2f%%/%.2f%%/%.2f%%\n' %(F1, P, R)

F1/P/R 8.58%/9.07%/8.14%



In [175]:
## 提交
df_sub = pd.read_csv(PATH_OFFLINE + 'statisFeat_' + '2014-12-19_' + '4days.csv')
itemSet = pd.read_csv('F:/codeGit/dataset/tianchi_mobile/tianchi_fresh_comp_train_item.csv', usecols = ['item_id'])
itemSet = itemSet['item_id'].apply(lambda x: str(x))
df_sub = df_sub[df_sub['item_id'].isin(itemSet)]

df_sub = df_sub[df_sub.last_cart_h > 0]
uidList = df_sub[['user_id', 'item_id']]
print len(df_sub)
X_sub = df_sub.drop(['user_id', 'item_id', 'item_category'], axis = 1).values
y_sub_pred = baggingModels(models, X_sub).values
predList = pd.DataFrame(y_sub_pred, index = uidList.index, columns = ['label'])
print predList
uidList['label'] = predList
print uidList
uidList = uidList[uidList['label'] == 1]

uidList[['user_id', 'item_id']].to_csv(PATH_OF_DATAOUT + 'tianchi_mobile_recommendation_predict.csv', mode = 'w', index = False)

1831
         label
1207         0
1215         0
1223         0
2905         0
3000         0
3018         1
3024         0
3039         0
3042         0
5295         1
5828         0
5868         0
10346        0
10359        0
10379        0
10389        0
10396        0
10399        0
10407        0
12093        0
12453        0
13015        0
13026        0
13790        1
13875        0
17076        0
17621        0
17721        0
17734        0
17743        0
...        ...
1209604      0
1209631      0
1209653      0
1209654      0
1209663      0
1209674      0
1209679      0
1209680      0
1209699      0
1209741      0
1209751      0
1212805      0
1212809      0
1212826      0
1212847      0
1212918      1
1212931      0
1212953      1
1212956      1
1212958      0
1212987      0
1213001      0
1213005      0
1213007      0
1215015      1
1215177      0
1215215      0
1215217      0
1215440      0
1215566      0

[1831 rows x 1 columns]
           user_id    item_id  label
120

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
