In [2]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import linear_model

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

## analysis and modeling 1

In [4]:
train      = pd.read_csv("data/train.csv")
test       = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/Sample_Submission_Tm9Lura.txt")

In [5]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [6]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [7]:
dataframe = pd.concat([train, test])
dataframe.shape

(783667, 12)

In [9]:
dataframe.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,,,P00069042,8370.0,2,1000001
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200.0,2,1000001
2,0-17,A,F,0,10,12,,,P00087842,1422.0,2,1000001
3,0-17,A,F,0,10,12,14.0,,P00085442,1057.0,2,1000001
4,55+,C,M,0,16,8,,,P00285442,7969.0,4+,1000002


In [8]:
dataframe.dtypes

Age                            object
City_Category                  object
Gender                         object
Marital_Status                  int64
Occupation                      int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Product_ID                     object
Purchase                      float64
Stay_In_Current_City_Years     object
User_ID                         int64
dtype: object

In [12]:
target = np.array(dataframe.Purchase)

In [13]:
dataframe.drop(["Purchase"], axis=1, inplace=True)

In [17]:
dataframe.dtypes

Age                           object
City_Category                 object
Gender                        object
Marital_Status                object
Occupation                    object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Product_ID                    object
Stay_In_Current_City_Years    object
User_ID                       object
dtype: object

In [14]:
#convert all columns to string and apply labelencoder
dataframe = dataframe.applymap(str)
df = dataframe.copy()
dataset = np.array(dataframe)
for i in range(dataframe.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(dataset[:,i]))
    dataset[:, i] = lbl.transform(dataset[:, i])

In [16]:
dataset.shape

(783667, 11)

In [18]:
dataset[0]

array([0, 0, 0, 0, 2, 13, 17, 15, 684, 2, 0], dtype=object)

In [19]:
dataset = dataset.astype(int)

In [20]:
# xgboost 1
params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 6
params["nthread"] = 6
#params["gamma"] = 1
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 3000

In [22]:
params.items()

dict_items([('min_child_weight', 10), ('subsample', 0.7), ('colsample_bytree', 0.7), ('scale_pos_weight', 0.8), ('silent', 1), ('max_depth', 6), ('nthread', 6), ('objective', 'reg:linear'), ('eta', 0.1), ('base_score', 1800), ('eval_metric', 'rmse'), ('seed', 0)])

In [23]:
xgtrain = xgb.DMatrix(dataset[:train.shape[0],:], label=target[:train.shape[0]])
watchlist = [(xgtrain, 'train')]
model_1_xgboost = xgb.train(plst, xgtrain, num_rounds)

In [25]:
model_1_predict = model_1_xgboost.predict(xgb.DMatrix(dataset[train.shape[0]:,:]))

In [30]:
model_1_predict[model_1_predict<0] = 25
submission.Purchase = model_1_predict
#submission.to_csv("../submission/submit_13.csv", index=False)

## analysis and modeling 2

In [51]:
res = train['User_ID'].value_counts().items()

In [52]:
dict_IDCount = {}
for i in res:
    dict_IDCount[i[0]] = i[1]

In [59]:
dict_IDCount

{1001680: 1026,
 1004277: 979,
 1001941: 898,
 1001181: 862,
 1000889: 823,
 1003618: 767,
 1001150: 752,
 1001015: 740,
 1005795: 729,
 1005831: 727,
 1002909: 718,
 1001449: 714,
 1002063: 709,
 1004344: 705,
 1003391: 698,
 1003841: 698,
 1000424: 694,
 1004510: 691,
 1001980: 685,
 1001088: 680,
 1004227: 676,
 1003808: 671,
 1004508: 651,
 1000549: 632,
 1003224: 622,
 1003539: 617,
 1005367: 612,
 1001285: 606,
 1004543: 588,
 1003824: 584,
 1004448: 575,
 1001010: 575,
 1005643: 573,
 1000752: 572,
 1001448: 570,
 1003032: 568,
 1004447: 560,
 1000678: 559,
 1004064: 558,
 1001605: 558,
 1000524: 558,
 1005954: 557,
 1001912: 549,
 1000550: 549,
 1003526: 547,
 1004725: 544,
 1002116: 538,
 1003311: 537,
 1001447: 533,
 1003778: 533,
 1003507: 530,
 1001880: 529,
 1000881: 527,
 1004808: 524,
 1001019: 517,
 1006036: 514,
 1006016: 504,
 1003292: 502,
 1004425: 501,
 1000710: 500,
 1004647: 499,
 1005812: 499,
 1004979: 498,
 1000869: 497,
 1002106: 497,
 1004312: 496,
 1002820:

In [63]:
train['User_ID'][0]

1000001

In [61]:
dict_IDCount[train['User_ID']]

35

In [71]:
train['ID_count'] = train['User_ID'].apply(lambda x:dict_IDCount[x])

In [72]:
train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,ID_count
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370,35
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200,35
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422,35
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057,35
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969,77
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227,29
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215,14
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854,14
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686,14
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871,106


In [74]:
train['Product_Category_1'].value_counts()

5     150933
1     140378
8     113925
11     24287
2      23864
6      20466
3      20213
4      11753
16      9828
15      6290
13      5549
10      5125
12      3947
7       3721
18      3125
20      2550
19      1603
14      1523
17       578
9        410
Name: Product_Category_1, dtype: int64

In [75]:
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train.loc[:,'City_Category']))
train.loc[:, 'City_Category'] = lbl.transform(train.loc[:, 'City_Category'])

In [76]:
train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,ID_count
0,1000001,P00069042,F,0-17,10,0,2,0,3,,,8370,35
1,1000001,P00248942,F,0-17,10,0,2,0,1,6.0,14.0,15200,35
2,1000001,P00087842,F,0-17,10,0,2,0,12,,,1422,35
3,1000001,P00085442,F,0-17,10,0,2,0,12,14.0,,1057,35
4,1000002,P00285442,M,55+,16,2,4+,0,8,,,7969,77
5,1000003,P00193542,M,26-35,15,0,3,0,1,2.0,,15227,29
6,1000004,P00184942,M,46-50,7,1,2,1,1,8.0,17.0,19215,14
7,1000004,P00346142,M,46-50,7,1,2,1,1,15.0,,15854,14
8,1000004,P0097242,M,46-50,7,1,2,1,1,16.0,,15686,14
9,1000005,P00274942,M,26-35,20,0,1,1,8,,,7871,106


In [77]:
train_City_cate = pd.get_dummies(train['City_Category'])

In [81]:
train = pd.concat([train, train_City_cate], axis=1)

In [82]:
train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,ID_count,0,1,2
0,1000001,P00069042,F,0-17,10,0,2,0,3,,,8370,35,1,0,0
1,1000001,P00248942,F,0-17,10,0,2,0,1,6.0,14.0,15200,35,1,0,0
2,1000001,P00087842,F,0-17,10,0,2,0,12,,,1422,35,1,0,0
3,1000001,P00085442,F,0-17,10,0,2,0,12,14.0,,1057,35,1,0,0
4,1000002,P00285442,M,55+,16,2,4+,0,8,,,7969,77,0,0,1
5,1000003,P00193542,M,26-35,15,0,3,0,1,2.0,,15227,29,1,0,0
6,1000004,P00184942,M,46-50,7,1,2,1,1,8.0,17.0,19215,14,0,1,0
7,1000004,P00346142,M,46-50,7,1,2,1,1,15.0,,15854,14,0,1,0
8,1000004,P0097242,M,46-50,7,1,2,1,1,16.0,,15686,14,0,1,0
9,1000005,P00274942,M,26-35,20,0,1,1,8,,,7871,106,1,0,0


In [None]:
#model_xgb_1 = XGBoost(X_train,y,X_test,cv=5,objective="reg:linear",nrounds=500,max.depth=10,eta=0.1,colsample_bytree=0.5,seed=235,metric="rmse",importance=1)
