In [2]:
import numpy as np 
import pandas as pd
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import xlearn as xl

## Data transforming

Because the original data (both train and test) is too big, we just use part of them. And also due to the format's requirement of the input data of fmm model (the input data should be the libffm format), we have to transform our dataset from csv into libffm.  
  
For LR and fm model, at first, we try to use csv. So we have to transform all string type data into number, and the best way is using one hot encodeing. But after ont hot encoding, the dataset has been expanded from 22 columns into more than 50000 columns. According to the guide of xlearn, users can also give a libffm file to LR and FM task, because xLearn will treat this data as libsvm format. So we decided to use libffm to fit all th models.

In [4]:
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,2,15706,320,50,1722,0,35,-1,79
1,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15704,320,50,1722,0,35,100084,79
2,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15704,320,50,1722,0,35,100084,79
3,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15706,320,50,1722,0,35,100084,79
4,0,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,18993,320,50,2161,0,35,-1,157


In [57]:
train['click'].value_counts()

0    98214
1    20792
Name: click, dtype: int64

In [58]:
test = pd.read_csv('data/test.csv')
test.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,1005,0,85f751fd,c4e18dd6,50e219e0,1779deee,2347f47a,f95efa07,a99f214a,...,1,0,19251,320,50,2201,3,35,-1,43
1,0,1005,1,5b4d2eda,16a36ef3,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,16208,320,50,1800,3,167,100075,23
2,0,1005,0,d6137915,bb1ef334,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,19772,320,50,2227,0,687,100075,48
3,0,1005,1,6bdcda77,b0a0505f,72722551,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,2,20596,320,50,2161,0,35,-1,157
4,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15701,320,50,1722,0,35,-1,79


In [59]:
test['click'].value_counts()

0    113569
1     23873
Name: click, dtype: int64

So we can see that the dataset is really imbalanced. And we have to do resampling.

#### Data resample

We choose downsampling to resample the data. And because in this situation, 'click' equaling to 0 is much more than 'click' equaling to 1, we just randomly select out 'not clicked' records with same number of 'clicked' records.

In [70]:
#dataset is the original dataset
#num_0 is the number of records with 'click' = 0
#num_0 is the number of records with 'click' = 1

def downsampling(dataset, num_0, num_1):
    ran = range(0, num_0) 
    #implement randomly choose
    nums = random.sample(ran, num_1)    
    sub_data = pd.DataFrame()
    
    #add all selected records into sub_dataset
    for i in range(0, num_1):
        n = dataset.iloc[nums[i], :]
        sub_data = sub_data.append(n)
      
    return sub_data

In [71]:
sub_train = train[train['click']==0]

# select out 20792 records from 98214 train records whose 'click' equals to 0
sub_train = downsampling(sub_train, 98214, 20792)

In [72]:
sub_train.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,app_category,...,banner_pos,click,device_conn_type,device_id,device_ip,device_model,device_type,site_category,site_domain,site_id
50391,1005.0,15699.0,320.0,50.0,1722.0,0.0,35.0,100084.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,de23f5f7,8a4875bd,1.0,28905ebd,f3845767,1fbe01fe
35652,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,75bb1b58,0153a639,1.0,28905ebd,f3845767,1fbe01fe
53606,1005.0,15704.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,478ce7c1,0eb711ec,1.0,28905ebd,f3845767,1fbe01fe
52901,1005.0,17753.0,320.0,50.0,1993.0,2.0,1063.0,100084.0,33.0,07d7df22,...,1.0,0.0,0.0,a99f214a,b81bd63a,7fdd04d2,1.0,f028772b,98572c79,d9750ee7
34423,1005.0,6616.0,320.0,50.0,576.0,2.0,35.0,-1.0,32.0,07d7df22,...,1.0,0.0,2.0,a99f214a,48764430,1c6a881d,1.0,f028772b,0dde25ec,17caea14


In [73]:
sub_train.shape

(20792, 22)

In [74]:
sub_test = test[test['click']==0]

# select out 23873 records from 113569 test records whose 'click' equals to 0
sub_test = downsampling(sub_test, 113569, 23873)

In [76]:
sub_test.shape

(23873, 22)

In [137]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

In [138]:
#concatenate the part that 'click' = 1 with the selected part the 'click' = 0
new_train = pd.concat([sub_train, train[train['click']==1]], axis = 0)
new_test = pd.concat([sub_test, test[test['click']==1]], axis = 0)

#reset the index of concatenated dataset
new_train = new_train.reset_index(drop=True)
new_test = new_test.reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [101]:
new_train.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,app_category,...,banner_pos,click,device_conn_type,device_id,device_ip,device_model,device_type,site_category,site_domain,site_id
0,1005.0,15699.0,320.0,50.0,1722.0,0.0,35.0,100084.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,de23f5f7,8a4875bd,1.0,28905ebd,f3845767,1fbe01fe
1,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,75bb1b58,0153a639,1.0,28905ebd,f3845767,1fbe01fe
2,1005.0,15704.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,07d7df22,...,0.0,0.0,0.0,a99f214a,478ce7c1,0eb711ec,1.0,28905ebd,f3845767,1fbe01fe
3,1005.0,17753.0,320.0,50.0,1993.0,2.0,1063.0,100084.0,33.0,07d7df22,...,1.0,0.0,0.0,a99f214a,b81bd63a,7fdd04d2,1.0,f028772b,98572c79,d9750ee7
4,1005.0,6616.0,320.0,50.0,576.0,2.0,35.0,-1.0,32.0,07d7df22,...,1.0,0.0,2.0,a99f214a,48764430,1c6a881d,1.0,f028772b,0dde25ec,17caea14


In [3]:
# sub = train[['site_id','site_domain','site_category','app_id','app_domain','app_category','device_id','device_ip','device_model']].astype(str)
# sub = pd.get_dummies(sub)

# train = train[['click','C1','banner_pos','device_type','device_conn_type','C14','C15','C16','C17','C18','C19','C20','C21']]
# df = pd.merge(train, sub, left_index = True, right_index = True, how = 'inner')
# df.head()

#### Data transform

In [139]:
class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0

        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y != None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind ==  'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})


In [140]:
#transform train dataset into libffm
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(new_train, y='click')

ffm_train_data.head()

0    0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:478...
1    0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:478...
2    0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...
3    0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...
4    0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...
dtype: object

In [142]:
type(ffm_train_data)

pandas.core.series.Series

So we have get a series with right libffm format. And according to the example given by xlearn, we have to put this series into txt and then bring it into models. To get the txt, we first transform it into dataframe and then output it as txt.

In [143]:
dict_ffm = {'numbers':ffm_train_data.values}
df = pd.DataFrame(dict_ffm)
df.head()

Unnamed: 0,numbers
0,0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:478...
1,0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:478...
2,0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...
3,0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...
4,0.0 9:727:1 10:744:1 11:786:1 14:1310:1 15:479...


In [144]:
#transform train dataset into libffm
ffm_test = FFMFormatPandas()
ffm_test_data = ffm_test.fit_transform(new_test, y='click')

dict_ffm = {'numbers':ffm_test_data.values}
df = pd.DataFrame(dict_ffm)
df.head()

Unnamed: 0,numbers
0,0.0 9:739:1 10:757:1 11:799:1 14:1392:1 15:571...
1,0.0 9:740:1 10:757:1 11:800:1 14:1393:1 15:571...
2,0.0 9:740:1 10:758:1 11:801:1 14:1392:1 15:571...
3,0.0 9:739:1 10:757:1 11:799:1 14:1392:1 15:572...
4,0.0 9:740:1 10:759:1 11:802:1 14:1392:1 15:572...


In [145]:
# output data
df.to_csv('txt/ffm_dataset.txt', sep=' ', index=False, header = False)
df.to_csv('txt/ffm_dataset_test.txt', sep=' ', index=False, header = False)

## Building Models

There are some important parameters in xlearn's model:  
param:
0.task:  
The type of classification  
1.learning rate :  
learning rate is used in gradient descent which decides how the weight will be changed in each iteration  
2.regular lambda :  
lambda is the coeffcient of the penalty item used for regularization  
3.metrics  
By default, xLearn will calculate the validation loss in each epoch, while users can also set different evaluation metrics like 'acc' or 'f1'  
4.opt:  
Optimization methods which is different algorithm of gradient descent. User can choose 'sgd', 'adagrad', and 'ftrl'

### Logistic regression

Logistic regression is a generalized linear regression analysis model. It uses sigmoid function to calculate the result, so the output will only between 0 and 1.  
  
Function: F(X) = 1/(1+e^(-X))  
X = θ_0+θ_1\*x_1+ θ_2\*x_2+⋯+θ_n\*x_n+ε  
  
The cost function of logistic regression is likelihood function:  
L(θ)= ∑( y_i\*log⁡(h(x_i))+(1-y_i)\*log⁡(1-h(x_i) ) )  
Using likelihood function is because the model wants the output's result can be as confident as possible which the probabilty calaulated by model can be close to 0 or 1.  
And logistic regression uses gradient descent to modify the weight of x iteratively and find the best weights to get the minimum cost.

In [6]:
lr_model = xl.create_linear()
lr_model.setTrain('txt/ffm_dataset2.txt') 

# using validation to do the prediction on tets data and assess the model's performance
lr_model.setValidate("txt/ffm_dataset_test2.txt")

param = {'task':'binary', 'lr':0.2, 
         'lambda':0.2, 'metric':'f1', 
         'opt':'sgd'}

lr_model.fit(param, "lr_model.out")

## Poly2

The difference between logistic regression and Poly2 is that poly2 add the interaction of the each two variables. So the X in the function has turn into:  

</br >
<center>
<img src="graph/ploy2.jpg" width=500/>
</center>

Because poly2 can't use libffm and also one hot encoding is not available. We use the function to transform all the date in dataset into numerical type. 

In [141]:
def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self

In [146]:
train_num = convert_obj_to_int(new_train)

# divide dataset into the label and predictors 
click_train = train_num['click']
del train_num['click']

#train_num only contain predictors now
train_num.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,device_type,app_category_int,app_domain_int,app_id_int,device_id_int,device_ip_int,device_model_int,site_category_int,site_domain_int,site_id_int
0,1005.0,15699.0,320.0,50.0,1722.0,0.0,35.0,100084.0,79.0,0.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,6160174059328367068,-2012623009723425444,1249503283306138516,4555943309868841283,-4418400357375527045
1,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,2463658418091148909,-69902942725993050,1249503283306138516,4555943309868841283,-4418400357375527045
2,1005.0,15704.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,8183077568972481863,-1764764894793384056,1249503283306138516,4555943309868841283,-4418400357375527045
3,1005.0,17753.0,320.0,50.0,1993.0,2.0,1063.0,100084.0,33.0,1.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,-4282693603111732547,8465295755224364953,1320242193175794360,5918072440415254731,6633535241123338761
4,1005.0,6616.0,320.0,50.0,576.0,2.0,35.0,-1.0,32.0,1.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,-8920077806506709557,590598139723905821,1320242193175794360,-3697132147017116839,8797310823573360583


In [147]:
test_num = convert_obj_to_int(new_test)
click_test = test_num['click']
del test_num['click']

In [148]:
test_num.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,device_type,app_category_int,app_domain_int,app_id_int,device_id_int,device_ip_int,device_model_int,site_category_int,site_domain_int,site_id_int
0,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,4456440312603579193,-3804804701342512922,1249503283306138516,4555943309868841283,-4418400357375527045
1,1010.0,21665.0,320.0,50.0,2493.0,3.0,35.0,-1.0,117.0,1.0,...,4.0,5359304147371990826,-7125963136635554412,825495052553219118,9169416056485873200,-4914715230751263860,-869410031517761026,-2043769323202121456,4308774104209053735,4018024504969277096
2,1005.0,20596.0,320.0,50.0,2161.0,0.0,35.0,-1.0,157.0,0.0,...,1.0,5359304147371990826,-767802628734560060,-4293323736873180314,581222763733122429,-206554157160546806,-525747299466633332,-2043769323202121456,4308774104209053735,4018024504969277096
3,1005.0,15705.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,1.0,5923414002617375826,-7125963136635554412,-5703471148722480956,581222763733122429,2489960224289329428,-7089590725792277321,1249503283306138516,4555943309868841283,-4418400357375527045
4,1005.0,21611.0,320.0,50.0,2480.0,3.0,297.0,100111.0,61.0,0.0,...,1.0,5359304147371990826,7093327047585047287,7423464397151507532,581222763733122429,-4275908480113433652,-7247449653446705119,-2043769323202121456,4308774104209053735,4018024504969277096


Then we use pacakage PolynomialFeatures to get only the interactions between each two predctors by setting parameter 'interaction_only' equal to 'True' and 'include_bias' equal to 'False'.

In [149]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interact_train = pd.DataFrame(poly.fit_transform(train_num))
interact_test = pd.DataFrame(poly.fit_transform(test_num))

In [150]:
# merge the predictors part and the interaction part
train_inter = pd.merge(train_num, interact_train, left_index = True, right_index = True, how = 'inner')
test_inter = pd.merge(test_num, interact_test, left_index = True, right_index = True, how = 'inner')

In [151]:
train_inter.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,221,222,223,224,225,226,227,228,229,230
0,1005.0,15699.0,320.0,50.0,1722.0,0.0,35.0,100084.0,79.0,0.0,...,-1.239811e+37,7.697158e+36,2.80654e+37,-2.721812e+37,-2.514779e+36,-9.169396e+36,8.892574e+36,5.692666e+36,-5.520806e+36,-2.012998e+37
1,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,-1.72217e+35,3.078349e+36,1.122429e+37,-1.088543e+37,-8.734396e+34,-3.1847379999999996e+35,3.0885919999999997e+35,5.692666e+36,-5.520806e+36,-2.012998e+37
2,1005.0,15704.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,-1.444121e+37,1.022478e+37,3.728164e+37,-3.615611e+37,-2.2050800000000002e+36,-8.040169e+36,7.797438e+36,5.692666e+36,-5.520806e+36,-2.012998e+37
3,1005.0,17753.0,320.0,50.0,1993.0,2.0,1063.0,100084.0,33.0,1.0,...,-3.625427e+37,-5.654193e+36,-2.534529e+37,-2.84094e+37,1.117624e+37,5.009823e+37,5.615484e+37,7.813289e+36,8.757873e+36,3.925774e+37
4,1005.0,6616.0,320.0,50.0,576.0,2.0,35.0,-1.0,32.0,1.0,...,-5.2681810000000003e+36,-1.177666e+37,3.297871e+37,-7.847269999999999e+37,7.797325999999999e+35,-2.1835190000000002e+36,5.195675e+36,-4.88111e+36,1.161458e+37,-3.252482e+37


In [170]:
test_inter.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,221,222,223,224,225,226,227,228,229,230
0,1005.0,15701.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,-1.695589e+37,5.568337e+36,2.0303289999999998e+37,-1.969034e+37,-4.754116e+36,-1.733447e+37,1.6811149999999999e+37,5.692666e+36,-5.520806e+36,-2.012998e+37
1,1010.0,21665.0,320.0,50.0,2493.0,3.0,35.0,-1.0,117.0,1.0,...,4.2729030000000004e+36,1.004454e+37,-2.11764e+37,-1.9747449999999999e+37,1.776874e+36,-3.7460910000000003e+36,-3.493311e+36,-8.806140000000001e+36,-8.211915e+36,1.731276e+37
2,1005.0,20596.0,320.0,50.0,2161.0,0.0,35.0,-1.0,157.0,0.0,...,1.0859529999999998e+35,4.22149e+35,-8.899951999999999e+35,-8.299396999999999e+35,1.074506e+36,-2.265326e+36,-2.112466e+36,-8.806140000000001e+36,-8.211915e+36,1.731276e+37
3,1005.0,15705.0,320.0,50.0,1722.0,0.0,35.0,-1.0,79.0,0.0,...,-1.7652799999999999e+37,3.111213e+36,1.134412e+37,-1.100164e+37,-8.858467e+36,-3.229977e+37,3.132465e+37,5.692666e+36,-5.520806e+36,-2.012998e+37
4,1005.0,21611.0,320.0,50.0,2480.0,3.0,297.0,100111.0,61.0,0.0,...,3.098943e+37,8.738971e+36,-1.8423919999999999e+37,-1.718071e+37,1.4812119999999999e+37,-3.1227619999999997e+37,-2.912043e+37,-8.806140000000001e+36,-8.211915e+36,1.731276e+37


After adding interactions between each predictors, we have 252 features which is too much and is quite possible for causing overfitting. So we decided to use PCA to deal our dataset.  
Before implementing PCA, we have to normalize our data first because the result of PCA is sensitive to the variance of each feature. (The first component contain the most of the information and thus has the highest varaince.)  
Normalization : x = (x-mean)/std

In [153]:
train_standard = train_inter.copy(deep = True)

#implement normalization
train_standard = train_standard.apply(lambda x : (x-np.mean(x))/np.std(x))

#add label the normalizaed dataset
train_standard['click'] = click_train
train_standard.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,222,223,224,225,226,227,228,229,230,click
0,0.015008,-0.546482,0.178477,-0.240623,-0.556485,-0.694477,-0.410177,1.291186,-0.075667,-0.507575,...,0.490803,1.056799,-1.102642,-0.203541,-0.262339,0.337401,0.107905,-0.214514,-0.793524,0.0
1,0.015008,-0.54586,0.178477,-0.240623,-0.556485,-0.694477,-0.410177,-0.774318,-0.075667,-0.507575,...,0.186789,0.422703,-0.447634,-0.025791,0.08472,-0.018303,0.107905,-0.214514,-0.793524,0.0
2,0.015008,-0.544926,0.178477,-0.240623,-0.556485,-0.694477,-0.410177,-0.774318,-0.075667,-0.507575,...,0.657173,1.403805,-1.461092,-0.180863,-0.21806,0.292019,0.107905,-0.214514,-0.793524,0.0
3,0.015008,0.092777,0.178477,-0.240623,0.136881,0.96082,3.300258,1.291186,-1.11081,1.926911,...,-0.387993,-0.954202,-1.150417,0.798987,2.061641,2.29592,0.235972,0.920608,1.513402,0.0
4,0.015008,-3.373351,0.178477,-0.240623,-3.488577,0.96082,-0.410177,-0.774318,-1.133314,1.926911,...,-0.790979,1.241793,-3.158161,0.0377,0.011588,0.184204,-0.530656,1.14771,-1.275004,0.0


In [156]:
test_standard = test_inter.copy(deep = True)
test_standard = test_standard.apply(lambda x : (x-np.mean(x))/np.std(x))
test_standard['click'] = click_test
test_standard.head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21,banner_pos,...,222,223,224,225,226,227,228,229,230,click
0,0.047469,-0.593251,0.142075,-0.256337,-0.600223,-0.819329,-0.510574,-0.825562,0.1189,-0.55137,...,0.369249,0.744125,-0.773516,-0.328519,-0.571831,0.635307,0.113687,-0.243505,-0.86048,0.0
1,5.770814,1.236781,0.142075,-0.256337,1.32053,1.596212,-0.510574,-0.825562,1.03162,1.763588,...,0.682369,-0.789618,-0.775767,0.164404,-0.06038,-0.175003,-0.709821,-0.443723,0.545349,0.0
2,0.047469,0.908763,0.142075,-0.256337,0.493435,-0.819329,-0.510574,-0.825562,1.992378,-0.55137,...,0.009262,-0.039513,-0.030092,0.111393,-0.004646,-0.119897,-0.709821,-0.443723,0.545349,0.0
3,0.047469,-0.592024,0.142075,-0.256337,-0.600223,-0.819329,-0.510574,-0.825562,0.1189,-0.55137,...,0.197368,0.412853,-0.431032,-0.638293,-1.135107,1.214512,0.113687,-0.243505,-0.86048,0.0
4,0.047469,1.220212,0.142075,-0.256337,1.288144,1.596212,0.363349,1.211621,-0.313441,-0.55137,...,0.591042,-0.687843,-0.674593,1.148232,-1.094753,-1.19773,-0.709821,-0.443723,0.545349,0.0


In [168]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA


df_x = train_standard.iloc[:,0:252]
df_y = train_standard.iloc[:,252:253]

test_x = train_standard.iloc[:,0:252]
test_y = train_standard.iloc[:,252:253]

#  reduce the dimention from 252 into 22 so we will get 22 components
estimator = PCA(n_components=22)

X_pca=estimator.fit_transform(df_x)
test_X_pca = estimator.fit_transform(test_x)

# C is the coeffcient of the penalty item.
lr = LogisticRegression(fit_intercept = True, C = 1).fit(X_pca,df_y)

# calculate the accuracy of model
lr.score(test_X_pca,test_y)

  y = column_or_1d(y, warn=True)


0.6085513659099654

### FM model

FM solves the problem of considering pairwise feature interactions and sparse matrix. It allows us to train, based on reliable information (latent features) from every pairwise combination of features in the model. It will factorize the original data matrix into two latent factor matrics with lower dimensions. So the weight of the interaction in Poly2 has turned into the product of latent factors.  
</br >
<center>
<img src="graph/fm.jpg"width=500/>
</center>  
The number of latent factors in latent factor matrics can dedided by ourselves when building the model. And each data in original matrix is the dot product of the relevant latent fators in two latent factor matrics. FM uses gradien descent to modify the values in latent factor matrics and find the best group of values to get the minmum cost of model. The cost fucntion of FM usually is log loss.

In [17]:
fm_model = xl.create_fm()               
fm_model.setTrain('txt/ffm_dataset2.txt') 
fm_model.setValidate("txt/ffm_dataset_test2.txt")
param = {'task':'binary', 'lr':0.2, 
         'lambda':0.001, 'metric':'f1', 
         'opt':'sgd','epoch':15}
fm_model.fit(param, "fm_model.out")

### FFM model

</br >
<center>
<img src="graph/ffm_graph1.png"width=500/>
</center>

In order to understand FFMs, we need to realize the meaning of field. Field is typically the broader category which contains a particular feature. In the above training example, the fields are Publisher (P), Advertiser (A) and Gender(G).

In  FMs,  every  feature  has  only  one  latent  vector v  to  learn the  latent  effect  with  any  other  features. Take  ESPN  as an  example, wESPN is  used  to  learn  the  latent  effect  with Nike (wESPN·wNike) and Male (wESPN.wMale).  

However, because ESPN and Male belong to different fields, the latent effects of (ESPN, Nike) and (ESPN, Male) may be different. This is not captured by factorization machines as it will use the same parameters for dot product in both cases.
In FFMs, each feature has several group of latent vectors. For example, when we consider the interaction term for ESPN and Nike, the hidden feature for ESPN would have the notation wESPN,A where A(Advertiser) represents the field for the feature Nike. Similarly for Gender field a different parameter wESPN,G would be learnt.
</br >
<center>
<img src="graph/ffm_graph2.png"width=800/>
</center>

In [1]:
ffm_model = xl.create_ffm()               
ffm_model.setTrain('txt/ffm_dataset2.txt') 
ffm_model.setValidate("txt/ffm_dataset_test2.txt")
param = {'task':'binary', 'lr':0.2, 
         'lambda':0.0005, 'metric':'f1', 
         'opt':'sgd','epoch':15}

ffm_model.fit(param, "ffm_model.out")

NameError: name 'xl' is not defined

## Summary  
In each model, we have try different lambda and number of epoch to modify our models and get the best f1 score.

#### LR model
</br >
<center>
<img src="graph/LR_result.jpg"width=500/>
</center>
The final result of LR model has reach 0.88 with the early stop at epoch 7 with lambda 0.2

#### Poly2 model
Because the particularity of poly2, we can't build this model by using xLearn and libffm. Though we have tried PCA to avoid overfitting and increased the coefficient of penalty item, but the score of Poly2 is still much lower than other model which is about 0.61.

#### FM model
</br >
<center>
<img src="graph/fm_reslut.png"width=500/>
</center>
The final result of FM model has reach 0.818 at epoch 12 with lambda 0.001

#### FFM model
</br >
<center>
<img src="graph/ffm_result.png" width=500/>
</center>
The final result of FM model has reach 0.92 at epoch 14 with lambda 0.0005  

#### So the reuslt shows that FFM is best choice for predicting the customer's click behavior