# Field-aware Factorization Machine
- xlearn으로 ffm 모델을 학습합니다
- 주어진 avazu-ctr-prediction 데이터만 사용합니다
- 모델의 성능은 분류 성능인 logloss로 평가합니다

In [None]:
# Download the xlearn 0.4.4 source tarball from the project's GitHub releases.
!wget https://github.com/aksnzhy/xlearn/releases/download/v0.4.4/xlearn-0.4.4.tar.gz

In [3]:
# Build and install xlearn from the local tarball xlearn-0.4.4.tar.gz.
!pip install xlearn-0.4.4.tar.gz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing ./xlearn-0.4.4.tar.gz
Building wheels for collected packages: xlearn
  Building wheel for xlearn (setup.py) ... [?25l[?25hdone
  Created wheel for xlearn: filename=xlearn-0.4.4-py3-none-any.whl size=239290 sha256=ba1cd1bc4adf57d8f61383c4a415445b817f68d2195521b390ff98c5689c9748
  Stored in directory: /root/.cache/pip/wheels/e3/f6/89/a5fed279e80bf75e7963ff80f58fc180d1e8d95a181266bea4
Successfully built xlearn
Installing collected packages: xlearn
Successfully installed xlearn-0.4.4


In [4]:
import pandas as pd
import tensorflow as tf
from datetime import datetime, date
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from google.colab import drive
import xlearn as xl
import matplotlib.pyplot as plt
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Mount Google Drive at /content/drive; the dataset path used below lives there.
drive.mount('/content/drive')
# Shorthand for the Keras backend module (not referenced by the cells visible here).
K = tf.keras.backend

Mounted at /content/drive


In [5]:
# Location of the Avazu CTR sample on the mounted Google Drive.
file_path = '/content/drive/MyDrive/recomm_study/recomm_code/Recommend_learningspoons/data/avazu-ctr-prediction-20220921T022743Z-001/avazu-ctr-prediction/ad_click.csv'

# The raw `datetime` column is encoded as '%y%m%d%H'. Read it as text and
# convert it in a single vectorized call instead of the deprecated
# `date_parser=` hook, which ran a Python-level strptime/strftime per row.
click_df = pd.read_csv(file_path, header=0, dtype={'datetime': str})

# Drop the hour component so that each timestamp becomes midnight of its day,
# matching the previous '%Y-%m-%d' round trip.
click_df['datetime'] = pd.to_datetime(
    click_df['datetime'], format='%y%m%d%H'
).dt.normalize()

click_df.head()

Unnamed: 0,id,click,datetime,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10019071520499579916,0,2014-10-21,1005,0,da79c793,71ed77a0,f028772b,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
1,10025633842336165171,0,2014-10-21,1010,1,85f751fd,c4e18dd6,50e219e0,8c0dcd5a,7801e8d9,...,4,0,21665,320,50,2493,3,35,-1,117
2,10092735447533755726,0,2014-10-21,1002,0,61a8c644,948ff336,50e219e0,ecad2386,7801e8d9,...,0,0,19665,320,50,2253,2,303,-1,52
3,10141326312159899433,1,2014-10-21,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,-1,79
4,10141793556467368079,0,2014-10-21,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157


## 데이터 로드 및 Preprocessing

## FFM 학습 데이터 생성

In [6]:
# Keep only the label, the date used for the train/test split, and the
# categorical columns that are encoded as FFM features below.
selected_columns = [
    'click', 'datetime', 'banner_pos', 'site_id', 'site_domain',
    'site_category', 'app_id', 'app_domain', 'app_category',
    'device_model', 'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# Take an explicit copy: later cells assign back into these columns, and
# writing into a slice of the original frame is chained assignment.
click_df = click_df[selected_columns].copy()

In [7]:
# Categorical columns used as FFM fields (one field per column).
feature_col = [
    'banner_pos',
    'site_id',
    'site_domain',
    'site_category',
    'app_id',
    'app_domain',
    'app_category',
    'device_model',
    'device_type',
    'device_conn_type',
    'C1',
    'C14',
    'C15',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
]

# Replace each raw value by its integer category code within its own column.
for feature in feature_col:
    as_category = click_df[feature].astype("category")
    click_df[feature] = as_category.cat.codes

In [8]:
# Display the frame after the features were replaced by their category codes.
click_df

Unnamed: 0,click,datetime,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,...,device_conn_type,C1,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2014-10-21,0,1546,767,18,1648,43,0,3696,...,0,2,375,3,2,102,0,2,0,42
1,0,2014-10-21,1,949,1316,5,965,43,3,441,...,0,5,719,3,2,196,3,2,0,39
2,0,2014-10-21,0,706,1001,5,1648,43,0,1676,...,0,1,410,3,2,116,2,21,0,15
3,1,2014-10-21,1,1541,1029,18,1648,43,0,2011,...,0,2,202,3,2,51,0,2,0,22
4,0,2014-10-21,0,617,1338,3,1648,43,0,3004,...,0,2,515,3,2,144,0,4,0,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,2014-10-30,1,647,604,18,1648,43,0,1272,...,0,2,1771,3,2,378,3,8,0,6
199996,0,2014-10-30,0,949,1316,5,564,91,3,2240,...,0,2,759,3,2,201,3,5,81,16
199997,0,2014-10-30,0,217,1620,1,1648,43,0,235,...,0,2,1272,3,2,294,0,2,0,14
199998,0,2014-10-30,0,1361,1251,5,1648,43,0,121,...,0,1,291,3,2,91,2,4,56,7


In [9]:
# Number of distinct values observed in each feature column.
feature_dim = {
    feature: click_df[feature].nunique()
    for feature in feature_col
}

In [10]:
# Display the per-feature cardinalities.
feature_dim

{'banner_pos': 7,
 'site_id': 1804,
 'site_domain': 1711,
 'site_category': 20,
 'app_id': 1776,
 'app_domain': 112,
 'app_category': 22,
 'device_model': 3751,
 'device_type': 4,
 'device_conn_type': 4,
 'C1': 7,
 'C14': 1934,
 'C15': 8,
 'C16': 9,
 'C17': 405,
 'C18': 4,
 'C19': 65,
 'C20': 159,
 'C21': 60}

In [11]:
# Total number of model variables = sum of all per-feature cardinalities.
total_variables = sum(feature_dim.values())
print('number of variables:', total_variables)

number of variables: 11862


In [12]:
# Convert every per-column category code into a global FFM variable index by
# shifting each column by the total cardinality of the columns before it.
# NOTE: this mutates click_df in place, so re-running the cell shifts again.

idx = 0
for feature in feature_col:
    # `cat.codes` uses the narrowest integer dtype (int8/int16). Widen to
    # int64 first so that adding an offset in the thousands can neither wrap
    # around nor raise under NumPy's scalar-promotion rules.
    click_df[feature] = click_df[feature].astype('int64') + idx
    idx += feature_dim[feature]

click_df

Unnamed: 0,click,datetime,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_model,...,device_conn_type,C1,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2014-10-21,0,1553,2578,3540,5190,5361,5430,9148,...,9207,9213,9593,11155,11162,11271,11574,11580,11643,11844
1,0,2014-10-21,1,956,3127,3527,4507,5361,5433,5893,...,9207,9216,9937,11155,11162,11365,11577,11580,11643,11841
2,0,2014-10-21,0,713,2812,3527,5190,5361,5430,7128,...,9207,9212,9628,11155,11162,11285,11576,11599,11643,11817
3,1,2014-10-21,1,1548,2840,3540,5190,5361,5430,7463,...,9207,9213,9420,11155,11162,11220,11574,11580,11643,11824
4,0,2014-10-21,0,624,3149,3525,5190,5361,5430,8456,...,9207,9213,9733,11155,11162,11313,11574,11582,11643,11844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,2014-10-30,1,654,2415,3540,5190,5361,5430,6724,...,9207,9213,10989,11155,11162,11547,11577,11586,11643,11808
199996,0,2014-10-30,0,956,3127,3527,4106,5409,5433,7692,...,9207,9213,9977,11155,11162,11370,11577,11583,11724,11818
199997,0,2014-10-30,0,224,3431,3523,5190,5361,5430,5687,...,9207,9213,10490,11155,11162,11463,11574,11580,11643,11816
199998,0,2014-10-30,0,1368,3062,3527,5190,5361,5430,5573,...,9207,9212,9509,11155,11162,11260,11576,11582,11699,11809


### train / test 데이터 생성

In [13]:
# Split chronologically: rows up to and including the cutoff date are used
# for training, rows strictly after it for testing.
cutoff_date = '2014-10-28'

is_train = click_df['datetime'] <= cutoff_date
is_test = click_df['datetime'] > cutoff_date

train_df = click_df[is_train]
test_df = click_df[is_test]

# Report the size of each split.
print(len(train_df))
print(len(test_df))

160052
39948


In [14]:
# xlearn expects libffm text format, one sample per line:
#   label field_1:index_1:value_1 field_2:index_2:value_2 ...
# The field id is the column position in feature_col, the index is the global
# variable index, and every value is 1.0.

# Pull labels and the feature matrix out of the frame once; iterating with
# iterrows() would build and re-index a Series for every single row.
train_labels = train_df['click'].tolist()
train_rows = train_df[feature_col].to_numpy().tolist()

with open('./ffm_train.txt', 'w') as f:
    for label, row in zip(train_labels, train_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

In [15]:
# Write the test split in libffm text format (label field:index:value ...),
# using the same field numbering as the training file.

# Extract labels and features once instead of materialising a Series per row
# through iterrows().
test_labels = test_df['click'].tolist()
test_rows = test_df[feature_col].to_numpy().tolist()

with open('./ffm_test.txt', 'w') as f:
    for label, row in zip(test_labels, test_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

## xlearn FFM
- 참고: xlearn 모델 관련 [하이퍼파라미터](https://xlearn-doc.readthedocs.io/en/latest/all_api/index.html?highlight=create_fm#xlearn-python-api)

In [19]:
# xlearn is imported again here (it is already imported in the first import cell).
import xlearn as xl
import os
# NOTE(review): presumably xlearn reads the USER environment variable and it
# may be unset in this runtime — confirm why this is needed.
os.environ['USER'] = 'test'
# Call xlearn's hello() as a quick check that the package loads.
xl.hello()

In [None]:
# Declare the FFM model
import os

ffm_model = xl.create_ffm()

# The libffm files were written to the working directory by the cells above
# ('./ffm_train.txt', './ffm_test.txt'); the former './content/...' prefix
# pointed at a directory those files were never written to.
ffm_model.setTrain("./ffm_train.txt")
ffm_model.setValidate("./ffm_test.txt")

# Hyperparameters
param = {'task':'binary', 'lr':0.2, 'lambda':0.001, 'k': 4, 'epoch': 100}

# Make sure the output directory for the model files exists before training.
os.makedirs("./content", exist_ok=True)

# Train the model; also dump a human-readable copy of the weights.
ffm_model.setTXTModel("./content/ffm_model.txt")
ffm_model.fit(param, "./content/ffm_model.out")

In [None]:
# Predict CTR on the test split with the trained FFM model => ./ffm_output.txt

# The test file was written to the working directory as './ffm_test.txt';
# './content/ffm_test.txt' does not match where it was saved.
ffm_model.setTest("./ffm_test.txt")
# Convert raw scores to probabilities in (0, 1).
ffm_model.setSigmoid()
# The model path must match the one passed to fit() in the training cell.
ffm_model.predict("./content/ffm_model.out", "./ffm_output.txt")

NameError: ignored

### 하이퍼 파라미터 튜닝

In [None]:
from itertools import product

from sklearn.metrics import log_loss

# Ground-truth clicks of the hold-out period; reused by later cells.
test_click = test_df['click'].values

# Search grid. product() iterates k in the outermost position, then lambda,
# then lr, i.e. the same order as three nested loops.
# NOTE(review): configurations are ranked on the test split itself, so the
# best score is optimistic; a separate validation split would avoid this.
k_grid = [1, 2, 4, 8]
lambda_grid = [0.0005, 0.001, 0.002]
lr_grid = [0.1, 0.2, 0.3]

result = []

for k, _lambda, lr in product(k_grid, lambda_grid, lr_grid):

    # train
    ffm_model = xl.create_ffm()
    ffm_model.setTrain("./ffm_train.txt")
    param = {'task':'binary', 'lr':lr, 'lambda':_lambda, 'k':k, 'epoch':30}
    ffm_model.fit(param, "./ffm_model.out")

    # test
    ffm_model.setTest("./ffm_test.txt")
    ffm_model.setSigmoid()
    pCTR = ffm_model.predict("./ffm_model.out")

    # Evaluate once and reuse the value for both the table and the log line.
    logloss = log_loss(test_click, pCTR)
    result.append([k, _lambda, lr, logloss])
    print(k, _lambda, lr, logloss)

result_df = pd.DataFrame(result, columns = ['k', 'lambda', 'lr', 'logloss'])

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.44 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 32 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./ffm_train.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 11857
[32m[------------] [0mNumber of Field: 19
[32m[------------] [0mTime cost for reading problem: 0.06 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 6.97 MB
[32m[------------] 

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.416869                0.18
[32m[ [0m   6%[32m      ][0m     2            0.408087                0.18
[32m[ [0m  10%[32m      ][0m     3            0.404978                0.18
[32m[ [0m  13%[32m      ][0m     4            0.403010                0.18
[32m[ [0m  16%[32m      ][0m     5            0.401437                0.18
[32m[ [0m  20%[32m      ][0m     6            0.400205                0.18
[32m[ [0m  23%[32m      ][0m     7            0.399192                0.18
[32m[ [0m  26%[32m      ][0m     8            0.398209                0.18
[32m[ [0m  30%[32m      ][0m     9            0.397338                0.18
[32m[ [0m  33%[32m      ][0m    10            0.396533                0.18
[32m[ [0m  36%[32m      ][0m    11            0.395795                0.18
[32m[ [0m  40%[32m      ][0m    12            0.395055   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.418667                0.18
[32m[ [0m   6%[32m      ][0m     2            0.410385                0.18
[32m[ [0m  10%[32m      ][0m     3            0.407513                0.18
[32m[ [0m  13%[32m      ][0m     4            0.405673                0.18
[32m[ [0m  16%[32m      ][0m     5            0.404260                0.18
[32m[ [0m  20%[32m      ][0m     6            0.403189                0.18
[32m[ [0m  23%[32m      ][0m     7            0.402322                0.18
[32m[ [0m  26%[32m      ][0m     8            0.401509                0.18
[32m[ [0m  30%[32m      ][0m     9            0.400838                0.18
[32m[ [0m  33%[32m      ][0m    10            0.400227                0.18
[32m[ [0m  36%[32m      ][0m    11            0.399678                0.18
[32m[ [0m  40%[32m      ][0m    12            0.399147   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.422735                0.15
[32m[ [0m   6%[32m      ][0m     2            0.414783                0.15
[32m[ [0m  10%[32m      ][0m     3            0.412097                0.15
[32m[ [0m  13%[32m      ][0m     4            0.410377                0.15
[32m[ [0m  16%[32m      ][0m     5            0.409073                0.15
[32m[ [0m  20%[32m      ][0m     6            0.408046                0.15
[32m[ [0m  23%[32m      ][0m     7            0.407242                0.15
[32m[ [0m  26%[32m      ][0m     8            0.406510                0.15
[32m[ [0m  30%[32m      ][0m     9            0.405877                0.15
[32m[ [0m  33%[32m      ][0m    10            0.405340                0.15
[32m[ [0m  36%[32m      ][0m    11            0.404848                0.15
[32m[ [0m  40%[32m      ][0m    12            0.404376   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.417456                0.15
[32m[ [0m   6%[32m      ][0m     2            0.409128                0.15
[32m[ [0m  10%[32m      ][0m     3            0.406320                0.15
[32m[ [0m  13%[32m      ][0m     4            0.404518                0.15
[32m[ [0m  16%[32m      ][0m     5            0.403073                0.15
[32m[ [0m  20%[32m      ][0m     6            0.402011                0.15
[32m[ [0m  23%[32m      ][0m     7            0.401203                0.15
[32m[ [0m  26%[32m      ][0m     8            0.400395                0.15
[32m[ [0m  30%[32m      ][0m     9            0.399719                0.15
[32m[ [0m  33%[32m      ][0m    10            0.399110                0.15
[32m[ [0m  36%[32m      ][0m    11            0.398575                0.15
[32m[ [0m  40%[32m      ][0m    12            0.398079   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.418258                0.15
[32m[ [0m   6%[32m      ][0m     2            0.409936                0.15
[32m[ [0m  10%[32m      ][0m     3            0.406988                0.14
[32m[ [0m  13%[32m      ][0m     4            0.405063                0.15
[32m[ [0m  16%[32m      ][0m     5            0.403598                0.15
[32m[ [0m  20%[32m      ][0m     6            0.402423                0.14
[32m[ [0m  23%[32m      ][0m     7            0.401460                0.15
[32m[ [0m  26%[32m      ][0m     8            0.400568                0.15
[32m[ [0m  30%[32m      ][0m     9            0.399798                0.14
[32m[ [0m  33%[32m      ][0m    10            0.399076                0.14
[32m[ [0m  36%[32m      ][0m    11            0.398422                0.14
[32m[ [0m  40%[32m      ][0m    12            0.397790   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.422170                0.18
[32m[ [0m   6%[32m      ][0m     2            0.414209                0.18
[32m[ [0m  10%[32m      ][0m     3            0.411453                0.18
[32m[ [0m  13%[32m      ][0m     4            0.409667                0.18
[32m[ [0m  16%[32m      ][0m     5            0.408349                0.18
[32m[ [0m  20%[32m      ][0m     6            0.407290                0.18
[32m[ [0m  23%[32m      ][0m     7            0.406457                0.18
[32m[ [0m  26%[32m      ][0m     8            0.405673                0.18
[32m[ [0m  30%[32m      ][0m     9            0.405023                0.18
[32m[ [0m  33%[32m      ][0m    10            0.404460                0.18
[32m[ [0m  36%[32m      ][0m    11            0.403941                0.18
[32m[ [0m  40%[32m      ][0m    12            0.403445   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.417051                0.15
[32m[ [0m   6%[32m      ][0m     2            0.408501                0.15
[32m[ [0m  10%[32m      ][0m     3            0.405554                0.15
[32m[ [0m  13%[32m      ][0m     4            0.403695                0.15
[32m[ [0m  16%[32m      ][0m     5            0.402191                0.15
[32m[ [0m  20%[32m      ][0m     6            0.401061                0.15
[32m[ [0m  23%[32m      ][0m     7            0.400146                0.15
[32m[ [0m  26%[32m      ][0m     8            0.399293                0.15
[32m[ [0m  30%[32m      ][0m     9            0.398568                0.15
[32m[ [0m  33%[32m      ][0m    10            0.397900                0.15
[32m[ [0m  36%[32m      ][0m    11            0.397328                0.15
[32m[ [0m  40%[32m      ][0m    12            0.396775   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.419100                0.19
[32m[ [0m   6%[32m      ][0m     2            0.410999                0.18
[32m[ [0m  10%[32m      ][0m     3            0.408276                0.18
[32m[ [0m  13%[32m      ][0m     4            0.406485                0.19
[32m[ [0m  16%[32m      ][0m     5            0.405121                0.19
[32m[ [0m  20%[32m      ][0m     6            0.404071                0.18
[32m[ [0m  23%[32m      ][0m     7            0.403282                0.18
[32m[ [0m  26%[32m      ][0m     8            0.402517                0.18
[32m[ [0m  30%[32m      ][0m     9            0.401863                0.18
[32m[ [0m  33%[32m      ][0m    10            0.401291                0.18
[32m[ [0m  36%[32m      ][0m    11            0.400790                0.18
[32m[ [0m  40%[32m      ][0m    12            0.400300   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.421927                0.15
[32m[ [0m   6%[32m      ][0m     2            0.413802                0.15
[32m[ [0m  10%[32m      ][0m     3            0.410944                0.15
[32m[ [0m  13%[32m      ][0m     4            0.409094                0.15
[32m[ [0m  16%[32m      ][0m     5            0.407730                0.15
[32m[ [0m  20%[32m      ][0m     6            0.406617                0.15
[32m[ [0m  23%[32m      ][0m     7            0.405712                0.15
[32m[ [0m  26%[32m      ][0m     8            0.404881                0.15
[32m[ [0m  30%[32m      ][0m     9            0.404180                0.15
[32m[ [0m  33%[32m      ][0m    10            0.403563                0.15
[32m[ [0m  36%[32m      ][0m    11            0.402994                0.15
[32m[ [0m  40%[32m      ][0m    12            0.402444   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.416823                0.15
[32m[ [0m   6%[32m      ][0m     2            0.408068                0.15
[32m[ [0m  10%[32m      ][0m     3            0.404975                0.15
[32m[ [0m  13%[32m      ][0m     4            0.402999                0.15
[32m[ [0m  16%[32m      ][0m     5            0.401377                0.15
[32m[ [0m  20%[32m      ][0m     6            0.400076                0.15
[32m[ [0m  23%[32m      ][0m     7            0.399006                0.15
[32m[ [0m  26%[32m      ][0m     8            0.397968                0.15
[32m[ [0m  30%[32m      ][0m     9            0.397044                0.15
[32m[ [0m  33%[32m      ][0m    10            0.396185                0.15
[32m[ [0m  36%[32m      ][0m    11            0.395379                0.15
[32m[ [0m  40%[32m      ][0m    12            0.394590   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.418629                0.18
[32m[ [0m   6%[32m      ][0m     2            0.410335                0.19
[32m[ [0m  10%[32m      ][0m     3            0.407466                0.19
[32m[ [0m  13%[32m      ][0m     4            0.405644                0.19
[32m[ [0m  16%[32m      ][0m     5            0.404261                0.18
[32m[ [0m  20%[32m      ][0m     6            0.403175                0.19
[32m[ [0m  23%[32m      ][0m     7            0.402278                0.18
[32m[ [0m  26%[32m      ][0m     8            0.401483                0.18
[32m[ [0m  30%[32m      ][0m     9            0.400804                0.19
[32m[ [0m  33%[32m      ][0m    10            0.400178                0.18
[32m[ [0m  36%[32m      ][0m    11            0.399642                0.18
[32m[ [0m  40%[32m      ][0m    12            0.399128   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.422657                0.19
[32m[ [0m   6%[32m      ][0m     2            0.414784                0.19
[32m[ [0m  10%[32m      ][0m     3            0.412101                0.18
[32m[ [0m  13%[32m      ][0m     4            0.410378                0.19
[32m[ [0m  16%[32m      ][0m     5            0.409067                0.18
[32m[ [0m  20%[32m      ][0m     6            0.408046                0.18
[32m[ [0m  23%[32m      ][0m     7            0.407241                0.18
[32m[ [0m  26%[32m      ][0m     8            0.406513                0.18
[32m[ [0m  30%[32m      ][0m     9            0.405888                0.19
[32m[ [0m  33%[32m      ][0m    10            0.405342                0.18
[32m[ [0m  36%[32m      ][0m    11            0.404857                0.18
[32m[ [0m  40%[32m      ][0m    12            0.404393   

[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.417431                0.18
[32m[ [0m   6%[32m      ][0m     2            0.409128                0.18
[32m[ [0m  10%[32m      ][0m     3            0.406295                0.18
[32m[ [0m  13%[32m      ][0m     4            0.404519                0.18
[32m[ [0m  16%[32m      ][0m     5            0.403102                0.19
[32m[ [0m  20%[32m      ][0m     6            0.402010                0.18
[32m[ [0m  23%[32m      ][0m     7            0.401201                0.18
[32m[ [0m  26%[32m      ][0m     8            0.400370                0.18
[32m[ [0m  30%[32m      ][0m     9            0.399715                0.18
[32m[ [0m  33%[32m      ][0m    10            0.399121                0.18
[32m[ [0m  36%[32m      ][0m    11            0.398589                0.18
[32m[ [0m  40%[32m      ][0m    12            0.398096   

[32m[------------] [0mModel size: 13.84 MB
[32m[------------] [0mTime cost for model initial: 0.10 (sec)
[32m[1m[ ACTION     ] Start to train ...[0m
[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.418379                0.27
[32m[ [0m   6%[32m      ][0m     2            0.409951                0.30
[32m[ [0m  10%[32m      ][0m     3            0.407019                0.32
[32m[ [0m  13%[32m      ][0m     4            0.405087                0.32
[32m[ [0m  16%[32m      ][0m     5            0.403622                0.32
[32m[ [0m  20%[32m      ][0m     6            0.402399                0.32
[32m[ [0m  23%[32m      ][0m     7            0.401466                0.33
[32m[ [0m  26%[32m      ][0m     8            0.400564                0.31
[32m[ [0m  30%[32m      ][0m     9            0.399752                0.31
[32m[ [0m  33%[32m      ][0m    10            0.399052     

[32m[------------] [0mModel size: 13.84 MB
[32m[------------] [0mTime cost for model initial: 0.15 (sec)
[32m[1m[ ACTION     ] Start to train ...[0m
[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.422244                0.20
[32m[ [0m   6%[32m      ][0m     2            0.414251                0.24
[32m[ [0m  10%[32m      ][0m     3            0.411457                0.23
[32m[ [0m  13%[32m      ][0m     4            0.409690                0.23
[32m[ [0m  16%[32m      ][0m     5            0.408349                0.23
[32m[ [0m  20%[32m      ][0m     6            0.407285                0.23
[32m[ [0m  23%[32m      ][0m     7            0.406445                0.23
[32m[ [0m  26%[32m      ][0m     8            0.405683                0.23
[32m[ [0m  30%[32m      ][0m     9            0.405030                0.23
[32m[ [0m  33%[32m      ][0m    10            0.404469     

[32m[------------] [0mModel size: 13.84 MB
[32m[------------] [0mTime cost for model initial: 0.16 (sec)
[32m[1m[ ACTION     ] Start to train ...[0m
[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.416945                0.31
[32m[ [0m   6%[32m      ][0m     2            0.408442                0.33
[32m[ [0m  10%[32m      ][0m     3            0.405539                0.33
[32m[ [0m  13%[32m      ][0m     4            0.403622                0.32
[32m[ [0m  16%[32m      ][0m     5            0.402141                0.32
[32m[ [0m  20%[32m      ][0m     6            0.401009                0.33
[32m[ [0m  23%[32m      ][0m     7            0.400095                0.32
[32m[ [0m  26%[32m      ][0m     8            0.399272                0.31
[32m[ [0m  30%[32m      ][0m     9            0.398540                0.34
[32m[ [0m  33%[32m      ][0m    10            0.397911     

[32m[------------] [0mModel size: 13.84 MB
[32m[------------] [0mTime cost for model initial: 0.16 (sec)
[32m[1m[ ACTION     ] Start to train ...[0m
[32m[------------][0m Epoch      Train log_loss     Time cost (sec)
[32m[ [0m   3%[32m      ][0m     1            0.418982                0.22
[32m[ [0m   6%[32m      ][0m     2            0.411047                0.24
[32m[ [0m  10%[32m      ][0m     3            0.408261                0.24
[32m[ [0m  13%[32m      ][0m     4            0.406462                0.19
[32m[ [0m  16%[32m      ][0m     5            0.405093                0.23
[32m[ [0m  20%[32m      ][0m     6            0.404080                0.25
[32m[ [0m  23%[32m      ][0m     7            0.403230                0.24
[32m[ [0m  26%[32m      ][0m     8            0.402490                0.24
[32m[ [0m  30%[32m      ][0m     9            0.401844                0.24
[32m[ [0m  33%[32m      ][0m    10            0.401311     

In [None]:
# Rank the evaluated configurations by test log loss, best (lowest) first.
result_df.sort_values(by='logloss')

Unnamed: 0,k,lambda,lr,logloss
27,8,0.0005,0.1,0.402134
19,4,0.0005,0.2,0.402175
10,2,0.0005,0.2,0.402269
1,1,0.0005,0.2,0.402337
9,2,0.0005,0.1,0.402343
18,4,0.0005,0.1,0.40235
31,8,0.001,0.2,0.402535
22,4,0.001,0.2,0.402543
0,1,0.0005,0.1,0.40259
13,2,0.001,0.2,0.402626


# Feature Ablation
- 일반적인 regression, classification에 대해서도 어떤 피쳐가 가장 중요한 역할을 하는지 분석합니다
- Tree 모델의 경우 Feature Importance로 표현하기도 합니다
- 현업에서 FM, FFM 계열의 모델을 학습할 때 피쳐의 개수를 최대한 줄이고 서빙속도를 빠르게 하는 것이 목표이므로 다양한 피쳐의 조합에 대해서 실험을 하고 가장 중요한 피쳐들만 선택하여 최종 모델을 만듭니다

In [None]:
# Features used so far => next, compare performance with C14 and with app_id removed
feature_dim

{'banner_pos': 7,
 'site_id': 1804,
 'site_domain': 1711,
 'site_category': 20,
 'app_id': 1776,
 'app_domain': 112,
 'app_category': 22,
 'device_model': 3751,
 'device_type': 4,
 'device_conn_type': 4,
 'C1': 7,
 'C14': 1934,
 'C15': 8,
 'C16': 9,
 'C17': 405,
 'C18': 4,
 'C19': 65,
 'C20': 159,
 'C21': 60}

## C14 피쳐를 제외했을 때 성능

In [None]:
# Every feature except C14, in the same order as feature_col. Deriving the
# list keeps the ablation in step with the baseline feature set instead of
# relying on a second hand-maintained copy.
no_c14 = [feature for feature in feature_col if feature != 'C14']

In [None]:
# Write the training split without C14 in libffm text format
# (label field:index:value ...). Field ids are the positions within no_c14;
# the variable indices are the global ones already stored in the frame.

# Extract labels and features once instead of building a Series per row
# through iterrows().
no_c14_train_labels = train_df['click'].tolist()
no_c14_train_rows = train_df[no_c14].to_numpy().tolist()

with open('./no_c14_train.txt', 'w') as f:
    for label, row in zip(no_c14_train_labels, no_c14_train_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

In [None]:
# Write the test split without C14 in libffm text format, using the same
# field numbering as the matching training file.

# Extract labels and features once instead of building a Series per row
# through iterrows().
no_c14_test_labels = test_df['click'].tolist()
no_c14_test_rows = test_df[no_c14].to_numpy().tolist()

with open('./no_c14_test.txt', 'w') as f:
    for label, row in zip(no_c14_test_labels, no_c14_test_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

In [None]:
# Ground-truth clicks of the hold-out period. Recomputed here so this cell
# does not depend on the grid-search cell having been run first.
test_click = test_df['click'].values

# train
ffm_model = xl.create_ffm()
ffm_model.setTrain("./no_c14_train.txt")
# Hyperparameters of the top-ranked configuration in the grid-search table.
param = {'task':'binary', 'lr' : 0.1, 'lambda': 0.0005, 'k': 8, 'epoch': 30}
ffm_model.fit(param, "./no_c14_model.out")

# test
ffm_model.setTest("./no_c14_test.txt")
# Convert raw scores to probabilities before computing log loss.
ffm_model.setSigmoid()
pCTR = ffm_model.predict("./no_c14_model.out")
print("log_loss: ", log_loss(test_click, pCTR))

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.44 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 32 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./no_c14_train.txt.bin) found. Skip converting text to binary.
[32m[------------] [0mNumber of Feature: 11857
[32m[------------] [0mNumber of Field: 18
[32m[------------] [0mTime cost for reading problem: 0.22 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 13.12 MB
[32m[----------

## app_id 피쳐를 제외했을 때 성능

In [None]:
# Every feature except app_id, in the same order as feature_col. Deriving the
# list keeps the ablation in step with the baseline feature set instead of
# relying on a second hand-maintained copy.
no_app_id = [feature for feature in feature_col if feature != 'app_id']

In [None]:
# Write the training split without app_id in libffm text format
# (label field:index:value ...). Field ids are the positions within
# no_app_id; the variable indices are the global ones stored in the frame.

# Extract labels and features once instead of building a Series per row
# through iterrows().
no_app_id_train_labels = train_df['click'].tolist()
no_app_id_train_rows = train_df[no_app_id].to_numpy().tolist()

with open('./no_app_id_train.txt', 'w') as f:
    for label, row in zip(no_app_id_train_labels, no_app_id_train_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

In [None]:
# Write the test split without app_id in libffm text format, using the same
# field numbering as the matching training file.

# Extract labels and features once instead of building a Series per row
# through iterrows().
no_app_id_test_labels = test_df['click'].tolist()
no_app_id_test_rows = test_df[no_app_id].to_numpy().tolist()

with open('./no_app_id_test.txt', 'w') as f:
    for label, row in zip(no_app_id_test_labels, no_app_id_test_rows):
        tokens = [str(label)]
        tokens.extend(f'{field}:{elem}:1.0' for field, elem in enumerate(row))
        f.write(' '.join(tokens) + '\n')

In [None]:
# Ground-truth clicks of the hold-out period. Recomputed here so this cell
# does not depend on the grid-search cell having been run first.
test_click = test_df['click'].values

# train
ffm_model = xl.create_ffm()
ffm_model.setTrain("./no_app_id_train.txt")
# Hyperparameters of the top-ranked configuration in the grid-search table.
param = {'task':'binary', 'lr' : 0.1, 'lambda': 0.0005, 'k': 8, 'epoch': 30}
ffm_model.fit(param, "./no_app_id_model.out")

# test
ffm_model.setTest("./no_app_id_test.txt")
# Convert raw scores to probabilities before computing log loss.
ffm_model.setSigmoid()
pCTR = ffm_model.predict("./no_app_id_model.out")
print("log_loss: ", log_loss(test_click, pCTR))

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.44 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 32 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./no_app_id_train.txt.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mNumber of Feature: 11857
[32m[------------] [0mNumber of Field: 18
[32m[------------] [0mTime cost for reading problem: 0.36 (sec)
[32m[1m[ ACTION     ] Initialize model ...[0m
[32m[------------] [0mModel size: 13.12 MB
[32m[-