## CTR 예측

CTR : Click Through Rate : 전체 페이지 뷰 횟수 대비 광고 클릭 횟수 비율

## 데이터 필드

- id 
- click : 클릭하지 않은 경우 0, 클릭한 경우 1
- hour : YYYYMMDDHH 포맷
- C1 : 익명처리된 범주형 변수
- banner_pos : 배너위치 0, 1
- site_id 
- site_domain
- site_category
- app_id
- app_domain
- app_category
- device_id
- device_ip
- device_model
- device_type
- device_conn_type
- C14-C21 -- 익명처리된 범주형 변수

https://www.kaggle.com/c/avazu-ctr-prediction/data?select=train.gz

In [44]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
train_df = pd.read_csv('../rawdata/train.csv', nrows=100000)

In [46]:
train_df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [47]:
unused_columns, label_column = ["id", "hour", "device_id", "device_ip"], "click"

In [48]:
train_df = train_df.drop(unused_columns, axis=1)

In [49]:
X_dict_train = list(train_df.drop(label_column, axis=1).T.to_dict().values())

In [50]:
y_train = train_df[label_column]

In [51]:
test_df = pd.read_csv('../rawdata/train.csv', header=0, skiprows=(1, 100000), nrows=100000)

In [52]:
test_df = test_df.drop(unused_columns, axis=1)
X_dict_test = list(test_df.drop(label_column, axis=1).T.to_dict().values())

In [53]:
y_test = test_df[label_column]

## sklearn Tree 기반 모델 - feature 모두 수치형으로 해야함

In [54]:
# 원핫 인코딩, 딕셔너리 변환이유는 일괄적용을 위해서
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse=True)

In [55]:
X_train = vectorizer.fit_transform(X_dict_train)

In [56]:
X_train.shape

(100000, 4952)

In [57]:
X_train.toarray()[:5]

array([[ 1005., 15706.,   320., ...,     0.,     0.,     0.],
       [ 1005., 15704.,   320., ...,     0.,     0.,     0.],
       [ 1005., 15704.,   320., ...,     0.,     0.,     0.],
       [ 1005., 15706.,   320., ...,     0.,     0.,     0.],
       [ 1005., 18993.,   320., ...,     0.,     0.,     0.]])

In [58]:
X_test = vectorizer.fit_transform(X_dict_test)

In [59]:
X_test.shape

(100000, 4952)

## GridSearch

In [60]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [61]:
parameters = {"max_depth" : [3, 10, None]}

In [62]:
decision_tree = DecisionTreeClassifier(criterion="gini", min_samples_split=30)

In [63]:
grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring="roc_auc")

In [64]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(min_samples_split=30),
             n_jobs=-1, param_grid={'max_depth': [3, 10, None]},
             scoring='roc_auc')

In [65]:
grid_search.best_params_

{'max_depth': 10}

In [66]:
decision_tree_best = grid_search.best_estimator_

In [67]:
from sklearn.tree import export_graphviz

export_graphviz(decision_tree_best, out_file="ctr_decision_tree.dot", feature_names=vectorizer.feature_names_, \
                class_names = ["0", "1"], rounded=True, filled=True, impurity=True)

In [68]:
import pydot

In [69]:
graph = pydot.graph_from_dot_file("ctr_decision_tree.dot")[0]

In [70]:
graph.write_png("ctr_decision_tree.png")

FileNotFoundError: [WinError 2] "dot" not found in path.

In [None]:
import os
os.getcwd()