In [1]:
import collections
import pathlib

# データ処理周り
import numpy as np
import pandas as pd
from IPython.core.display import display

# 可視化ライブラリ
import matplotlib.pyplot as plt
from matplotlib import cm # colormap
import seaborn as sns
%matplotlib inline

# pandasのwarningが邪魔なので
import warnings
warnings.filterwarnings('ignore')

# options
# pd.set_option('display.max_columns', 50)
plt.style.use('ggplot')


####
import sys
sys.path.append('../script')
data_path = pathlib.Path('../data')
my_data_path = pathlib.Path('../my-data')

In [2]:
# %reload_ext autoreload
import talking_data as td
import preprocess 

In [3]:
train = pd.read_csv(data_path / 'train.csv', dtype=td.dtypes)

In [4]:
test = pd.read_csv(data_path / 'test.csv', dtype=td.dtypes)

## サンプルデータ作成

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
for i in range(1, 4):
    n = 0.1**i
    shape = train.shape[0] * n
    print(shape)
    sample = train.sample(frac=n)
    y = sample['is_attributed']
    X = sample[td.base_X_keys]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    pd.concat([X_train, y_train], axis=1).to_csv('../my-data/train_{}.csv'.format(int(n*100)))
    pd.concat([X_test, y_test], axis=1).to_csv('../my-data/test_{}.csv'.format(int(n*100)))

## データマージ

In [5]:
merged = pd.concat([train, test])
del train, test

## IPカウントテーブルの作成

In [6]:
merged.head()

Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os
0,3,,379,,2017-11-06 14:32:21,1,83230,0.0,13
1,3,,379,,2017-11-06 14:33:34,1,17357,0.0,19
2,3,,379,,2017-11-06 14:34:12,1,35810,0.0,13
3,14,,478,,2017-11-06 14:34:52,1,45745,0.0,13
4,3,,379,,2017-11-06 14:35:08,1,161007,0.0,13


In [7]:
ip_count = merged.groupby(['ip'])['channel'].count().reset_index()
ip_count.columns = ['ip', 'clicks_by_ip']
ip_count['clicks_by_ip'] = ip_count['clicks_by_ip'].astype('uint16')
ip_count.head()

Unnamed: 0,ip,clicks_by_ip
0,0,2
1,1,48
2,2,5
3,3,64
4,4,6


In [None]:
ip_count.to_csv(my_data_path / 'table' / 'ip_count.csv', index=False)

## prevクリック

In [6]:
preprocess.base(merged)

In [7]:
next_click = merged[['ip', 'click_time']]

In [8]:
next_click['next_click'] = merged.groupby(['ip', 'app', 'device', 'os'])['click_time'].diff(-1).dt.total_seconds()

ValueError: 

In [None]:
next_click.to_csv(my_data_path / 'table' / 'next_click.csv', index=False)

## データを見る

In [None]:
df = pd.read_csv(my_data_path / 'train_0.csv', dtype=td.dtypes)

In [None]:
df.head()

In [None]:
df[df.is_attributed > 0].head()