# load package and data

In [1]:
import google.datalab.storage as storage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
from google.datalab import Context
import random, string

p = sns.color_palette()

In [2]:
shared_bucket = storage.Bucket('capstone-01')
for obj in shared_bucket.objects():
  if obj.key.find('/') < 0:
    print(obj.key) # Bucket 내에 포함된 데이터 셋 확인

clicks_test.csv
clicks_train.csv
dis_ad_ctr.csv
dis_several_ads.csv
documents_categories.csv
documents_entities.csv
documents_meta.csv
documents_topics.csv
events.csv
page_views_sample.csv
promoted_content.csv
sample_submission.csv


# clicks_train & clicks_test

In [3]:
my_bucket = storage.Bucket('capstone-01')

clicks_test = my_bucket.object('clicks_test.csv')
uri = clicks_test.uri
%gcs read --object $uri --variable c_test
clicks_test = pd.read_csv(BytesIO(c_test))
clicks_test.head(5)

Unnamed: 0,display_id,ad_id
0,16874594,66758
1,16874594,150083
2,16874594,162754
3,16874594,170392
4,16874594,172888


In [4]:
clicks_train = my_bucket.object('clicks_train.csv')
uri = clicks_train.uri
%gcs read --object $uri --variable c_test
clicks_train = pd.read_csv(BytesIO(c_test))
clicks_train.head(5)

Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1
3,1,156824,0
4,1,279295,0


# cilcks_train의 ad_id를 ctr 내림차순으로 정렬 (views 5 이상) 

In [5]:
# ad_id별 views, clicks, ctr, ctr.percent 구하기
views = clicks_train.groupby('ad_id')['display_id'].count()
clicks = clicks_train.groupby('ad_id')['clicked'].sum()
ctr = round(clicks/views, 4)
ctr.percent = ctr * 100

# ad_id별 views, clicks, ctr, ctr.percent 테이블 만들기
ad_info = pd.concat([views, clicks, ctr, ctr.percent], axis=1)
ad_info.reset_index(level=['ad_id'], inplace=True) # group_by 후 index였던 ad_id를 컬럼으로 보내기
print(ad_info.head())

   ad_id  display_id  clicked       0      1
0      1           2        0  0.0000   0.00
1      2          22        1  0.0455   4.55
2      3         161       11  0.0683   6.83
3      4          32        4  0.1250  12.50
4      5           1        0  0.0000   0.00


In [6]:
# 컬럼명 변경
ad_info.columns = ['ad_id', 'views', 'clicks', 'ctr', 'ctr.percent']

# ctr 순으로 내림차순 정렬
ad_info_desc = ad_info.sort_values('ctr', ascending=False)
print(ad_info_desc.head())

         ad_id  views  clicks  ctr  ctr.percent
478949  548019      1       1  1.0        100.0
396495  439589      1       1  1.0        100.0
438221  486459      1       1  1.0        100.0
396501  439599      1       1  1.0        100.0
470153  530339      1       1  1.0        100.0


In [7]:
# CTR 신뢰도를 높이기 위해, views 수 5개 이상인 것만 추출
ad_info_views_over5 = ad_info[ad_info['views'] >= 5].sort_values('ctr', ascending=False)
print(ad_info_views_over5.shape) # (255975, 4)
print(ad_info_views_over5.head())

(255975, 5)
         ad_id  views  clicks  ctr  ctr.percent
45667    50785      5       5  1.0        100.0
472699  534762      5       5  1.0        100.0
355855  395356      7       7  1.0        100.0
355353  394799      5       5  1.0        100.0
230817  259272      5       5  1.0        100.0


# click_train의 ad_id의 ctr을 clicks_test에 복붙하자

In [8]:
# 혹시 모르니까 ad_info_unq로 복사
ad_info_unq = ad_info_views_over5

# clicks_test와 ad_info_unq(clicks_train의 ad 정보)를 merge
click_test_with_ad_info = pd.merge(clicks_test, ad_info_unq, on='ad_id', how='left')
print(click_test_with_ad_info.shape)
print(click_test_with_ad_info.head())

# merge한 데이터에서 NaN 있나 확인
nan_rows = click_test_with_ad_info[click_test_with_ad_info['views'].isnull()]
print(nan_rows.shape)
prtin(clicks_test.shape)

# NaN의 비율 = 4.69%
1510415/32225162*100

   display_id   ad_id    views   clicks     ctr  ctr.percent
0    16874594   66758   5642.0    373.0  0.0661         6.61
1    16874594  150083  77514.0   5261.0  0.0679         6.79
2    16874594  162754  66599.0  15919.0  0.2390        23.90
3    16874594  170392  15054.0   5261.0  0.3495        34.95
4    16874594  172888  15081.0   5162.0  0.3423        34.23
(32225162, 2)
(32225162, 6)
(1510415, 6)


In [24]:
# clicks_train, click_test 각각의 ad_id의 unique 값 비교
train_ad = clicks_train['ad_id'].unique()
test_ad = clicks_test['ad_id'].unique()
# print(train_ad.shape) # clicks_train의 ad_id unique 개수 = 478950
# print(test_ad.shape) # clicks_test의 ad_id unique 개수 = 381385
inter = np.intersect1d(test_ad, train_ad) # 둘의 교집합 inter
# print(inter.shape) # clicks_train, clicks_test 공통으로 가지고 있는 unique한 ad_id의 개수 = 316035

# clicks_train, click_test 전체 ad_id 비교 (unique 아니라 all)
train_ad = clicks_train['ad_id']
test_ad = clicks_test['ad_id']
inter = test_ad[test_ad.isin(train_ad)]
print(inter.shape) # 둘이 겹치는 ad_id 개수가 31116698

(31116698,)


In [25]:
# test set에 있는 ad 중에 train set에도 있는 비율은? 
316035/381385*100 # unique 기준 82.8%
31116698/32255162*100 # 전체 기준 96%

96.47044401761181

In [10]:
# 아까 merge한 데이터를 ctr 내림차순으로 정렬
click_test_with_ad_info_desc = click_test_with_ad_info.sort_values('ctr', ascending=False)
ready_for_submission = click_test_with_ad_info_desc[['display_id', 'ad_id', 'ctr']]
print(ready_for_submission.head())

          display_id   ad_id  ctr
13286893    19447550  436833  1.0
13253671    19441198  436833  1.0
5540386     17949678  393472  1.0
19996662    20752263  395356  1.0
25203137    21758924  156796  1.0


In [12]:
# clicks_test에서 'display_id', 'ad_id', 'ctr' 세 컬럼을 갖는 데이터를 'dis_ad_ctr.csv'로 저장
ready_for_submission.to_csv('dis_ad_ctr.csv', index = False)
!gsutil cp 'dis_ad_ctr.csv' 'gs://capstone-01/dis_ad_ctr.csv'



Updates are available for some Cloud SDK components.  To install them,
please run:
  $ gcloud components update

Copying file://dis_ad_ctr.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [1 files][685.5 MiB/685.5 MiB]                                                
Operation completed over 1 objects/685.5 MiB.                

# display_id 별로 ad_id 묶어서 최종 submission 파일 만들자

In [11]:
# clicks_test에서 display_id 별로 ad_id를 묶고(groupby), 그 ad_id들이 담긴 list를 한 컬럼 만들어서 ready_for_submission_group에 저장
ready_for_submission_group = ready_for_submission.groupby('display_id')['ad_id'].apply(list)
print(type(ready_for_submission_group))

<class 'pandas.core.series.Series'>


In [34]:
# pandas series를 dataframe으로 변경
ready_for_submission_group_new = ready_for_submission_group.to_frame()
print(type(ready_for_submission_group_new))
# row name으로 들어가 있는 display_id를 열로 추가
ready_for_submission_group_new.reset_index(level=['display_id'], inplace=True)
print(type(ready_for_submission_group_new))

# csv 저장 전 데이터 살펴보기
print(ready_for_submission_group_new.head(10))

# display_id와 그에 해당하는 ad_id들이 CTR 내림차순으로 정렬된 리스트를 가지고 있는 ready_for_submission_group_new를 'dis_several_ads.csv'로 저장
ready_for_submission_group_new.to_csv('dis_several_ads.csv', index = False)
!gsutil cp 'dis_several_ads.csv' 'gs://capstone-01/dis_several_ads.csv'

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
   display_id                                              ad_id
0    16874594    [170392, 172888, 162754, 150083, 66758, 180797]
1    16874595                              [8846, 143982, 30609]
2    16874596  [289915, 11430, 289122, 132820, 57197, 153260,...
3    16874597  [305790, 285834, 143981, 182039, 155945, 18096...
4    16874598                    [145937, 335632, 67292, 250082]
5    16874599    [173130, 91681, 213116, 210516, 296295, 163776]
6    16874600        [30682, 2150, 57591, 70529, 133050, 114836]
7    16874601     [190713, 92003, 129490, 14082, 140942, 118470]
8    16874602           [154918, 281563, 269017, 131316, 268548]
9    16874603                    [163139, 156050, 44460, 140423]
Copying file://dis_several_ads.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. Thi

# ad_id들이 CTR 내림차순으로 잘 정렬되었는지 확인하자

In [33]:
# CTR 내림차순으로 잘 정렬되었나 확인하기
# "dis_ad_ctr.csv"에서 display_id가 16874594인 애들만 뽑아서 봤더니 CTR 내림차순으로 정렬되어 있더라!
ready_for_submission[ready_for_submission['display_id']==16874594]

Unnamed: 0,display_id,ad_id,ctr
3,16874594,170392,0.3495
4,16874594,172888,0.3423
2,16874594,162754,0.239
1,16874594,150083,0.0679
0,16874594,66758,0.0661
5,16874594,180797,0.0288
