In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns

import gc
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

%matplotlib inline
pd.set_option('max_rows', 10)
pal = sns.color_palette()

## **1. Descrição dos arquivos e estrutura de dados**
Source: https://www.kaggle.com/anokas/talkingdata-adtracking-eda


In [None]:
import subprocess

datapath = '../input/'

# Size of every non-zip entry in the input directory, in decimal megabytes.
print('# File sizes')
for fname in os.listdir(datapath):
    if 'zip' not in fname:
        # os.path.join (rather than string concatenation) keeps this correct
        # even when `datapath` is given without a trailing slash.
        size_mb = os.path.getsize(os.path.join(datapath, fname)) / 1000000
        print(fname.ljust(30) + str(round(size_mb, 2)) + 'MB')

# Line counts come from the external `wc -l` tool; its stdout is echoed as-is.
print('\n# Line count:')
for fname in ['train.csv', 'test.csv', 'test_supplement.csv', 'train_sample.csv']:
    completed = subprocess.run(['wc', '-l', os.path.join(datapath, fname)], stdout=subprocess.PIPE)
    print(completed.stdout.decode('utf-8'), end='', flush=True)

In [None]:
# Load the pre-sampled training file, parsing both timestamp columns as dates.
datapath = '../input/'
sample_path = os.path.join(datapath, 'train_sample.csv')
train_kaggle_sample = pd.read_csv(sample_path, parse_dates=['attributed_time', 'click_time'])
train_kaggle_sample.info()

# Re-type every numeric column as categorical in a single astype call.
numeric_cols = train_kaggle_sample.select_dtypes(include=np.number).columns
train_kaggle_sample = train_kaggle_sample.astype({name: 'category' for name in numeric_cols})

In [None]:
# Compact unsigned dtypes for the columns read from the CSV files.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        }

# Only these columns are kept in the full training frame.
cols = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed']

train_path = os.path.join(datapath, 'train.csv')

print('Loading the training data...')
train = pd.read_csv(train_path, usecols=cols, dtype=dtypes)
print('End loading training data...\n')
# checking types
train.info()

# Small previews (all columns) of the first and last rows of the file.
preview_rows = 10000
train_head = pd.read_csv(train_path, nrows=preview_rows, dtype=dtypes)
# File line 0 is the header and data rows are lines 1..len(train). Skipping
# lines 1..rows_to_skip leaves exactly the last `preview_rows` data rows; the
# previous range end of len(train)-10000 skipped one row too few, so the tail
# preview stopped one row before the end of the file.
rows_to_skip = max(len(train) - preview_rows, 0)
train_tail = pd.read_csv(train_path, skiprows=range(1, rows_to_skip + 1), nrows=preview_rows, dtype=dtypes)
train_head
train_tail

In [None]:
# Read the test set with the same compact dtypes, parsing click timestamps.
print('Loading the test data...')
test_path = os.path.join(datapath, 'test.csv')
test = pd.read_csv(test_path, parse_dates=['click_time'], dtype=dtypes)
print('End loading test data...\n')
test.info()
test

# The head/tail preview frames are not needed past this point; release them.
del train_head, train_tail
gc.collect()

## **2. Descrição simples dos dados e missing values**
Source: https://www.kaggle.com/anokas/talkingdata-adtracking-eda

In [None]:
# Per-column flag: does the column contain at least one missing value?
train.isna().any()
test.isna().any()
train_kaggle_sample.isna().any()

In [None]:
train_kaggle_sample.info()

# Summarise the attribution columns separately for each value of the target.
outcome_cols = ['attributed_time', 'is_attributed']
is_download = train_kaggle_sample.is_attributed == 1
is_no_download = train_kaggle_sample.is_attributed == 0
train_kaggle_sample.loc[is_download, outcome_cols].describe(include='all')
train_kaggle_sample.loc[is_no_download, outcome_cols].describe(include='all')

In [None]:
# Number of rows aggregated per batch.
size = 10000000
all_rows = len(train)
# Ceiling division: the previous `all_rows//size + 1` produced one extra,
# empty batch whenever the row count was an exact multiple of `size`.
num_parts = (all_rows + size - 1) // size

In [None]:
def _downloads_and_clicks(chunk, col):
    """Aggregate `is_attributed` per distinct value of `col` within `chunk`.

    Returns a frame indexed by `col` with two columns: 'sum' (the total of
    `is_attributed`, i.e. the number of downloads) and 'len' (the number of
    rows, i.e. clicks). The reductions are requested by name instead of
    passing the builtins `sum`/`len`; 'size' counts rows exactly like `len`.
    """
    return (chunk.groupby(col)['is_attributed']
                 .agg(['sum', 'size'])
                 .rename(columns={'size': 'len'}))


feature_cols = ['ip', 'app', 'device', 'os', 'channel']

#generate the first batch
chunk = train[0:size]
dfs_gb = [_downloads_and_clicks(chunk, col) for col in feature_cols]

#add remaining batches
for p in range(1, num_parts):
    # A slice that runs past the last row is simply truncated, so the final
    # (shorter) batch needs no special case.
    chunk = train[p*size:(p + 1)*size]

    # Add the batch counts to the running totals. `fill_value=0` treats a key
    # that is present on only one side as contributing zero on the other.
    dfs_gb[:] = [df_gb.add(_downloads_and_clicks(chunk, col), fill_value=0)
                 for df_gb, col in zip(dfs_gb, feature_cols)]

    print("Finalizou chunk {}".format(p))

ip_gb, app_gb, device_gb, os_gb, channel_gb = dfs_gb

In [None]:
def _annotated_log_barplot(ax, labels, values, title, ylabel=None):
    """Draw a log-scale bar chart on `ax` and write each value above its bar.

    `labels` and `values` are parallel sequences (one bar per label). They are
    passed to seaborn by keyword because recent seaborn releases no longer
    accept x/y positionally. Returns `ax`.
    """
    sns.barplot(x=labels, y=values, palette=pal, log=True, ax=ax)
    ax.set(title=title)
    if ylabel is not None:
        ax.set(ylabel=ylabel)
    for patch, value in zip(ax.patches, values):
        x_center = patch.get_x() + patch.get_width() / 2.
        ax.text(x_center, patch.get_height() + 10, value, ha="center")
    return ax


sns.set(font_scale=1.2)
fig = plt.figure(figsize=(24,20))
gs = GridSpec(2, 2)

cols = ['ip', 'app', 'device', 'os', 'channel']
# Distinct values per feature: in train (one row per value in each aggregate),
# in test, and in the union of both (rows of the outer join on the value).
uniques = [len(df) for df in dfs_gb]
uniques_test = [len(test[col].unique()) for col in cols]
uniques_total = [len(df.join(test[col].value_counts(), how='outer')) for df, col in zip(dfs_gb, cols)]

# Top row: train + test combined, spanning both grid columns.
ax0 = _annotated_log_barplot(plt.subplot(gs[0,:]), cols, uniques_total,
                             title='Quantidade de valores únicos por Feature (Train + Test)',
                             ylabel='log(unique)')

# Bottom row: train (left) and test (right) on a shared y axis.
ax1 = _annotated_log_barplot(plt.subplot(gs[1,0]), cols, uniques,
                             title='Quantidade de valores únicos por Feature (Train)')
ax2 = _annotated_log_barplot(plt.subplot(gs[1,1], sharey=ax1), cols, uniques_test,
                             title='Quantidade de valores únicos por Feature (Test)')

In [None]:
def _exclusive_count_barplot(ax, labels, values, title, low_label_lift=0):
    """Draw a log-scale bar chart of `values` on `ax` with a label above each bar.

    The label sits at bar height + 0.1 * (value + 1). When that position is
    below 10 it is raised by `low_label_lift`. x/y are passed to seaborn by
    keyword because recent seaborn releases no longer accept them
    positionally. Returns `ax`.
    """
    sns.barplot(x=labels, y=values, palette=pal, log=True, ax=ax)
    ax.set(title=title)
    for patch, value in zip(ax.patches, values):
        y_text = patch.get_height() + 0.1 * (value + 1)
        if y_text < 10:
            y_text = y_text + low_label_lift
        ax.text(patch.get_x() + patch.get_width() / 2., y_text, value, ha="center")
    return ax


cols = ['ip', 'app', 'device', 'os', 'channel']
# Per feature: values seen only in train, only in test, and in both.
exclusivo_train = [total - n_test for total, n_test in zip(uniques_total, uniques_test)]
exclusivo_test = [total - n_train for total, n_train in zip(uniques_total, uniques)]
both_train_test = [n_train + n_test - total for n_train, n_test, total in zip(uniques, uniques_test, uniques_total)]


def _pct_of_total(counts):
    """Express each per-feature count as a percentage of that feature's total."""
    return [round(100*x/y, 2) for x, y in zip(counts, uniques_total)]


pct_uniques = pd.DataFrame(columns=cols, index=['Interseção Train e Test', 'Exclusivo Train', 'Exclusivo Test'],
                           data=[_pct_of_total(both_train_test),
                                 _pct_of_total(exclusivo_train),
                                 _pct_of_total(exclusivo_test)])

# --- Figure 1: stacked percentage bars, one bar per feature ---
sns.set(font_scale=1.4)
sns.set_style("white")
fig = plt.figure(figsize=(20,10))
ax0 = plt.subplot(111)
ax0 = pct_uniques.T.plot(kind='bar', stacked=True, ax=ax0, legend=True, rot=0)
settings = ax0.set(ylim=(0,115), ylabel='porcentagem de valores exclusivos e compartilhados dos datasets', xlabel='Features', title='Porcentagem de valores exclusivos e compartilhados dos datasets Train e Test por Feature na composição Total dos dados')
# zip stops after len(cols) patches (presumably the bars of the first series,
# one per feature); they are used only for their horizontal position.
for p, v0, v1, v2 in zip(ax0.patches, pct_uniques.iloc[0], pct_uniques.iloc[1], pct_uniques.iloc[2]):
    # Vertical centre of each stacked segment.
    height0 = v0 / 2
    height1 = v1 / 2 + v0
    height2 = v2 / 2 + v0 + v1
    for value, height in zip([v0, v1, v2], [height0, height1, height2]):
        if value > 0:
            text = ax0.text(p.get_x()+p.get_width()/2., height, "{}%".format(value), ha="center")
# Remove the top/right spines without rebinding ax0 (despine returns None).
sns.despine(ax=ax0)

# --- Figure 2: absolute counts of values exclusive to each dataset ---
sns.set(font_scale=1.2)
fig = plt.figure(figsize=(20,10))

ax1 = _exclusive_count_barplot(fig.add_subplot(121), cols, exclusivo_train,
                               title='Quantidade de valores únicos exclusivos de Train')
ax2 = _exclusive_count_barplot(fig.add_subplot(122, sharey=ax1), cols, exclusivo_test,
                               title='Quantidade de valores únicos exclusivos de Test',
                               low_label_lift=16)

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Overall download rate: total downloads over total clicks. The per-app
# aggregate is used for the totals.
download_pct = app_gb['sum'].sum() / app_gb['len'].sum()
target_labels = ['Downloaded (1)', 'Not Downloaded (0)']
target_probs = [download_pct, 1-download_pct]
# x/y are passed by keyword because recent seaborn releases reject positional
# data arguments; the target axes are passed explicitly as well.
ax = sns.barplot(x=target_labels, y=target_probs, palette=pal, ax=ax)
settings = ax.set(xlabel='Target Value', ylabel='Probability', title='Target value distribution')
# Percentage label just above each bar.
for p, value in zip(ax.patches, target_probs):
    height = p.get_height()
    text = ax.text(p.get_x()+p.get_width()/2., height+0.01, '{}%'.format(round(value * 100, 2)), ha="center")

## Relação de cada Feature com Target
### - IP

In [None]:
# Share of each IP's clicks that ended in a download.
ip_gb['ip_pct'] = ip_gb['sum'].div(ip_gb['len'])
# The 100 IPs with the most clicks.
ip_gb.sort_values('len', ascending=False).head(100)


### - App

In [None]:
# Work on a copy so the shared per-app aggregate is left untouched.
app_gb_copy = app_gb.copy()
app_gb_copy['Download_pct'] = app_gb_copy['sum'] / app_gb_copy['len']
# 100 most-clicked apps; after reset_index the row position is the app's rank.
data = app_gb_copy.sort_values(by=['len'], ascending=False)[:100].reset_index().rename(columns={'len':'Count'})

fig, ax_count = plt.subplots(figsize=(10, 10))
ax_count = data.Count.plot(ax=ax_count, logy=True, legend=True)
ax = data.Download_pct.plot(secondary_y=True, ax=ax_count, legend=True)
# The x label goes on the primary axes so that it is actually drawn.
# NOTE(review): the secondary_y plot appears to return the twin axes, whose
# x-axis is hidden; confirm against the installed pandas version.
# The label names what the x axis holds (rank by clicks) rather than the
# 'Target Value' text carried over from the target-distribution chart.
settings = ax_count.set(xlabel='App rank by click count (0 = most clicked)', ylabel='log Count of clicks')
settings = ax.set(ylabel='Probability', title='Conversion Rates over Counts of 100 Most Popular Apps')
plt.show()
