# 0.0. Introducao

## 0.1. Bibliotecas

In [1]:
import pandas as pd
import numpy  as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

## 0.2. Loading dos Dados

In [2]:
# Read the two CSV inputs; both paths are relative to the current working directory.
# low_memory=True makes pandas parse each file in chunks.
df = pd.read_csv( 'train_users_2.csv', low_memory = True )
df_sessions = pd.read_csv( 'sessions.csv', low_memory= True )

## 0.3. Funcoes Auxiliares

In [3]:
# get_stats
def get_stats(x):
    """Build a descriptive-statistics table with one row per column of ``x``.

    Every statistic is computed with a vectorised pandas reduction, so missing
    values are skipped consistently across all columns of the result. Mixing
    NaN-propagating reducers (builtin ``min``/``max``, ``np.median``) with
    NaN-skipping ones would make 'min'/'max' disagree with 'range'.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'atributos' (column name of ``x``), 'min', 'max',
        'range', 'media', 'mediana', 'std', 'skew', 'kurtosis'. All statistic
        columns are float64.
    """
    lowest = x.min()
    highest = x.max()

    # One Series per statistic, each indexed by the column names of x.
    estatisticas = [
        lowest,               # min
        highest,              # max
        highest - lowest,     # range
        x.mean(),             # media
        x.median(),           # mediana
        x.std( ddof = 0 ),    # std - population std, what np.std computes by default
        x.skew(),             # skew
        x.kurtosis(),         # kurtosis
    ]
    nomes = ['min', 'max', 'range', 'media', 'mediana', 'std', 'skew', 'kurtosis']

    # Side-by-side concat keeps the column order of x as the row order.
    # Cast to float so integer inputs produce the same float table as before.
    ct = pd.concat( estatisticas, axis = 1, keys = nomes ).astype( 'float64' ).reset_index()
    ct.columns = ['atributos'] + nomes

    return ct

# 1.0. Descricao dos Dados

In [4]:
# df1 is an independent copy of df, so later column edits on df1 leave df untouched.
# df1_sessions is only a second name for the same object as df_sessions (no copy is made).
df1 = df.copy()
df1_sessions = df_sessions

## 1.1. Dimensionalidade

In [5]:
# Report the size of the users frame: rows first, then columns.
print( f'Número de linhas: {df1.shape[0]}' )
print( f'Número de colunas: {df1.shape[1]}' )

Número de linhas: 213451
Número de colunas: 16


In [6]:
# Report the size of the sessions frame: rows first, then columns.
print( f'Número de linhas: {df1_sessions.shape[0]}' )
print( f'Número de colunas: {df1_sessions.shape[1]}' )

Número de linhas: 1056773
Número de colunas: 6


## 1.2. Data type

In [7]:
# Show the dtype pandas inferred for each column of df1.
df1.dtypes

id                          object
date_account_created        object
timestamp_first_active       int64
date_first_booking          object
gender                      object
age                        float64
signup_method               object
signup_flow                  int64
language                    object
affiliate_channel           object
affiliate_provider          object
first_affiliate_tracked     object
signup_app                  object
first_device_type           object
first_browser               object
country_destination         object
dtype: object

In [8]:
# Show the dtype pandas inferred for each column of df1_sessions.
df1_sessions.dtypes

user_id           object
action            object
action_type       object
action_detail     object
device_type       object
secs_elapsed     float64
dtype: object

In [9]:
# Relative frequency of each country_destination value among rows whose age is missing.
aux = df1.loc[df1['age'].isna()]
aux['country_destination'].value_counts( normalize=True )

NDF      0.768428
US       0.156529
other    0.028617
FR       0.014888
IT       0.009081
GB       0.006251
ES       0.006171
CA       0.003989
DE       0.002387
NL       0.001818
AU       0.001171
PT       0.000671
Name: country_destination, dtype: float64

## 1.2. Check Na

### 1.2.1. Check df1

In [10]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True entries).
df1.isna().mean()

id                         0.000000
date_account_created       0.000000
timestamp_first_active     0.000000
date_first_booking         0.583473
gender                     0.000000
age                        0.412226
signup_method              0.000000
signup_flow                0.000000
language                   0.000000
affiliate_channel          0.000000
affiliate_provider         0.000000
first_affiliate_tracked    0.028414
signup_app                 0.000000
first_device_type          0.000000
first_browser              0.000000
country_destination        0.000000
dtype: float64

In [11]:
# Missing-value treatment for the users frame.
# A blanket df1.dropna() is not used: date_first_booking and age are missing in a
# large share of rows (see the per-column fractions above), so those two columns
# are filled and only rows missing first_affiliate_tracked are dropped.

# date_first_booking: fill gaps with the latest booking date present in the data
date_first_booking_max = pd.to_datetime( df1['date_first_booking'] ).max().strftime( '%Y-%m-%d' )
df1['date_first_booking'] = df1['date_first_booking'].fillna( date_first_booking_max )

# age: fill gaps with the mean age truncated to an integer
avg_age = df1['age'].mean().astype( int )
df1['age'] = df1['age'].fillna( avg_age )

# first_affiliate_tracked: drop the rows where it is missing.
# .copy() makes df1 an independent frame, so later column assignments do not act
# on a slice of the previous df1 (the SettingWithCopy situation).
df1 = df1[~df1['first_affiliate_tracked'].isna()].copy()

df1.shape

(207386, 16)

### 1.2.2. Check df1_sessions

In [12]:
# Fraction of missing values per column (the mean of a boolean mask is the share of True entries).
df1_sessions.isna().mean()

user_id          0.003225
action           0.007399
action_type      0.106714
action_detail    0.106714
device_type      0.000000
secs_elapsed     0.012966
dtype: float64

In [13]:
# Keep only the session rows that have a value in every field listed below.
# The share of missing values reported for each field is noted beside it.
df1_sessions = df1_sessions.dropna(
    subset = [
        'user_id',        # 0.3%
        'action',         # 0.7%
        'action_type',    # 10.6%
        'action_detail',  # 10.6%
        'secs_elapsed',   # 1.2%
    ]
)

## 1.3. Change Types

In [14]:
# Convert columns to their proper dtypes.
# Each conversion reads from df1, not from the raw df: converting the raw
# date_first_booking column would discard the fill applied to df1 earlier and
# bring the missing values back as NaT.

# date_account_created
df1['date_account_created'] = pd.to_datetime( df1['date_account_created'] )

# timestamp_first_active: integer laid out as YYYYMMDDHHMMSS
df1['timestamp_first_active'] = pd.to_datetime( df1['timestamp_first_active'], format = "%Y%m%d%H%M%S" )

# date_first_booking
df1['date_first_booking'] = pd.to_datetime( df1['date_first_booking'] )

# age
df1['age'] = df1['age'].astype( int )

## 1.4. Check Balanceamento dos Dados

In [15]:
# Relative frequency of each country_destination value in df1.
df1['country_destination'].value_counts( normalize=True )

NDF      0.579673
US       0.294461
other    0.048128
FR       0.023623
IT       0.013458
GB       0.011066
ES       0.010671
CA       0.006707
DE       0.005020
NL       0.003621
AU       0.002541
PT       0.001032
Name: country_destination, dtype: float64

## 1.5. Estatistica Descritiva

In [16]:
# Split df1's columns by dtype.
# NOTE: atributos_cat is "every column that is not int/float64", so it also
# contains whatever datetime64[ns] columns atributos_time selects.
atributos_num = df1.select_dtypes( include = [int, 'float64'] )
atributos_cat = df1.select_dtypes( exclude = [int, 'float64'] )
atributos_time = df1.select_dtypes( include = ['datetime64[ns]'] )

In [17]:
# Split df1_sessions' columns into float64 columns and everything else.
atributos_num_sessions = df1_sessions.select_dtypes( include = ['float64'] )
atributos_cat_sessions = df1_sessions.select_dtypes( exclude = ['float64'] )

### 1.5.1. Atributos Numericos

#### 1.5.1.1. Users

In [18]:
# Descriptive statistics for the int/float64 columns of df1.
get_stats( atributos_num )

Unnamed: 0,atributos,min,max,range,media,mediana,std,skew,kurtosis
0,age,1.0,2014.0,2013.0,49.183142,49.0,118.421689,16.33382,267.655434
1,signup_flow,0.0,25.0,25.0,3.15149,0.0,7.543915,2.287158,3.567794


#### 1.5.1.2. Sessions

In [19]:
# Descriptive statistics for the float64 columns of df1_sessions.
get_stats( atributos_num_sessions )

Unnamed: 0,atributos,min,max,range,media,mediana,std,skew,kurtosis
0,secs_elapsed,0.0,1799646.0,1799646.0,20072.882262,1373.0,89371.933206,11.100057,153.74554
