# 0.0 Imports

In [44]:
import os

import pandas as pd
import numpy as np
import inflection


## 0.1 Load Data

In [45]:
# Resolve the dataset location relative to the parent of the current working
# directory: <parent of cwd>/data/df_ready.csv.
path = os.path.split(os.getcwd())[0]
path_dataset = os.path.join(path, 'data', 'df_ready.csv')

# Read the CSV into the raw frame `df`.
df = pd.read_csv(path_dataset)

## 0.2 Helper functions

In [46]:
def snakecase(x):
    """Return `x` converted by `inflection.underscore`.

    Thin single-argument wrapper so the conversion can be handed to `map`
    (or a comprehension) when renaming DataFrame columns.

    Parameters
    ----------
    x
        Name to convert, e.g. a column label.

    Returns
    -------
    The value returned by `inflection.underscore(x)`.
    """
    converted = inflection.underscore(x)
    return converted

# 1.0 Data Description

In [47]:
# Work on an independent (deep) copy so the loaded frame `df` is left untouched.
df1 = df.copy(deep=True)

## 1.1 Data dimensions

In [48]:
# Report the frame's shape: row count on the first line, column count on the second.
print(f'Number of rows: {df1.shape[0]}', f'Number of columns: {df1.shape[1]}', sep='\n')


Number of rows: 23151
Number of columns: 32


## 1.2 Data types

In [49]:
# Display the dtype of every column in the working frame.
df1.dtypes

Unnamed: 0           int64
Date_imp            object
Date_imp_d          object
Cluster              int64
Category_name       object
name                object
price              float64
disc_price         float64
merchant            object
condition           object
Disc_percentage    float64
isSale              object
Imp_count            int64
brand               object
p_description       object
currency            object
dateAdded           object
dateSeen            object
dateUpdated         object
imageURLs           object
manufacturer        object
shipping            object
sourceURLs          object
weight              object
Date_imp_d.1        object
Day_n               object
month                int64
month_n             object
day                  int64
Week_Number          int64
Zscore_1           float64
price_std          float64
dtype: object

## 1.3 Check NA

In [50]:
# Count the missing values in each column.
df1.isnull().sum()

Unnamed: 0             0
Date_imp               0
Date_imp_d             0
Cluster                0
Category_name          0
name                   0
price                  0
disc_price             0
merchant               0
condition              0
Disc_percentage        0
isSale                 0
Imp_count              0
brand                  0
p_description          0
currency               0
dateAdded              0
dateSeen               0
dateUpdated            0
imageURLs          22696
manufacturer       10639
shipping           11497
sourceURLs          4930
weight                68
Date_imp_d.1           0
Day_n                  0
month                  0
month_n                0
day                    0
Week_Number            0
Zscore_1             171
price_std              0
dtype: int64

In [51]:
# Display the column labels as loaded, before any column is dropped or renamed.
df1.columns

Index(['Unnamed: 0', 'Date_imp', 'Date_imp_d', 'Cluster', 'Category_name',
       'name', 'price', 'disc_price', 'merchant', 'condition',
       'Disc_percentage', 'isSale', 'Imp_count', 'brand', 'p_description',
       'currency', 'dateAdded', 'dateSeen', 'dateUpdated', 'imageURLs',
       'manufacturer', 'shipping', 'sourceURLs', 'weight', 'Date_imp_d.1',
       'Day_n', 'month', 'month_n', 'day', 'Week_Number', 'Zscore_1',
       'price_std'],
      dtype='object')

In [52]:
# Columns removed from the working frame before the remaining labels are renamed.
# NOTE: re-running this cell on an already-trimmed df1 raises KeyError, because
# the labels no longer exist.
cols_drop = [
    'Unnamed: 0', 'Date_imp', 'Cluster', 'condition', 'Date_imp_d', 'Zscore_1',
    'price_std', 'imageURLs', 'shipping', 'weight', 'currency',
]
df1 = df1.drop(columns=cols_drop)

In [53]:
# Convert every remaining column label with snakecase().
cols_new = [snakecase(label) for label in df1.columns]
df1.columns = cols_new

# 'Date_imp' was dropped above, so 'date_imp' is rebuilt here from the surviving
# 'date_imp_d.1' column; the source column itself is kept in the frame.
df1['date_imp'] = df1['date_imp_d.1']

## 1.5 Change Types

In [54]:
# 'date_imp' was copied from 'date_imp_d.1', which was loaded as an object
# column; parse it into pandas datetimes.
df1['date_imp'] = pd.to_datetime(df1['date_imp'])

## 1.7 Descriptive Statistics

In [55]:
# Split the columns by dtype: float / int64 columns go to num_attributes;
# every other column except datetime64[ns] ones goes to cat_attributes.
numeric_dtypes = ['float', 'int64']
num_attributes = df1.select_dtypes(include=numeric_dtypes)
cat_attributes = df1.select_dtypes(exclude=[*numeric_dtypes, 'datetime64[ns]'])


### 1.7.1 Numerical attributes

In [56]:
# Statistics computed column-wise over num_attributes. The dict order is the
# column order of the summary table.
#   dispersion       - min, max, range, std, skew, kurtosis
#   central tendency - mean, median
summary_stats = {
    'min': np.min,
    'max': np.max,
    'range': lambda col: col.max() - col.min(),
    'mean': np.mean,
    'median': np.median,
    'std': np.std,
    'skew': lambda col: col.skew(),
    'kurtosis': lambda col: col.kurtosis(),
}

# Each statistic becomes a one-row frame (one column per attribute); stacking
# them and transposing yields one row per attribute and one column per statistic.
stat_rows = [num_attributes.apply(func).to_frame().T for func in summary_stats.values()]
m = pd.concat(stat_rows).T.reset_index()
m.columns = ['attributes', *summary_stats]
m

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,price,1.0,10879.95,10878.95,513.037803,199.99,859.091448,4.59516,34.411146
1,disc_price,1.0,10879.95,10878.95,494.104295,199.0,808.571509,4.748349,39.612839
2,disc_percentage,0.0,0.8,0.8,0.016933,0.0,0.077847,5.217632,28.75492
3,imp_count,1.0,31.0,30.0,6.560926,5.0,5.403049,1.286351,1.639569
4,month,1.0,12.0,11.0,7.65103,8.0,2.592787,-0.099417,-0.628859
5,day,1.0,31.0,30.0,15.693879,16.0,9.681413,0.032258,-1.331838
6,week_number,1.0,51.0,50.0,31.34275,32.0,11.316253,-0.26698,-0.648027
