### <font color = '#D8218C'> EXPLORATORY DATA ANALYSIS </font>

#### <font color = '#D8218C'> Imports</font>

In [1]:
import pandas as pd
import numpy  as np
from sqlalchemy import create_engine

##### Data collection

In [2]:
# Open the SQLite database through a path relative to the notebook's working
# directory, so the notebook does not depend on a machine-specific location.
# echo=False keeps SQLAlchemy from logging every emitted SQL statement.
conn = create_engine('sqlite:///database_hm.sqlite', echo=False)

In [3]:
# Fetch every column of every row stored in the `showroom` table.
query = (
    "\n"
    "    SELECT * FROM showroom\n"
)

In [4]:
# Run the query and load the whole result set into a DataFrame; later cells
# work on a copy, so this frame stays as the raw snapshot.
df_raw = pd.read_sql(sql=query, con=conn)

#### <font color = '#D8218C'> Data Description</font>

In [5]:
# Work on an independent (deep) copy so df_raw is never modified.
df01 = df_raw.copy(deep=True)

##### Data Dimension

In [6]:
# Report the size of the dataset: shape is (rows, columns).
n_rows, n_cols = df01.shape
print(f'Number of rows: {n_rows}')
print(f'Number of columns: {n_cols}')

Number of rows: 597
Number of columns: 12


##### Data types

In [7]:
# Show the dtype pandas inferred for each column, to spot columns that still
# need an explicit conversion.
df01.dtypes

product_id           object
style_id             object
article_id           object
product_name         object
product_colors       object
fit                  object
product_price       float64
cotton              float64
polyester           float64
spandex             float64
elastomultiester    float64
scrapy_datetime      object
dtype: object

In [8]:
# scrapy_datetime is loaded as object dtype; convert it to pandas datetimes.
df01['scrapy_datetime'] = df01['scrapy_datetime'].pipe(pd.to_datetime)

##### Missing values

In [9]:
# Count the missing (NaN/None) entries in each column.
df01.isna().sum()

product_id          0
style_id            0
article_id          0
product_name        0
product_colors      0
fit                 0
product_price       0
cotton              0
polyester           0
spandex             0
elastomultiester    0
scrapy_datetime     0
dtype: int64

##### Data description

In [10]:
# Split the columns by dtype: the numeric ones feed the summary statistics;
# everything that is neither numeric nor a timestamp is treated as categorical.
numeric_dtypes = ['int64', 'float64']
num_attributes = df01.select_dtypes(include=numeric_dtypes)
cat_attributes = df01.select_dtypes(exclude=numeric_dtypes + ['datetime64[ns]'])

In [11]:
# Descriptive statistics for the numeric attributes.
#
# Each reducer runs once over the whole frame and returns a Series indexed by
# column name, so the summary table is assembled directly instead of building,
# concatenating and transposing one single-row frame per statistic.
numeric_summary = {
    # extremes
    'min': num_attributes.min(),
    'max': num_attributes.max(),
    'range': num_attributes.max() - num_attributes.min(),
    # central tendency
    'mean': num_attributes.mean(),
    # DataFrame.median skips NaN like the other reducers here, whereas
    # np.median would return NaN for any column containing a missing value.
    'median': num_attributes.median(),
    # dispersion and shape
    # ddof=0 keeps the population standard deviation that np.std computes
    # (the pandas default is the sample standard deviation, ddof=1).
    'std': num_attributes.std(ddof=0),
    'skew': num_attributes.skew(),
    'kurtosis': num_attributes.kurtosis(),
}

# One row per attribute, one column per statistic, in the order defined above.
m1 = pd.DataFrame(numeric_summary).rename_axis('attributes').reset_index()
m1

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,product_price,7.99,49.99,42.0,27.988325,24.99,10.395931,0.46648,-0.814279
1,cotton,0.72,1.0,0.28,0.97206,0.99,0.055579,-2.753831,6.743137
2,polyester,0.0,1.0,1.0,0.324573,0.0,0.356893,0.303936,-1.635348
3,spandex,0.0,0.02,0.02,0.011441,0.01,0.006892,-0.196682,-0.900533
4,elastomultiester,0.0,0.09,0.09,0.005042,0.0,0.020238,3.788648,12.454446
