# 0.0. Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import HTML

## 0.1. Helper Functions

In [57]:
def descriptive_statistics(num_attr):
    """Summarise each numeric column with central-tendency and dispersion metrics.

    Parameters
    ----------
    num_attr : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the columns ``attributes`` (the input
        column name), ``min``, ``max``, ``range``, ``mean``, ``median``,
        ``std``, ``skew`` and ``kurtosis``.

    Notes
    -----
    Every statistic uses the pandas reduction of the same name, so missing
    values are skipped consistently across all metrics. The builtin
    ``min``/``max`` and ``np.median`` are deliberately avoided: they do not
    skip NaN, and the builtin comparisons give position-dependent results
    when NaN is present.
    """
    # Dispersion bounds are computed once and reused for the range.
    lowest = num_attr.min()
    highest = num_attr.max()

    # Each value is a Series indexed by column name; the dict keys fix the
    # column order of the resulting summary table.
    summary = pd.DataFrame({
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        'mean': num_attr.mean(),
        'median': num_attr.median(),
        'std': num_attr.std(),
        'skew': num_attr.skew(),
        'kurtosis': num_attr.kurtosis(),
    })

    # Move the column names out of the index into an 'attributes' column.
    return summary.rename_axis('attributes').reset_index()


def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'ggplot')
    plt.rcParams['figure.figsize'] = [24, 9]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()
    
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2. Load Dataset

In [14]:
# Read the raw CSV (decoded as latin1) and discard the extra 'Unnamed: 8'
# column in a single chained expression.
df_raw = (
    pd.read_csv('../data/raw/Ecommerce.csv', encoding='latin1')
      .drop(columns=['Unnamed: 8'])
)

# 1.0. Data Description

In [15]:
# Work on an independent deep copy so that df_raw itself is not modified.
df1 = df_raw.copy(deep=True)

## 1.1. Rename Columns

**Attribute Description**


* **InvoiceNo** Invoice number (A 6-digit integral number uniquely assigned to each transaction)
* **StockCode** Product (item) code
* **Description** Product (item) name
* **Quantity** The quantities of each product (item) per transaction
* **InvoiceDate** The day when each transaction was generated
* **UnitPrice** Unit price (Product price per unit)
* **CustomerID** Customer number (Unique ID assigned to each customer)
* **Country** Country name (The name of the country where each customer resides)

In [17]:
# snake_case labels, assigned positionally: the order here must match the
# column order of df1.
cols_new = [
    'invoice_no',
    'stock_code',
    'description',
    'quantity',
    'invoice_date',
    'unit_price',
    'customer_id',
    'country',
]
df1.columns = cols_new

## 1.2. Data Dimensions

In [22]:
# df1.shape is (rows, columns); unpack it once and report both.
n_rows, n_cols = df1.shape
print(f'Number of Rows: {n_rows}')
print(f'Number of Columns/Features: {n_cols}')

Number of Rows: 541909
Number of Columns/Features: 8


## 1.3. Data Types

In [18]:
# Inspect the dtype of each column.
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4. Check NA

In [24]:
# Fraction of missing values per column (mean of the boolean NA mask).
df1.isna().mean()

invoice_no      0.000000
stock_code      0.000000
description     0.002683
quantity        0.000000
invoice_date    0.000000
unit_price      0.000000
customer_id     0.249267
country         0.000000
dtype: float64

## 1.5. Drop NA

In [35]:
# Drop rows that lack a description or a customer id.
df1 = df1.dropna(subset=['description', 'customer_id'])

# Share of the raw rows that was dropped, rounded to two decimals. Formatting
# with one fixed decimal keeps float noise from the multiplication out of the
# message (e.g. round(0.07, 2) * 100 evaluates to 7.000000000000001).
removed_fraction = 1 - (df1.shape[0] / df_raw.shape[0])
print('Removed data: {:.1f}%'.format(round(removed_fraction, 2) * 100))

Removed data: 25.0%


## 1.6. Change Dtypes

In [41]:
# Parse the invoice timestamps and store customer ids as integers. `assign`
# builds a new frame instead of writing columns into the frame returned by
# `dropna`, and the explicit 'int64' keeps the dtype platform-independent
# (a bare `int` maps to the platform's default integer width in NumPy), so the
# column still matches dtype filters that name 'int64'.
df1 = df1.assign(
    invoice_date=pd.to_datetime(df1['invoice_date']),
    customer_id=df1['customer_id'].astype('int64'),
)

## 1.7. Descriptive Statistics

In [43]:
# Split the columns by dtype. 'number' selects every numeric dtype, so integer
# and float columns are kept regardless of their bit width.
num_attr = df1.select_dtypes(include='number')
cat_attr = df1.select_dtypes(include=['object'])

### 1.7.1. Numerical Attributes



In [58]:
# Summary table (one row per column) for the numeric columns in num_attr.
descriptive_statistics(num_attr)

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,quantity,-80995.0,80995.0,161990.0,12.061303,5.0,248.69337,0.182663,94317.563673
1,unit_price,0.0,38970.0,38970.0,3.460471,1.95,69.315162,452.219019,246924.542988
2,customer_id,12346.0,18287.0,5941.0,15287.69057,15152.0,1713.600303,0.029835,-1.179982


# 2.0. Feature Engineering

# 3.0. Data Filtering

# 4.0. Exploratory Data Analysis

# 5.0. Data Preparation

# 6.0. Feature Selection

# 7.0. Hyperparameter Fine-Tuning

# 8.0. Model Training

# 9.0. Cluster Analysis

# 10.0. Deploy to Production