# Loading Prerequisites

In [None]:
# Install the pinned version of pandas_profiling (uncomment to run).
# `%pip` installs into the environment of the running kernel.
#%pip install pandas_profiling==3.3.0

In [None]:
## Importing python packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil

from itertools import product
from pandas_profiling import ProfileReport

In [None]:
## Some configuration

# Render matplotlib figures inline in the notebook output
%matplotlib inline

# for better resolution plots
%config InlineBackend.figure_format = 'retina'

# Set seaborn style (seaborn defaults are applied to all following plots)
sns.set()

# Loading Data

In [None]:
## Read the course dataset (remote csv file) into a dataframe
data_path = (
    "https://raw.githubusercontent.com/fpontejos/DMDM_2223/main/data/datamining.csv"
)
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
## OR 
## to load from csv on your Google Drive

## Allow Colab to see Google Drive files

## Uncomment these two lines:

#from google.colab import drive
#drive.mount('/content/drive')


## Load csv file into a dataframe 
#data_path = "/content/drive/MyDrive/DMDM_Practical/data/datamining.csv"
#df = pd.read_csv(data_path)

## Refer to slides [Extra Content]

## Metadata
- *id* - The unique identifier of the customer
- *age* - The year of birth of the customer
- *income* - The income of the customer
- *frq* - Frequency: number of purchases made by the customer
- *rcn* - Recency: number of days since last customer purchase
- *mnt* - Monetary: amount of € spent by the customer in purchases
- *clothes* - Percentage in monetary value spent by customer on clothes items
- *kitchen* - Percentage in monetary value spent by customer on kitchen items
- *small_appliances* - Percentage in monetary value spent by customer on small_appliances items
- *toys* - Percentage in monetary value spent by customer on toys items
- *house_keeping* - Percentage in monetary value spent by customer on house_keeping items
- *dependents* - Binary. Whether or not the customer has dependents
- *per_net_purchase* - Percentage of purchases made online
- *education* - Education level of the customer
- *status* - Marital status of the customer
- *gender* - Gender of the customer
- *description* - Last customer's recommendation description

# Initial Analysis

In [None]:
## Look at the first 10 rows of the data
## (the last expression of a cell is rendered as its output)
df.head(10)

In [None]:
## Look at the data type pandas assigned to each column
df.dtypes

In [None]:
## Count of missing values in each column
df.isna().sum()

In [None]:
## Check for duplicate rows: number of rows that repeat an earlier row
df.duplicated().sum()

In [None]:
## Check descriptive statistics: Numeric features only
## (transposed so that each feature is a row)
df.describe().T 

In [None]:
## Check descriptive statistics: All features
## (transposed so that each feature is a row)
df.describe(include="all").T 

In [None]:
## Get unique values in a column (here: the distinct education levels)
df['education'].unique()

In [None]:
## Fix wrong data types
# `dependents` is a binary flag: store it with pandas' nullable boolean dtype
df["dependents"] = df["dependents"].astype("boolean")

In [None]:
## TIP
# Different ways to access a column
# (only the last expression of a cell is shown as its output)

df['age']        # key (bracket) access
df.age           # attribute access
df.loc[:,'age']  # label-based access: all rows, column 'age'

In [None]:
# Split features into metric/numeric vs non-metric/non-numeric

# Columns handled as categorical in the rest of the notebook
non_metric_features = ["education", "status", "gender", "dependents", "description"]
# Every remaining column is treated as metric.
# NOTE(review): per the metadata this presumably also keeps `id` in the list - confirm that is intended.
metric_features = df.columns.drop(non_metric_features).to_list()


# Visual Exploration

Matplotlib tutorials: https://matplotlib.org/3.3.1/tutorials/index.html

Matplotlib gallery: https://matplotlib.org/3.3.1/tutorials/introductory/sample_plots.html#sphx-glr-tutorials-introductory-sample-plots-py

Seaborn tutorials: https://seaborn.pydata.org/tutorial.html


Seaborn gallery: https://seaborn.pydata.org/examples/index.html

**More examples for visualizing distributions:**
- http://seaborn.pydata.org/tutorial/distributions.html

## Pyplot-style vs Object-Oriented-style
- Explicitly create figures and axes, and call methods on them (the "object-oriented (OO) style").
- Rely on pyplot to automatically create and manage the figures and axes, and use pyplot functions for plotting.

## Univariate Distribution

In [None]:
# Histogram of a single metric variable (pyplot style)
age_values = df["age"]
plt.hist(age_values, bins=10)  # try different numbers of bins
plt.title("Histogram of Age Variable")
plt.show()

In [None]:
# Box plot of a single metric variable
sns.boxplot(data=df, y="age")
plt.title("Box Plot of Age Variable")
plt.show()

In [None]:
## Define a function that plots multiple histograms

def plot_multiple_histograms(data, feats, title="Numeric Variables' Histograms"):
    """Draw one histogram per feature on a two-row grid of subplots.

    Parameters
    ----------
    data : object indexable by feature name (e.g. a pandas DataFrame)
        ``data[feat]`` is passed to ``ax.hist`` for every feature.
    feats : sequence of str
        Names of the features to plot; each one gets its own subplot.
    title : str, optional
        Centered title of the whole figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``feats`` is empty, since there would be nothing to plot.
    """
    if len(feats) == 0:
        raise ValueError("feats must contain at least one feature name")

    # Prepare figure: two rows and enough columns to hold every feature
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(20, 11))
    flat_axes = axes.flatten()

    # Plot data: pair each axes object with one feature
    # (hint: use ax.hist() instead of plt.hist())
    for ax, feat in zip(flat_axes, feats):  # notice the zip() function and flatten() method
        ax.hist(data[feat])
        ax.set_title(feat)

    # With an odd number of features the last cell of the grid is not used;
    # hide it so that no empty frame is drawn
    for unused_ax in flat_axes[len(feats):]:
        unused_ax.set_visible(False)

    # Layout: add a centered title to this figure
    fig.suptitle(title)

    plt.show()


## Define a function that plots multiple box plots

def plot_multiple_boxplots(data, feats, title="Numeric Variables' Box Plots"):
    """Draw one horizontal box plot per feature on a two-row grid of subplots.

    Parameters
    ----------
    data : object indexable by feature name (e.g. a pandas DataFrame)
        ``data[feat]`` is passed to ``sns.boxplot`` as ``x`` for every feature.
    feats : sequence of str
        Names of the features to plot; each one gets its own subplot.
    title : str, optional
        Centered title of the whole figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``feats`` is empty, since there would be nothing to plot.
    """
    if len(feats) == 0:
        raise ValueError("feats must contain at least one feature name")

    # Prepare figure: two rows and enough columns to hold every feature
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(20, 11))
    flat_axes = axes.flatten()

    # Plot data: pair each axes object with one feature and
    # tell seaborn which axes to draw on through the `ax` argument
    for ax, feat in zip(flat_axes, feats):  # notice the zip() function and flatten() method
        sns.boxplot(x=data[feat], ax=ax)
        ax.set_title(feat)

    # With an odd number of features the last cell of the grid is not used;
    # hide it so that no empty frame is drawn
    for unused_ax in flat_axes[len(feats):]:
        unused_ax.set_visible(False)

    # Layout: add a centered title to this figure
    fig.suptitle(title)

    plt.show()




In [None]:
# All Numeric Variables' Histograms in one figure
sns.set()  # apply the seaborn default style before plotting

plot_multiple_histograms(df, metric_features)



In [None]:
# All Numeric Variables' Box Plots in one figure
sns.set()  # apply the seaborn default style before plotting

plot_multiple_boxplots(df, metric_features)


### Insights:
- univariate distributions
- potential univariate outliers

--------------------------------------

### During our Exploratory Data Analysis (EDA), we must also account for:
- Coherence check
- Outliers
- Missing values
- Feature Engineering

--------------------------------------

### Depending on the context, various steps must be considered when performing Data Preprocessing. The most relevant steps are the following:
- Coherence check (find inconsistent values, missing values, outliers and any other problem you may find in your dataset)
- Data editing (fix inconsistent values)
- Data cleansing (drop observations - Outlier removal and removal of inconsistent values and/or features)
- Data wrangling (feature extraction/engineering and transformation)
- Data reduction (reducing the dimensionality of a dataset, producing summary statistics, reducing the number of records in a dataset)

## Pairwise Relationship of Numeric Variables

In [None]:
# Scatter plot of two metric variables
x_feat, y_feat = "age", "income"

plt.scatter(df[x_feat], df[y_feat], edgecolors="white")
plt.xlabel(x_feat)
plt.ylabel(y_feat)
plt.show()

In [None]:
# Joint plot (scatter plus marginal distributions) with default formatting
joint_args = dict(data=df, x="house_keeping", y="frq")
sns.jointplot(**joint_args)
plt.show()


In [None]:
# Same pair of variables, customized formatting: "ticks" style, hexagonal bins, red color
sns.set(style="ticks")
hex_args = dict(data=df, x="house_keeping", y="frq", kind="hex", color="red")
sns.jointplot(**hex_args)
plt.show()

In [None]:
# Pairwise relationship of all numerical variables
sns.set()

# Build the pair plot (histograms on the diagonal)
numeric_df = df[metric_features]
sns.pairplot(numeric_df, diag_kind="hist")

# Layout: leave room at the top of the current figure and add the title there
pair_fig = plt.gcf()
pair_fig.subplots_adjust(top=0.95)
pair_fig.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)

plt.show()

### Insights:
- possible bivariate relationships
- potential bivariate outliers
- univariate distributions (diagonal)

## Categorical Features Frequencies

In [None]:
# Bar plot with the frequency of each level of a single non-metric variable
sns.set()
sns.countplot(data=df, x="education")
plt.show()

In [None]:
# Same bar plot, drawn with a single custom color
sns.countplot(data=df, x="education", color='#007acc')
plt.show()


In [None]:
## Define a function that plots multiple frequency (count) plots

def plot_categorical_frequencies(data, feats, 
                             title="Categorical Variables' Frequencies"):
    """Draw one count plot per categorical feature on a two-row grid of subplots.

    Parameters
    ----------
    data : pandas DataFrame
        Source of the values; ``data[feat]`` is cast to ``object`` and passed
        to ``sns.countplot`` for every feature.
    feats : sequence of str
        Names of the features to plot; each one gets its own subplot.
    title : str, optional
        Centered title of the whole figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``feats`` is empty, since there would be nothing to plot.
    """
    if len(feats) == 0:
        raise ValueError("feats must contain at least one feature name")

    # Prepare figure: two rows and enough columns to hold every feature
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(20, 11))
    flat_axes = axes.flatten()

    # Plot data: pair each axes object with one feature.
    # The values are read from the `data` argument (not from a global
    # dataframe) and cast to object before being handed to seaborn.
    for ax, feat in zip(flat_axes, feats):  # notice the zip() function and flatten() method
        sns.countplot(x=data[feat].astype(object), ax=ax, color='#007acc')

    # With an odd number of features the last cell of the grid is not used;
    # hide it so that no empty frame is drawn
    for unused_ax in flat_axes[len(feats):]:
        unused_ax.set_visible(False)

    # Layout: add a centered title to this figure
    fig.suptitle(title)

    plt.show()



In [None]:
# Frequencies of all non-metric variables in one figure
plot_categorical_frequencies(df, non_metric_features)

## Comparing Two Categorical Features

In [None]:
# Stacked bar chart: one bar per `description` value, split by `dependents`
sns.set()

# Note: `df_counts` ends up holding the object returned by .plot.bar()
# (the plot), not the table of counts built by the chain
df_counts = (
    df.groupby(['description', 'dependents'])
    .size()
    .unstack()
    .plot.bar(stacked=True)
)

## Comparing Categorical vs Continuous Variables

In [None]:
# Pairwise relationship of numerical variables, colored by gender
sns.set()

# Build the pair plot: metric features plus the column used as hue
pairplot_columns = metric_features + ['gender']
sns.pairplot(df[pairplot_columns], diag_kind="hist", hue='gender')

# Layout: leave room at the top of the current figure and add the title there
pair_fig = plt.gcf()
pair_fig.subplots_adjust(top=0.95)
pair_fig.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)

plt.show()

In [None]:
# Grid of point plots: one row per metric feature, one column per education level.
# Inside each subplot: x axis = dependents, one line per gender.

# notice we drop missing values in order to not plot it as a distinct value
educ_vals = df.education.dropna().unique()

# squeeze=False keeps `axes` two-dimensional even when there is a single row or
# column, so the axes[0, :], axes[:, 0] and axes[-1, :] selections below always work
fig, axes = plt.subplots(len(metric_features), len(educ_vals), figsize=(25,18),
                         sharex=True, sharey="row", squeeze=False)


for ax, (feat, educ_deg) in zip(axes.flatten(), product(metric_features, educ_vals)):
    # get the data for each subplot: only the customers with this education level
    data = df.loc[df.education == educ_deg,:].copy()
    data['dependents'] = data['dependents'].astype(object)

    # we are distinguishing points according to the variable "gender" (hue);
    # "dependents" defines the positions on the x axis.
    # `height` and `aspect` are not pointplot parameters (the figure size is
    # already fixed by plt.subplots above), so they are not passed here.
    sns.pointplot(x="dependents", y=feat, hue="gender", hue_order=["F", "M"], data=data,
                  capsize=.2, ax=ax)

    # remove the typical default y and x labels and legend of each axis
    ax.set_xlabel('')
    ax.set_ylabel('')
    subplot_legend = ax.get_legend()
    if subplot_legend is not None:
        subplot_legend.remove()

# set columns' titles (education)
for ax, label in zip(axes[0,:], educ_vals):
    ax.set_title(label, fontsize=13)

# set metric names
for ax, label in zip(axes[:,0], metric_features):
    ax.set_ylabel(label, fontsize=13)

# set x axis label (dependents)
for ax in axes[-1,:]:
    ax.set_xlabel('dependents', fontsize=13)

# Set legend (gender): a single legend for the whole figure,
# built from the labelled artists of the first subplot
handles, _ = axes[0,0].get_legend_handles_labels()
fig.legend(handles, ["F","M"], loc=(0.07,0.94), title="Gender", title_fontsize=13)

# set figure
plt.subplots_adjust(top=0.92)
plt.suptitle("Three-way ANOVA for each metric variable", fontsize=25)

plt.show()

## Metric Variables Correlation Matrix

In [None]:
# Prepare figure
fig = plt.figure(figsize=(10, 8))

# Pearson correlation matrix of the metric features, rounded to 2 decimal places
corr = df[metric_features].corr(method="pearson").round(2)

# Annotation matrix: only correlations with absolute value >= 0.5 are written
# in the plot; every other cell gets an empty string
mask_annot = np.abs(corr.values) >= 0.5
empty_labels = np.full(corr.shape, "")
annot = np.where(mask_annot, corr.values, empty_labels)  # picks corr value where mask is True, "" elsewhere

# Plot heatmap of the correlation matrix (fmt='s' because the annotations are strings)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    data=corr,
    annot=annot,
    cmap=cmap,
    fmt='s',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
)

# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Correlation Matrix", fontsize=20)

plt.show()


## Using `pandas-profiling`

## A tool to assist you through your exploratory data analysis

Optionally, you may use `pandas-profiling` as a first approach to your data analysis. Remember, although this tool provides excellent insights about the data you're working with, it is not enough to perform a proper analysis.

In [None]:
# Which correlation measures the report should compute: only Pearson is enabled
correlation_flags = {
    "pearson": True,
    "spearman": False,
    "kendall": False,
    "phi_k": False,
    "cramers": False,
}

profile = ProfileReport(
    df,
    title='Tugas Customer Data',
    correlations={
        measure: {"calculate": enabled}
        for measure, enabled in correlation_flags.items()
    },
)

In [None]:
# Display the profiling report inside the notebook
profile.to_notebook_iframe()