# Exploratory Data Analysis

In [13]:
# import the libraries
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

# use seaborn's white-grid look for every chart drawn from here on
sns.set_style(style='whitegrid')

## Load the clean dataset

In [14]:
# Read the processed dataset from CSV into the `df` DataFrame
df = pd.read_csv(filepath_or_buffer='black_friday_processed.csv')

## Data Perspective

### Demographic Analysis

In [15]:
# Analyze the distribution of transactions across the demographic attributes.
# One countplot per attribute is drawn on a shared 3x3 grid and the figure is
# written to disk (it is closed afterwards, so nothing is rendered inline).

# (column, explicit category order) -- None lets seaborn infer the order.
demographic_panels = [
    ("Gender", ["M", "F"]),
    # "51-55" was missing from the original order, which silently dropped that
    # age group from the chart.
    # NOTE(review): bins assumed to follow the public Black Friday dataset --
    # confirm against the values actually present in the processed CSV.
    ("Age", ["0-17", "18-25", "26-35", "36-45", "46-50", "51-55", "55+"]),
    ("Occupation", None),
    ("City_Category", ["A", "B", "C"]),
    ("Stay_In_Current_City_Years", ["0", "1", "2", "3", "4+"]),
    ("Marital_Status", None),
]

fig = plt.figure(figsize=(15, 15))
fig.subplots_adjust(top=0.85, wspace=0.3)

for position, (column, category_order) in enumerate(demographic_panels, start=1):
    # Pass the axes explicitly instead of relying on the implicit current axes.
    ax = fig.add_subplot(3, 3, position)
    sns.countplot(x=column, order=category_order, palette="Set2", data=df, ax=ax)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/Exploratory_Demographic_Analysis.png')

# Closing the figure releases it; a separate plt.clf() is not needed.
plt.close(fig)

## Business Perspective

### Business Analysis

In [16]:
# Analyse the mean Purchase per Occupation, split by the City_Category where
# the purchases occurred. The chart is saved to disk and then closed, so it is
# not rendered inline.
sns.catplot(x = "Occupation",
            y = "Purchase",
            hue = "City_Category",
            hue_order = ["A", "B", "C"],
            aspect = 3,
            kind = "bar",
            data = df)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('figures', exist_ok=True)

# catplot opens its own figure, which is the current one at this point.
plt.savefig('figures/Exploratory_Business_Analysis.png')

# Closing the figure releases it; a separate plt.clf() is not needed.
plt.close()

This figure shows the mean purchase amount for each customer occupation, split by city category. Let's first address the spike for Occupation 8 in City A: the earlier countplot showed that Occupation 8 accounts for only a tiny number of purchases, and the long vertical error bar (by default seaborn draws a 95% confidence interval around the mean, not the standard deviation) suggests that the spike is probably driven by a few large transactions. Apart from Occupation 8, however, the graph shows that the average purchase amount is highest in City C regardless of occupation. We can therefore make assumptions about the purchasing power of customers in City C.