# Exploratory Data Analysis

See `Notebooks/Data_Cleaning.ipynb` for the data-cleaning steps. The raw data imported from square.com was cleaned, and relevant features that affect sales, such as weather and social media, were added.

- [Creation of secondary dataframes](#Creation-of-secondary-dataframes)
- [Daily info](#Daily-info)
- [Sales of products](#Sales-of-products)

In [None]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [None]:
# Load the cleaned sales data and discard the 'Unnamed: 0' column
# (presumably the index written out by the cleaning notebook -- confirm)
sales_data = pd.read_csv('insert_path').drop(columns='Unnamed: 0')
# Convert the 'date' column to pandas datetimes
sales_data['date'] = pd.to_datetime(sales_data['date'])

In [None]:
# Names of every numeric column in the cleaned data
numeric_columns = sales_data.select_dtypes(include=[np.number]).columns
numeric_columns

In [None]:
# Numeric columns left out of the descriptive summary (calendar parts and id numbers, judging by name)
non_descriptive_cols = ['month', 'day', 'year', 'hour', 'customer_id_no', 'transaction_no']
descript_data = sales_data.select_dtypes(include=[np.number]).drop(columns=non_descriptive_cols)

In [None]:
# Descriptive statistics summary of the retained numeric columns
summary_stats = descript_data.describe()
summary_stats

# Creation of secondary dataframes

In [None]:
# Keep only beer transactions: rows whose beer_style is anything other than the string 'None'
is_beer_row = sales_data['beer_style'].ne('None')
sales_data_beer = sales_data.loc[is_beer_row]
sales_data_beer.head()

In [None]:
# Keep only transactions tied to a customer; customer_id_no == -1 marks cash
# transactions, whose customers cannot be tracked
has_customer_id = sales_data['customer_id_no'].ne(-1)
sales_data_trackable = sales_data.loc[has_customer_id]
sales_data_trackable.head()

In [None]:
# One row per (date, weekday): the maximum of the customer and style counts
# and the summed gross sales over that group's rows
daily_aggregations = {
    'daily_no_customers': 'max',
    'no_styles_sold': 'max',
    'gross_sales': 'sum',
}
daily_grouped = sales_data.groupby(['date', 'weekday']).agg(daily_aggregations).reset_index()

In [None]:
# Summary statistics of the three daily metrics
daily_metric_cols = ['gross_sales', 'daily_no_customers', 'no_styles_sold']
daily_grouped[daily_metric_cols].describe()

In [None]:
# Preview the first five rows of the daily aggregates
daily_grouped.head(5)

# Daily info
### Initialize plots

In [None]:
# Weekday names in display order (Sunday first), mapped to their 0-based position
weekday_names = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
day_dict = {name: position for position, name in enumerate(weekday_names)}
# Numeric weekday id so the daily rows can be ordered Sunday..Saturday
daily_grouped['day_id'] = daily_grouped['weekday'].map(day_dict)

# Shared look for the plots below: blue palette and a dotted dark grid
custom_palette = sns.color_palette("Blues")
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})

### Sales by day of the week

In [None]:
# Box plot of daily gross sales per weekday, with each box's median written on it.
weekday_order = list(day_dict)  # x-axis order: Sunday .. Saturday

plt.figure(figsize=(10, 6))
sns.boxplot(data=daily_grouped, x='weekday', y='gross_sales', order=weekday_order,
            palette=custom_palette, linewidth=0.5)
plt.title('Sales by day of week', fontsize=16)
plt.xlabel('Day of the Week', fontsize=12)
plt.ylabel('Sales (in dollars)', fontsize=12)

# Reindex the medians to the plotting order so that position i always refers to
# weekday_order[i]. Enumerating only the weekdays present in the data would shift
# every later label onto the wrong box whenever a weekday has no rows.
medians = daily_grouped.groupby('weekday')['gross_sales'].median().reindex(weekday_order)

for position, median in enumerate(medians):
    if pd.isna(median):
        continue  # no data for this weekday, so there is no box to label
    # Two decimals: dollar amounts, without floating-point noise in the label
    plt.text(position, median, f"{median:.2f}", horizontalalignment='center', color='black', weight='bold')
plt.tight_layout()
# plt.savefig('plot/sales_by_weekday.png',bbox_inches='tight')
# See project/folder/plots
plt.show()

### Customer count by day of week

In [None]:
# Box plot of the daily customer count per weekday, with each box's median written on it.
weekday_order = list(day_dict)  # x-axis order: Sunday .. Saturday

plt.figure(figsize=(10, 6))
sns.boxplot(data=daily_grouped, x='weekday', y='daily_no_customers', order=weekday_order,
            palette=custom_palette, linewidth=0.5)
plt.title('Number of customers by day of week', fontsize=16)
plt.xlabel('Day of the Week', fontsize=12)
plt.ylabel('Number of customers', fontsize=12)

# Reindex the medians to the plotting order so that position i always refers to
# weekday_order[i]. Enumerating only the weekdays present in the data would shift
# every later label onto the wrong box whenever a weekday has no rows.
medians = daily_grouped.groupby('weekday')['daily_no_customers'].median().reindex(weekday_order)

for position, median in enumerate(medians):
    if pd.isna(median):
        continue  # no data for this weekday, so there is no box to label
    plt.text(position, median, f"{median}", horizontalalignment='left', color='black', weight='bold')
plt.tight_layout()

plt.show()

### Distribution of daily gross_sales

In [None]:
# Largest gross_sales value among the daily aggregates
max_daily_sale = daily_grouped.loc[:, 'gross_sales'].max()
max_daily_sale

In [None]:
# Histogram (with KDE) of daily gross sales, coloured by weekday.
# sns.displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure behind.
# The 10x5 size is therefore requested through height/aspect (width = height * aspect).
sns.displot(data=daily_grouped, x='gross_sales', hue='weekday', kde=True, height=5, aspect=2)
plt.title('Distribution of Daily Gross Sales', fontsize=16)
plt.xlabel('Range of Daily Gross Sales', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Tick every 500 from 0 up to (not including) the largest daily total
plt.xticks(np.arange(0, max_daily_sale, 500))
plt.show()

# Sales of products

### Sales by product type daily

In [None]:
# Rows from 2023 only
sales_2023 = sales_data[sales_data['year'] == 2023]

# Total gross sales for each (date, weekday, product_type)
daily_grouped_type = (
    sales_2023.groupby(['date', 'weekday', 'product_type'])['gross_sales']
    .sum()
    .reset_index()
)

# Average of those daily totals for each (weekday, product_type)
daily_grouped_type_df = (
    daily_grouped_type.groupby(['weekday', 'product_type'])['gross_sales']
    .mean()
    .reset_index()
)
daily_grouped_type_df['day_id'] = daily_grouped_type_df['weekday'].map(day_dict)

daily_grouped_type_df.head()

In [None]:
# One bar chart per weekday showing the mean gross_sales of each product type.
# NOTE(review): this is the mean over individual sales rows across all years, and it
# rebinds `daily_grouped_type`; `daily_grouped_type_df` (mean of 2023 daily totals)
# may be what was intended here -- confirm.
daily_grouped_type = sales_data.groupby(['product_type', 'weekday'])['gross_sales'].mean().reset_index()
unique_products = daily_grouped_type['product_type'].unique()

# Monday-first ordering; not used by the plots below, which follow day_dict's order
days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

weekday_order = list(day_dict)
num_cols = 2
# Ceiling division so every weekday gets an axes. round() rounds halves to the
# nearest even number and can allocate too few rows (e.g. 5 / 2 -> 2).
num_rows = -(-len(weekday_order) // num_cols)

# squeeze=False always returns a 2-D array, so flatten() is valid for any grid size
fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 4 * num_rows), squeeze=False)
axs = axs.flatten()

for ax, day in zip(axs, weekday_order):
    product_data = daily_grouped_type[daily_grouped_type['weekday'] == day]

    # Start every product at 0 so all charts show the same set of bars,
    # then fill in the values that exist for this weekday
    sales_per_day = {prod: 0 for prod in unique_products}
    sales_per_day.update(zip(product_data['product_type'], product_data['gross_sales']))

    ax.bar(list(sales_per_day.keys()), list(sales_per_day.values()))
    ax.set_title(f"Sales for {day}")
    ax.set_xlabel("Product")
    ax.set_ylabel("Gross Sales")

# Hide grid cells that are left over when the weekdays do not fill the grid
for unused_ax in axs[len(weekday_order):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


### Sales by product type monthly

In [None]:
# Total gross sales for each (year, month, product_type)
monthly_grouped_type = (
    sales_data.groupby(['year', 'month', 'product_type'])['gross_sales']
    .sum()
    .reset_index()
)

# Average of those monthly totals across years, for each (month, product_type)
monthly_grouped_type_df = (
    monthly_grouped_type.groupby(['month', 'product_type'])['gross_sales']
    .mean()
    .reset_index()
)
monthly_grouped_type_df

In [None]:
# Line plot of the average monthly gross sales for four product types.
# NOTE: despite its name, monthly_sales_2019 holds the across-year monthly averages
# from monthly_grouped_type_df, not 2019 data only.
plotted_product_types = ['Can', 'Draft', 'Growler', 'Keg']
is_plotted_type = monthly_grouped_type_df['product_type'].isin(plotted_product_types)
monthly_sales_2019 = pd.DataFrame(monthly_grouped_type_df[is_plotted_type])

month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

fig = plt.figure(figsize=(20, 6))
sns.lineplot(data=monthly_sales_2019, x='month', y='gross_sales', hue='product_type',
             errorbar=None, palette='colorblind')
# Label month numbers 1..12 with their names
plt.xticks(list(range(1, 13)), month_names, rotation=20, fontsize=12)
plt.title('Average Monthly Gross Sales for Years 2019 - 2023', fontsize=20)
plt.xlabel('Month', fontsize=16)
plt.ylabel('Gross Sales', fontsize=16)
plt.show()

### Sales by product type yearly

In [None]:
# Yearly gross sales per product type for beer transactions, leaving out the 'Other' type
product_sales = pd.DataFrame(
    sales_data_beer.groupby(['year', 'product_type']).agg({'gross_sales': 'sum'})
).reset_index()
product_sales = product_sales.loc[product_sales['product_type'].ne('Other')]
grouped_sales = product_sales.groupby('year')

# One donut chart per year showing each product type's share of gross sales
for year, group in grouped_sales:
    sales_by_product_type = group.groupby('product_type')['gross_sales'].sum()
    sns.set(font_scale=1.2)
    sns.set_palette('colorblind')

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title(f'Gross Sales Distribution for Year {year}', fontsize=20)
    # labeldistance=None keeps the names off the wedges; they appear in the legend instead
    ax.pie(sales_by_product_type, labels=sales_by_product_type.index, autopct='%1.1f%%', startangle=25,
           pctdistance=1.125, labeldistance=None, textprops={'fontsize': 14})
    # White disc over the centre turns the pie into a donut
    centre_circle = plt.Circle((0, 0), 0.6, fc='white')
    ax.add_artist(centre_circle)
    ax.legend(loc='upper left')
    plt.show()

### Yearly sales by beer style

In [None]:
# Beer rows excluding the Seltzer, Flight, Mixed and Growler styles; the same
# subset feeds both tables below
excluded_styles = ['Seltzer', 'Flight', 'Mixed', 'Growler']
individual_style_rows = sales_data_beer[~sales_data_beer['beer_style'].isin(excluded_styles)]

# Total gross sales per (year, beer_style)
style_sales = individual_style_rows.groupby(['year', 'beer_style']).agg({'gross_sales': 'sum'})
style_sales = style_sales.reset_index()

# Mean global_rating and maximum rating_count per beer_style, lowest rating first
style_ratings = individual_style_rows.groupby(['beer_style']).agg(
    {'global_rating': 'mean', 'rating_count': 'max'}
)
style_ratings = style_ratings.reset_index().sort_values('global_rating', ascending=True)

In [None]:
# One donut chart per year showing each beer style's share of gross sales
grouped_styles = style_sales.groupby('year')
for year, group in grouped_styles:
    sales_by_style = group.groupby('beer_style')['gross_sales'].sum()
    sns.set(font_scale=1.1)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title(f'Gross Sales Distribution for Year per Beer Style {year}', fontsize=20)
    ax.pie(sales_by_style, labels=sales_by_style.index, autopct='%1.1f%%', startangle=55,
           pctdistance=0.85, textprops={'fontsize': 10})
    # White disc over the centre turns the pie into a donut
    centre_circle = plt.Circle((0, 0), 0.6, fc='white')
    ax.add_artist(centre_circle)
    plt.xticks(rotation=45)
    plt.show()

### Yearly sales by primary style

In [None]:
# Total gross sales per (year, primary_beer_type), dropping rows whose type is the string 'None'
primary_sales = (
    sales_data_beer.groupby(['year', 'primary_beer_type'])['gross_sales']
    .sum()
    .reset_index()
)
primary_sales = primary_sales.loc[primary_sales['primary_beer_type'].ne('None')]

In [None]:
# One donut chart per year showing each primary beer type's share of gross sales
grouped_styles = primary_sales.groupby('year')
for year, group in grouped_styles:
    sales_by_primary_style = group.groupby('primary_beer_type')['gross_sales'].sum()
    sns.set(font_scale = 1.1)
    sns.set_palette('colorblind')
    plt.figure(figsize=(8, 6))
    # Title names the grouping actually plotted (primary_beer_type, not beer_style)
    plt.title(f'Gross Sales Distribution per Primary Beer Type for Year {year}', fontsize=20)
    plt.pie(sales_by_primary_style, labels=sales_by_primary_style.index, autopct='%1.1f%%', startangle=70, pctdistance=0.85)
    # White disc over the centre turns the pie into a donut
    centre_circle = plt.Circle((0,0),0.6,fc='white')
    fig = plt.gcf()
    fig.gca().add_artist(centre_circle)
    plt.show()

### Number of available styles over time

In [None]:
# Monthly average of no_styles_sold for years after 2019:
# first the mean per date, then the mean of those values within each month
sales_after_2019 = sales_data[sales_data['year'] > 2019]
styles_per_date = sales_after_2019.groupby('date')['no_styles_sold'].mean()
no_styles_sold = styles_per_date.resample('M').mean().reset_index()

fig, ax = plt.subplots(1, 1, figsize=(15, 6))

sns.lineplot(data=no_styles_sold, x='date', y='no_styles_sold', ax=ax)
plt.xticks(rotation=45, fontsize=12)

# Labelled major tick at the start of each quarter, unlabelled minor tick every month
ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 4, 7, 10)))
date_fmt = mdates.DateFormatter('%Y-%m')
ax.xaxis.set_major_formatter(date_fmt)
ax.xaxis.set_minor_locator(mdates.MonthLocator())

ax.set_title('Number of available beer styles over time from 2020 to Present (monthly average)', fontsize=20)
ax.set_ylabel('Number of available beer styles', fontsize=16)
ax.set_xlabel('Date', fontsize=16)
plt.show()

### Individual beer style sales

In [None]:
# Beer rows after 2019, excluding the Flight, Growler, Seltzer and Mixed styles
excluded_styles = ['Flight', 'Growler', 'Seltzer', 'Mixed']
after_2019 = sales_data_beer['year'] > 2019
is_individual_style = ~sales_data_beer['beer_style'].isin(excluded_styles)
period_sales_primary = sales_data_beer[after_2019 & is_individual_style]

# Gross sales per (date, beer_style)
period_sales_primary = (
    period_sales_primary.groupby(['date', 'beer_style'])['gross_sales'].sum().reset_index()
)
# Each style's share of that date's total across the retained styles
period_sales_primary['total_sales'] = period_sales_primary.groupby('date')['gross_sales'].transform('sum')
period_sales_primary['portion'] = period_sales_primary['gross_sales'].div(period_sales_primary['total_sales'])
period_sales_primary

In [None]:
import datetime as dt
import matplotlib.dates as mdates

# One line chart per beer style: monthly mean of the style's daily share of sales
styles_list = list(period_sales_primary['beer_style'].unique())
for style in styles_list:
    style_rows = period_sales_primary[period_sales_primary['beer_style'] == style]
    # Build the date-indexed series without mutating the filtered slice in place,
    # then average the daily shares within each month
    style_gross = style_rows.set_index('date')['portion'].resample('M').mean().reset_index()

    fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    # Pass the frame as data= (a positional frame is only treated as data by
    # seaborn >= 0.12) and draw explicitly on this figure's axes
    sns.lineplot(data=style_gross, x='date', y='portion', ax=ax)
    plt.title(str(style) + ' sales over time (monthly average)', fontsize=20)
    # Fixed window so every style's chart shares the same x-range
    plt.xlim([dt.date(2020, 1, 1), dt.date(2023, 4, 30)])
    plt.xticks(rotation=45, fontsize=12)
    plt.xlabel('Date', fontsize=16)
    # Labelled major tick at the start of each quarter, unlabelled minor tick every month
    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=(1, 4, 7, 10)))
    date_fmt = mdates.DateFormatter('%Y-%m')
    ax.xaxis.set_major_formatter(date_fmt)
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.grid(True)
    plt.ylabel('Share of overall sales', fontsize=16)
    plt.show()