In [1]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import pearsonr

# Set visualization style
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.2)

# Create directory for visualizations if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

# Try to load the prepared dataset first
prepared_paths = [
    'data/coffee_sales_prepared.csv',
    './coffee_sales_prepared.csv',
    '../data/coffee_sales_prepared.csv'
]

file_path = None
# Try to find the prepared dataset
for path in prepared_paths:
    if os.path.exists(path):
        file_path = path
        print(f"Found prepared dataset at: {path}")
        break

# If prepared dataset not found, try to find and load the original dataset
if file_path is None:
    original_paths = [
        'DatasetForCoffeeSales2.csv',
        './DatasetForCoffeeSales2.csv',
        '../DatasetForCoffeeSales2.csv',
        'data/DatasetForCoffeeSales2.csv'
    ]

    for path in original_paths:
        if os.path.exists(path):
            file_path = path
            print(f"Found original dataset at: {path}. Will prepare it now.")
            break

    if file_path is None:
        raise FileNotFoundError("Could not find any dataset file. Please run 01_data_preparation.ipynb first or place the CSV file in the same directory.")

# Load the dataset
coffee_df = pd.read_csv(file_path)

# If we're using the original dataset, we need to prepare it
if 'prepared' not in file_path:
    print("Preparing the dataset...")
    # Convert date string to datetime
    coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])

    # Extract time-based features
    coffee_df['Year'] = coffee_df['Date'].dt.year
    coffee_df['Month'] = coffee_df['Date'].dt.month
    coffee_df['Day'] = coffee_df['Date'].dt.day
    coffee_df['Day_of_week'] = coffee_df['Date'].dt.dayofweek
    coffee_df['Quarter'] = coffee_df['Date'].dt.quarter

    # Create price categories
    price_map = {
        30: 'Budget',
        35: 'Standard',
        40: 'Premium',
        45: 'Luxury'
    }
    coffee_df['Price_Category'] = coffee_df['Unit Price'].map(price_map)

    # Create bean category
    coffee_df['Bean_Category'] = coffee_df['Unit Price'].apply(
        lambda x: 'Premium' if x >= 40 else 'Standard')

    # Rename Product column for clarity
    coffee_df['Coffee_Bean_Type'] = coffee_df['Product']
else:
    # Convert back to datetime if loading the prepared dataset
    coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])

print("Dataset loaded successfully with", coffee_df.shape[0], "records")

# Unit Price distribution
plt.figure(figsize=(10, 6))
sns.histplot(coffee_df['Unit Price'], kde=True, bins=4, color='#1f77b4')
plt.title('Distribution of Coffee Bean Prices', fontsize=15)
plt.xlabel('Unit Price', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/unit_price_distribution.png', dpi=300)
plt.close()

# Price statistics
price_stats = coffee_df['Unit Price'].describe()
print("Unit Price Statistics:")
print(price_stats)

# Quantity distribution
plt.figure(figsize=(10, 6))
sns.histplot(coffee_df['Quantity'], kde=True, bins=15, color='#ff7f0e')
plt.title('Distribution of Purchase Quantities', fontsize=15)
plt.xlabel('Quantity', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True, alpha=0.3)
plt.savefig('visualizations/quantity_distribution.png', dpi=300)
plt.close()

# Quantity statistics
qty_stats = coffee_df['Quantity'].describe()
print("Quantity Statistics:")
print(qty_stats)

# Sales Amount distribution
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(coffee_df['Sales Amount'], kde=True, bins=15, color='#2ca02c')
plt.title('Distribution of Sales Amounts', fontsize=14)
plt.xlabel('Sales Amount', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
sns.histplot(coffee_df['Final Sales'], kde=True, bins=15, color='#d62728')
plt.title('Distribution of Final Sales', fontsize=14)
plt.xlabel('Final Sales', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/sales_distributions.png', dpi=300)
plt.close()

# Statistics
sales_stats = pd.DataFrame({
    'Sales Amount': coffee_df['Sales Amount'].describe(),
    'Final Sales': coffee_df['Final Sales'].describe()
})
print("Sales Statistics:")
print(sales_stats)

# Bean type distribution
plt.figure(figsize=(12, 6))
bean_counts = coffee_df['Coffee_Bean_Type'].value_counts()
sns.barplot(x=bean_counts.index, y=bean_counts.values, palette='viridis')
plt.title('Distribution of Coffee Bean Types', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('Number of Purchases', fontsize=14)
plt.xticks(rotation=45)
plt.grid(True, axis='y', alpha=0.3)

# Add count labels on bars
for i, count in enumerate(bean_counts.values):
    plt.text(i, count + 5, str(count), ha='center')

plt.tight_layout()
plt.savefig('visualizations/bean_type_distribution.png', dpi=300)
plt.close()

print("Coffee Bean Type Distribution:")
print(bean_counts)
print("\nPercentage Distribution:")
print((bean_counts / len(coffee_df) * 100).round(1))

# City distribution
plt.figure(figsize=(12, 7))
city_counts = coffee_df['City'].value_counts()
sns.barplot(x=city_counts.values, y=city_counts.index, palette='mako')
plt.title('Distribution of Sales by City', fontsize=16)
plt.xlabel('Number of Purchases', fontsize=14)
plt.ylabel('City', fontsize=14)
plt.grid(True, axis='x', alpha=0.3)

# Add count labels on bars
for i, count in enumerate(city_counts.values):
    plt.text(count + 5, i, str(count), va='center')

plt.tight_layout()
plt.savefig('visualizations/city_distribution.png', dpi=300)
plt.close()

print("City Distribution:")
print(city_counts)
print("\nPercentage Distribution:")
print((city_counts / len(coffee_df) * 100).round(1))

# Discount usage distribution
plt.figure(figsize=(8, 6))
discount_counts = coffee_df['Used_Discount'].value_counts()
labels = ['No Discount', 'With Discount']
plt.pie(discount_counts, labels=labels, autopct='%1.1f%%',
        colors=['#ff9999','#66b3ff'], explode=(0, 0.1),
        startangle=90, shadow=True)
plt.title('Distribution of Discount Usage', fontsize=16)
plt.axis('equal')
plt.savefig('visualizations/discount_usage_distribution.png', dpi=300)
plt.close()

print("Discount Usage Distribution:")
print(discount_counts)
print("\nPercentage Distribution:")
print((discount_counts / len(coffee_df) * 100).round(1))

# Monthly sales analysis
monthly_sales = coffee_df.groupby('Month')['Final Sales'].sum().reset_index()
plt.figure(figsize=(12, 6))
sns.barplot(x='Month', y='Final Sales', data=monthly_sales, palette='YlOrBr')
plt.title('Monthly Sales Distribution', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Total Sales', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/monthly_sales.png', dpi=300)
plt.close()

print("Monthly Sales Statistics:")
print(monthly_sales)

# Day of week analysis
day_mapping = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday',
               4:'Friday', 5:'Saturday', 6:'Sunday'}
coffee_df['Day_Name'] = coffee_df['Day_of_week'].map(day_mapping)

day_sales = coffee_df.groupby('Day_Name')['Final Sales'].mean().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

plt.figure(figsize=(12, 6))
sns.barplot(x=day_sales.index, y=day_sales.values, palette='coolwarm')
plt.title('Average Sales by Day of Week', fontsize=16)
plt.xlabel('Day of Week', fontsize=14)
plt.ylabel('Average Sales', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/day_of_week_sales.png', dpi=300)
plt.close()

print("Day of Week Sales Statistics:")
print(day_sales)

# Calculate correlation matrix
numerical_cols = ['Unit Price', 'Quantity', 'Sales Amount', 'Final Sales', 'Discount_Amount']
corr_matrix = coffee_df[numerical_cols].corr()

# Visualize correlation matrix
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap=cmap, mask=mask,
            linewidths=.5, cbar_kws={'shrink': .7})
plt.title('Correlation Matrix of Key Variables', fontsize=16)
plt.tight_layout()
plt.savefig('visualizations/correlation_matrix.png', dpi=300)
plt.close()

print("Correlation Matrix:")
print(corr_matrix.round(2))

# Price vs Quantity relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Unit Price', y='Quantity', hue='Coffee_Bean_Type',
                alpha=0.7, data=coffee_df)
plt.title('Price vs. Quantity Relationship by Coffee Bean Type', fontsize=16)
plt.xlabel('Unit Price', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.legend(title='Coffee Bean Type', title_fontsize=12, fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/price_quantity_relationship.png', dpi=300)
plt.close()

# Calculate correlation
price_qty_corr = coffee_df['Unit Price'].corr(coffee_df['Quantity'])
price_qty_spearman = coffee_df['Unit Price'].corr(coffee_df['Quantity'], method='spearman')
print(f"Price-Quantity Pearson Correlation: {price_qty_corr:.3f}")
print(f"Price-Quantity Spearman Correlation: {price_qty_spearman:.3f}")

# Quantity distribution by price category
plt.figure(figsize=(12, 7))
order = ['Budget', 'Standard', 'Premium', 'Luxury']
sns.boxplot(x='Price_Category', y='Quantity', data=coffee_df,
            order=order, palette='viridis')
plt.title('Quantity Distribution Across Price Categories', fontsize=16)
plt.xlabel('Price Category', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/price_category_quantity.png', dpi=300)
plt.close()

# Calculate average quantities by price category
price_qty_avg = coffee_df.groupby('Price_Category')['Quantity'].agg(['mean', 'median', 'std'])
price_qty_avg = price_qty_avg.reindex(order)
print("Quantity Statistics by Price Category:")
print(price_qty_avg.round(2))

# Bean Type vs Price
plt.figure(figsize=(12, 7))
sns.boxplot(x='Coffee_Bean_Type', y='Unit Price', data=coffee_df, palette='mako')
plt.title('Unit Price by Coffee Bean Type', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('Unit Price', fontsize=14)
plt.xticks(rotation=45)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/bean_type_price.png', dpi=300)
plt.close()

# Calculate price statistics by bean type
bean_price = coffee_df.groupby('Coffee_Bean_Type')['Unit Price'].agg(['mean', 'median', 'std'])
print("Price Statistics by Bean Type:")
print(bean_price.round(2))

# Discount effect on quantity
plt.figure(figsize=(10, 6))
sns.boxplot(x='Used_Discount', y='Quantity', data=coffee_df, palette='Set2')
plt.title('Quantity Distribution: With vs Without Discount', fontsize=16)
plt.xlabel('Discount Used', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.xticks([0, 1], ['No Discount', 'With Discount'])
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/discount_quantity_effect.png', dpi=300)
plt.close()

# Calculate statistics
discount_qty = coffee_df.groupby('Used_Discount')['Quantity'].agg(['mean', 'median', 'std'])
print("Quantity Statistics by Discount Usage:")
print(discount_qty.round(2))

# Create a cross-tabulation of City vs Bean Type
city_bean = pd.crosstab(coffee_df['City'], coffee_df['Coffee_Bean_Type'])
city_bean_pct = city_bean.div(city_bean.sum(axis=1), axis=0) * 100

# Visualize regional preferences as a heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(city_bean_pct, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=.5)
plt.title('Regional Preferences for Coffee Bean Types (%)', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('City', fontsize=14)
plt.tight_layout()
plt.savefig('visualizations/regional_preferences_heatmap.png', dpi=300)
plt.close()

print("Regional Bean Type Preferences (%):")
print(city_bean_pct.round(1))

# Summarize key findings
print("Summary of Key Exploratory Findings:")
print("-----------------------------------")
print("1. Price Distribution:")
print(f"   - Unit prices range from ${coffee_df['Unit Price'].min()} to ${coffee_df['Unit Price'].max()}")
print(f"   - Most common price point: ${coffee_df['Unit Price'].mode()[0]}")
print(f"   - Mean price: ${coffee_df['Unit Price'].mean():.2f}")
print("\n2. Quantity Patterns:")
print(f"   - Average quantity per purchase: {coffee_df['Quantity'].mean():.2f}")
print(f"   - Median quantity: {coffee_df['Quantity'].median()}")
print(f"   - Correlation between price and quantity: {coffee_df['Unit Price'].corr(coffee_df['Quantity']):.3f}")
print("\n3. Bean Type Preferences:")
most_popular = coffee_df['Coffee_Bean_Type'].value_counts().index[0]
print(f"   - Most popular bean type: {most_popular}")
highest_price = coffee_df.groupby('Coffee_Bean_Type')['Unit Price'].mean().idxmax()
print(f"   - Highest priced bean type: {highest_price}")
print("\n4. Regional Patterns:")
largest_city = coffee_df['City'].value_counts().index[0]
print(f"   - Largest market by purchase volume: {largest_city}")
highest_price_city = coffee_df.groupby('City')['Unit Price'].mean().idxmax()
print(f"   - City with highest average price point: {highest_price_city}")
print("\n5. Discount Impact:")
discount_impact = coffee_df.groupby('Used_Discount')['Quantity'].mean()
if True in discount_impact.index and False in discount_impact.index:
    print(f"   - Average quantity with discount: {discount_impact[True]:.2f}")
    print(f"   - Average quantity without discount: {discount_impact[False]:.2f}")
    print(f"   - Quantity increase with discount: {(discount_impact[True]/discount_impact[False]-1)*100:.1f}%")


Found original dataset at: DatasetForCoffeeSales2.csv. Will prepare it now.
Preparing the dataset...
Dataset loaded successfully with 730 records
Unit Price Statistics:
count    730.000000
mean      36.794521
std        4.955104
min       30.000000
25%       35.000000
50%       35.000000
75%       40.000000
max       45.000000
Name: Unit Price, dtype: float64
Quantity Statistics:
count    730.000000
mean      26.080822
std       14.480971
min        1.000000
25%       14.000000
50%       27.000000
75%       39.000000
max       49.000000
Name: Quantity, dtype: float64
Sales Statistics:
       Sales Amount  Final Sales
count    730.000000   730.000000
mean     959.924658   862.531507
std      551.282730   509.032315
min       30.000000    24.000000
25%      495.000000   448.000000
50%      960.000000   840.000000
75%     1400.000000  1260.000000
max     2205.000000  2205.000000



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=bean_counts.index, y=bean_counts.values, palette='viridis')


Coffee Bean Type Distribution:
Coffee_Bean_Type
Costa Rica    165
Colombian     152
Brazilian     146
Guatemala     139
Ethiopian     128
Name: count, dtype: int64

Percentage Distribution:
Coffee_Bean_Type
Costa Rica    22.6
Colombian     20.8
Brazilian     20.0
Guatemala     19.0
Ethiopian     17.5
Name: count, dtype: float64



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=city_counts.values, y=city_counts.index, palette='mako')


City Distribution:
City
Hail        87
Riyadh      79
Jeddah      77
Mecca       77
Khobar      73
Dammam      72
Medina      71
Buraidah    69
Abha        66
Tabuk       59
Name: count, dtype: int64

Percentage Distribution:
City
Hail        11.9
Riyadh      10.8
Jeddah      10.5
Mecca       10.5
Khobar      10.0
Dammam       9.9
Medina       9.7
Buraidah     9.5
Abha         9.0
Tabuk        8.1
Name: count, dtype: float64
Discount Usage Distribution:
Used_Discount
True     371
False    359
Name: count, dtype: int64

Percentage Distribution:
Used_Discount
True     50.8
False    49.2
Name: count, dtype: float64



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Month', y='Final Sales', data=monthly_sales, palette='YlOrBr')


Monthly Sales Statistics:
    Month  Final Sales
0       1        52074
1       2        51451
2       3        56519
3       4        47882
4       5        51575
5       6        57208
6       7        56222
7       8        54931
8       9        53420
9      10        47432
10     11        51569
11     12        49365



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=day_sales.index, y=day_sales.values, palette='coolwarm')


Day of Week Sales Statistics:
Day_Name
Monday       899.257143
Tuesday      849.076923
Wednesday    842.634615
Thursday     856.509615
Friday       904.250000
Saturday     853.028846
Sunday       832.895238
Name: Final Sales, dtype: float64
Correlation Matrix:
                 Unit Price  Quantity  Sales Amount  Final Sales  \
Unit Price             1.00      0.00          0.24         0.23   
Quantity               0.00      1.00          0.96         0.94   
Sales Amount           0.24      0.96          1.00         0.98   
Final Sales            0.23      0.94          0.98         1.00   
Discount_Amount        0.11      0.42          0.44         0.24   

                 Discount_Amount  
Unit Price                  0.11  
Quantity                    0.42  
Sales Amount                0.44  
Final Sales                 0.24  
Discount_Amount             1.00  
Price-Quantity Pearson Correlation: 0.004
Price-Quantity Spearman Correlation: 0.006



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Price_Category', y='Quantity', data=coffee_df,


Quantity Statistics by Price Category:
                 mean  median    std
Price_Category                      
Budget          25.97    28.0  15.32
Standard        25.91    26.5  14.28
Premium         26.79    27.0  14.09
Luxury          25.77    25.0  14.59



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Coffee_Bean_Type', y='Unit Price', data=coffee_df, palette='mako')


Price Statistics by Bean Type:
                  mean  median  std
Coffee_Bean_Type                   
Brazilian         30.0    30.0  0.0
Colombian         40.0    40.0  0.0
Costa Rica        35.0    35.0  0.0
Ethiopian         45.0    45.0  0.0
Guatemala         35.0    35.0  0.0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Used_Discount', y='Quantity', data=coffee_df, palette='Set2')


Quantity Statistics by Discount Usage:
                mean  median    std
Used_Discount                      
False          26.18    27.0  14.74
True           25.99    26.0  14.25
Regional Bean Type Preferences (%):
Coffee_Bean_Type  Brazilian  Colombian  Costa Rica  Ethiopian  Guatemala
City                                                                    
Abha                   19.7       15.2        30.3       19.7       15.2
Buraidah               30.4       15.9        17.4       23.2       13.0
Dammam                 18.1       27.8        22.2       13.9       18.1
Hail                   19.5       24.1        18.4       12.6       25.3
Jeddah                 16.9       13.0        23.4       20.8       26.0
Khobar                 16.4       24.7        19.2       17.8       21.9
Mecca                  22.1       19.5        18.2       19.5       20.8
Medina                 22.5       23.9        26.8        9.9       16.9
Riyadh                 15.2       17.7        26.6 