In [1]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, pearsonr, chi2_contingency
import statsmodels.api as sm
from statsmodels.formula.api import ols
import os

# Set visualization style
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.2)

# Create directory for visualizations if it doesn't exist
os.makedirs('visualizations', exist_ok=True)

# Try to load the prepared dataset first
prepared_paths = [
    'data/coffee_sales_prepared.csv',
    './coffee_sales_prepared.csv',
    '../data/coffee_sales_prepared.csv'
]

file_path = None
# Try to find the prepared dataset
for path in prepared_paths:
    if os.path.exists(path):
        file_path = path
        print(f"Found prepared dataset at: {path}")
        break

# If prepared dataset not found, try to find and load the original dataset
if file_path is None:
    original_paths = [
        'DatasetForCoffeeSales2.csv',
        './DatasetForCoffeeSales2.csv',
        '../DatasetForCoffeeSales2.csv',
        'data/DatasetForCoffeeSales2.csv'
    ]

    for path in original_paths:
        if os.path.exists(path):
            file_path = path
            print(f"Found original dataset at: {path}. Will prepare it now.")
            break

    if file_path is None:
        raise FileNotFoundError("Could not find any dataset file. Please run 01_data_preparation.ipynb first or place the CSV file in the same directory.")

# Load the dataset
coffee_df = pd.read_csv(file_path)

# If we're using the original dataset, we need to prepare it
if 'prepared' not in file_path:
    print("Preparing the dataset...")
    # Convert date string to datetime
    coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])

    # Extract time-based features
    coffee_df['Year'] = coffee_df['Date'].dt.year
    coffee_df['Month'] = coffee_df['Date'].dt.month
    coffee_df['Day'] = coffee_df['Date'].dt.day
    coffee_df['Day_of_week'] = coffee_df['Date'].dt.dayofweek
    coffee_df['Quarter'] = coffee_df['Date'].dt.quarter

    # Create price categories
    price_map = {
        30: 'Budget',
        35: 'Standard',
        40: 'Premium',
        45: 'Luxury'
    }
    coffee_df['Price_Category'] = coffee_df['Unit Price'].map(price_map)

    # Create bean category
    coffee_df['Bean_Category'] = coffee_df['Unit Price'].apply(
        lambda x: 'Premium' if x >= 40 else 'Standard')

    # Rename Product column for clarity
    coffee_df['Coffee_Bean_Type'] = coffee_df['Product']
else:
    # Convert back to datetime if loading the prepared dataset
    coffee_df['Date'] = pd.to_datetime(coffee_df['Date'])

print("Dataset loaded successfully with", coffee_df.shape[0], "records")

# =====================================================================
# Hypothesis 1: Consumers are willing to pay more for premium coffee beans
# =====================================================================

# Define premium beans based on their type (Ethiopian and Colombian are the highest priced)
# According to our analysis, price distribution shows:
# Ethiopian: $45
# Colombian: $40
# Costa Rica, Guatemala: $35
# Brazilian: $30
premium_beans = ['Ethiopian', 'Colombian']
standard_beans = ['Costa Rica', 'Guatemala', 'Brazilian']

# Extract quantities for each group
premium_qty = coffee_df[coffee_df['Coffee_Bean_Type'].isin(premium_beans)]['Quantity']
standard_qty = coffee_df[coffee_df['Coffee_Bean_Type'].isin(standard_beans)]['Quantity']

# Compare basic statistics
premium_stats = premium_qty.describe()
standard_stats = standard_qty.describe()

comparison = pd.DataFrame({
    'Premium Beans': premium_stats,
    'Standard Beans': standard_stats
})

print("Hypothesis 1: Consumers are willing to pay more for premium coffee beans")
print("\nComparison of Purchase Quantities:")
print(comparison.round(2))

# T-test for quantity difference
t_stat, p_value = ttest_ind(premium_qty, standard_qty, equal_var=False)
print(f"\nT-test results for quantity purchased (Premium vs Standard beans):")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant difference at 0.05 level: {p_value < 0.05}")

# Visualize the comparison with a boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Bean_Category', y='Quantity', data=coffee_df, palette='Set2')
plt.title('Quantity Comparison: Premium vs Standard Coffee Beans', fontsize=16)
plt.xlabel('Bean Category', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/premium_standard_quantity.png', dpi=300)
plt.close()

# Compare with a violin plot to see distribution shapes
plt.figure(figsize=(10, 6))
sns.violinplot(x='Bean_Category', y='Quantity', data=coffee_df, palette='Set2')
plt.title('Quantity Distribution: Premium vs Standard Coffee Beans', fontsize=16)
plt.xlabel('Bean Category', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/premium_standard_violin.png', dpi=300)
plt.close()

# Compare average sales
premium_sales = coffee_df[coffee_df['Coffee_Bean_Type'].isin(premium_beans)]['Final Sales'].mean()
standard_sales = coffee_df[coffee_df['Coffee_Bean_Type'].isin(standard_beans)]['Final Sales'].mean()
print(f"\nAverage sales for Premium beans: ${premium_sales:.2f}")
print(f"Average sales for Standard beans: ${standard_sales:.2f}")
print(f"Sales difference: ${premium_sales - standard_sales:.2f}")

# Compare purchase patterns by bean type
plt.figure(figsize=(12, 6))
sns.barplot(x='Coffee_Bean_Type', y='Quantity', data=coffee_df,
            order=['Ethiopian', 'Colombian', 'Costa Rica', 'Guatemala', 'Brazilian'],
            palette='viridis')
plt.title('Average Quantity by Coffee Bean Type', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('Average Quantity', fontsize=14)
plt.axhline(y=coffee_df['Quantity'].mean(), color='red', linestyle='--', label='Overall Average')
plt.legend()
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/bean_type_quantity.png', dpi=300)
plt.close()

# =====================================================================
# Hypothesis 2: Price influences purchase quantity
# =====================================================================

print("\nHypothesis 2: Price influences purchase quantity")

# Correlation test between price and quantity
corr_coef, p_value = pearsonr(coffee_df['Unit Price'], coffee_df['Quantity'])
print(f"\nCorrelation between Unit Price and Quantity:")
print(f"Correlation coefficient: {corr_coef:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant correlation at 0.05 level: {p_value < 0.05}")

# Simple linear regression
model = ols('Quantity ~ Q("Unit Price")', data=coffee_df).fit()
print("\nRegression Results:")
print(model.summary().tables[1])  # Print only the parameter table

# Plot regression line
plt.figure(figsize=(10, 6))
sns.regplot(x='Unit Price', y='Quantity', data=coffee_df,
            scatter_kws={'alpha':0.5}, line_kws={'color': 'red'})
plt.title('Linear Regression: Price vs Quantity', fontsize=16)
plt.xlabel('Unit Price', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/price_quantity_regression.png', dpi=300)
plt.close()

# Compare average quantities by price category
plt.figure(figsize=(10, 6))
sns.barplot(x='Price_Category', y='Quantity', data=coffee_df,
            order=['Budget', 'Standard', 'Premium', 'Luxury'],
            palette='viridis')
plt.title('Average Quantity by Price Category', fontsize=16)
plt.xlabel('Price Category', fontsize=14)
plt.ylabel('Average Quantity', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/quantity_by_price_category.png', dpi=300)
plt.close()

# Calculate price elasticity (% change in quantity / % change in price)
# Using the average quantities at each price point
price_qty = coffee_df.groupby('Unit Price')['Quantity'].mean()
prices = sorted(coffee_df['Unit Price'].unique())
print("\nPrice Elasticity:")
for i in range(len(prices)-1):
    price1, price2 = prices[i], prices[i+1]
    qty1, qty2 = price_qty[price1], price_qty[price2]

    pct_change_price = (price2 - price1) / price1
    pct_change_qty = (qty2 - qty1) / qty1
    elasticity = pct_change_qty / pct_change_price

    print(f"Between ${price1} and ${price2}: {elasticity:.4f}")

# =====================================================================
# Hypothesis 3: Regional preferences influence coffee purchasing patterns
# =====================================================================

print("\nHypothesis 3: Regional preferences influence coffee purchasing patterns")

# City preferences for coffee bean types
city_bean = pd.crosstab(coffee_df['City'], coffee_df['Coffee_Bean_Type'])
city_bean_pct = city_bean.div(city_bean.sum(axis=1), axis=0) * 100
print("\nRegional Bean Type Preferences (%):")
print(city_bean_pct.round(1))

# Chi-square test for independence between City and Bean Type
chi2, p, dof, expected = chi2_contingency(city_bean)
print(f"\nChi-square test (City vs Bean Type):")
print(f"Chi-square value: {chi2:.2f}")
print(f"p-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")
print(f"Significant relationship at 0.05 level: {p < 0.05}")

# Visualize regional preferences
plt.figure(figsize=(14, 10))
# Convert to long format for easier plotting
city_bean_long = city_bean_pct.reset_index().melt(
    id_vars='City',
    value_vars=city_bean_pct.columns,
    var_name='Coffee_Bean_Type',
    value_name='Percentage'
)
# Plot the heatmap
heatmap_data = city_bean_long.pivot(index='City', columns='Coffee_Bean_Type', values='Percentage')
sns.heatmap(heatmap_data, annot=True, fmt='.1f', cmap='YlGnBu', linewidths=.5)
plt.title('Regional Preferences for Coffee Bean Types (%)', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('City', fontsize=14)
plt.tight_layout()
plt.savefig('visualizations/regional_preferences_heatmap.png', dpi=300)
plt.close()

# Average price paid by city
city_price = coffee_df.groupby('City')['Unit Price'].mean().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x=city_price.values, y=city_price.index, palette='mako')
plt.title('Average Unit Price by City', fontsize=16)
plt.xlabel('Average Unit Price', fontsize=14)
plt.ylabel('City', fontsize=14)
plt.grid(True, axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/city_price_comparison.png', dpi=300)
plt.close()

# ANOVA test for price differences between cities
model = ols('Q("Unit Price") ~ City', data=coffee_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("\nANOVA Results for Price by City:")
print(anova_table)

# =====================================================================
# Hypothesis 4: Discount usage affects purchasing behavior
# =====================================================================

print("\nHypothesis 4: Discount usage affects purchasing behavior")

# Compare quantities for purchases with and without discounts
discount_qty = coffee_df[coffee_df['Used_Discount'] == True]['Quantity']
no_discount_qty = coffee_df[coffee_df['Used_Discount'] == False]['Quantity']

# T-test for quantity difference
t_stat, p_value = ttest_ind(discount_qty, no_discount_qty, equal_var=False)
print(f"\nT-test results for Quantity (Discount vs No Discount):")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant difference at 0.05 level: {p_value < 0.05}")

# Compare average quantities
with_discount_avg = discount_qty.mean()
without_discount_avg = no_discount_qty.mean()
print(f"\nAverage quantity with discount: {with_discount_avg:.2f}")
print(f"Average quantity without discount: {without_discount_avg:.2f}")
print(f"Difference: {with_discount_avg - without_discount_avg:.2f}")

# Plot the comparison
plt.figure(figsize=(10, 6))
sns.boxplot(x='Used_Discount', y='Quantity', data=coffee_df, palette='Set2')
plt.title('Quantity Comparison: With vs Without Discount', fontsize=16)
plt.xlabel('Discount Used', fontsize=14)
plt.ylabel('Quantity', fontsize=14)
plt.xticks([0, 1], ['No Discount', 'With Discount'])
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/discount_quantity_comparison.png', dpi=300)
plt.close()

# Analyze discount usage by coffee bean type
discount_by_bean = pd.crosstab(coffee_df['Coffee_Bean_Type'], coffee_df['Used_Discount'])
discount_by_bean_pct = discount_by_bean.div(discount_by_bean.sum(axis=1), axis=0) * 100
print("\nDiscount Usage by Coffee Bean Type (%):")
print(discount_by_bean_pct.round(1))

# Plot discount usage by bean type
plt.figure(figsize=(12, 7))
discount_usage = discount_by_bean_pct[True].sort_values(ascending=False)
sns.barplot(x=discount_usage.index, y=discount_usage.values, palette='viridis')
plt.title('Discount Usage Percentage by Coffee Bean Type', fontsize=16)
plt.xlabel('Coffee Bean Type', fontsize=14)
plt.ylabel('Discount Usage (%)', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/discount_usage_by_bean.png', dpi=300)
plt.close()

# Analyze discount usage by price category
discount_by_price = pd.crosstab(coffee_df['Price_Category'], coffee_df['Used_Discount'])
discount_by_price_pct = discount_by_price.div(discount_by_price.sum(axis=1), axis=0) * 100
print("\nDiscount Usage by Price Category (%):")
print(discount_by_price_pct.round(1))

# Plot discount usage by price category
plt.figure(figsize=(10, 6))
discount_usage_price = discount_by_price_pct[True]
sns.barplot(x=discount_usage_price.index, y=discount_usage_price.values,
            order=['Budget', 'Standard', 'Premium', 'Luxury'],
            palette='YlOrBr')
plt.title('Discount Usage Percentage by Price Category', fontsize=16)
plt.xlabel('Price Category', fontsize=14)
plt.ylabel('Discount Usage (%)', fontsize=14)
plt.grid(True, axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/discount_usage_by_price.png', dpi=300)
plt.close()

# =====================================================================
# Summary of Hypothesis Testing Results
# =====================================================================

print("\nSummary of Hypothesis Testing Results")
print("=====================================")

print("\n1. Consumers are willing to pay more for premium coffee beans:")
print(f"   - T-test p-value: {ttest_ind(premium_qty, standard_qty, equal_var=False)[1]:.4f}")
if ttest_ind(premium_qty, standard_qty, equal_var=False)[1] < 0.05:
    direction = "higher" if premium_qty.mean() > standard_qty.mean() else "lower"
    print(f"   - Result: Significant difference found. Premium bean quantities are {direction}.")
    print(f"   - Premium bean avg quantity: {premium_qty.mean():.2f}, Standard bean avg: {standard_qty.mean():.2f}")
else:
    print("   - Result: No significant difference in quantities between premium and standard beans.")

print("\n2. Price influences purchase quantity:")
print(f"   - Correlation coefficient: {corr_coef:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    direction = "negative" if corr_coef < 0 else "positive"
    print(f"   - Result: Significant {direction} correlation between price and quantity.")
    print(f"   - As price increases, quantity tends to {('decrease' if corr_coef < 0 else 'increase')}.")
else:
    print("   - Result: No significant correlation between price and quantity.")

print("\n3. Regional preferences influence coffee purchasing patterns:")
print(f"   - Chi-square test p-value: {p:.4f}")
if p < 0.05:
    print("   - Result: Significant relationship between city and bean type preferences.")
    print("   - Different cities show distinct preferences for particular coffee beans.")
else:
    print("   - Result: No significant relationship between city and bean type preferences.")

print("\n4. Discount usage affects purchasing behavior:")
print(f"   - T-test p-value: {ttest_ind(discount_qty, no_discount_qty, equal_var=False)[1]:.4f}")
if ttest_ind(discount_qty, no_discount_qty, equal_var=False)[1] < 0.05:
    direction = "higher" if discount_qty.mean() > no_discount_qty.mean() else "lower"
    print(f"   - Result: Significant difference found. Quantities with discount are {direction}.")
    print(f"   - With discount avg: {discount_qty.mean():.2f}, Without discount avg: {no_discount_qty.mean():.2f}")
    print(f"   - Percent increase with discount: {((discount_qty.mean() / no_discount_qty.mean()) - 1) * 100:.1f}%")
else:
    print("   - Result: No significant difference in quantities between discounted and non-discounted purchases.")

Found original dataset at: DatasetForCoffeeSales2.csv. Will prepare it now.
Preparing the dataset...
Dataset loaded successfully with 730 records
Hypothesis 1: Consumers are willing to pay more for premium coffee beans

Comparison of Purchase Quantities:
       Premium Beans  Standard Beans
count         280.00          450.00
mean           26.32           25.93
std            14.30           14.61
min             1.00            1.00
25%            15.00           14.00
50%            26.00           27.00
75%            40.00           39.00
max            49.00           49.00

T-test results for quantity purchased (Premium vs Standard beans):
t-statistic: 0.3557
p-value: 0.7222
Significant difference at 0.05 level: False



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Bean_Category', y='Quantity', data=coffee_df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x='Bean_Category', y='Quantity', data=coffee_df, palette='Set2')



Average sales for Premium beans: $996.18
Average sales for Standard beans: $779.37
Sales difference: $216.81



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Coffee_Bean_Type', y='Quantity', data=coffee_df,



Hypothesis 2: Price influences purchase quantity

Correlation between Unit Price and Quantity:
Correlation coefficient: 0.0041
p-value: 0.9121
Significant correlation at 0.05 level: False

Regression Results:
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          25.6407      4.021      6.376      0.000      17.746      33.535
Q("Unit Price")     0.0120      0.108      0.110      0.912      -0.201       0.225



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Price_Category', y='Quantity', data=coffee_df,



Price Elasticity:
Between $30 and $35: -0.0142
Between $35 and $40: 0.2373
Between $40 and $45: -0.3057

Hypothesis 3: Regional preferences influence coffee purchasing patterns

Regional Bean Type Preferences (%):
Coffee_Bean_Type  Brazilian  Colombian  Costa Rica  Ethiopian  Guatemala
City                                                                    
Abha                   19.7       15.2        30.3       19.7       15.2
Buraidah               30.4       15.9        17.4       23.2       13.0
Dammam                 18.1       27.8        22.2       13.9       18.1
Hail                   19.5       24.1        18.4       12.6       25.3
Jeddah                 16.9       13.0        23.4       20.8       26.0
Khobar                 16.4       24.7        19.2       17.8       21.9
Mecca                  22.1       19.5        18.2       19.5       20.8
Medina                 22.5       23.9        26.8        9.9       16.9
Riyadh                 15.2       17.7        26.6     


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=city_price.values, y=city_price.index, palette='mako')



ANOVA Results for Price by City:
                sum_sq     df         F    PR(>F)
City        125.320732    9.0  0.564068  0.826983
Residual  17773.857350  720.0       NaN       NaN

Hypothesis 4: Discount usage affects purchasing behavior

T-test results for Quantity (Discount vs No Discount):
t-statistic: -0.1786
p-value: 0.8583
Significant difference at 0.05 level: False

Average quantity with discount: 25.99
Average quantity without discount: 26.18
Difference: -0.19



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Used_Discount', y='Quantity', data=coffee_df, palette='Set2')



Discount Usage by Coffee Bean Type (%):
Used_Discount     False  True 
Coffee_Bean_Type              
Brazilian          52.1   47.9
Colombian          47.4   52.6
Costa Rica         51.5   48.5
Ethiopian          50.0   50.0
Guatemala          44.6   55.4



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=discount_usage.index, y=discount_usage.values, palette='viridis')



Discount Usage by Price Category (%):
Used_Discount   False  True 
Price_Category              
Budget           52.1   47.9
Luxury           50.0   50.0
Premium          47.4   52.6
Standard         48.4   51.6



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=discount_usage_price.index, y=discount_usage_price.values,



Summary of Hypothesis Testing Results

1. Consumers are willing to pay more for premium coffee beans:
   - T-test p-value: 0.7222
   - Result: No significant difference in quantities between premium and standard beans.

2. Price influences purchase quantity:
   - Correlation coefficient: 0.0041, p-value: 0.8583
   - Result: No significant correlation between price and quantity.

3. Regional preferences influence coffee purchasing patterns:
   - Chi-square test p-value: 0.3985
   - Result: No significant relationship between city and bean type preferences.

4. Discount usage affects purchasing behavior:
   - T-test p-value: 0.8583
   - Result: No significant difference in quantities between discounted and non-discounted purchases.
