In [1]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro
from scipy.stats import kruskal
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import levene

In [2]:
# Load dataset
file_path = "../Interim/cleaned_food_prices.csv"
df = pd.read_csv(file_path)

### prepare data

In [3]:
#dropping columns related to food price index
df_nofpi = df.drop(columns=['country','Province', 'City', 'lat', 'lon', 'month', 'o_food_price_index', 'h_food_price_index', 'l_food_price_index', 'c_food_price_index', 'inflation_food_price_index', 'trust_food_price_index'])

In [4]:
# Convert 'Date' column to datetime format
df_nofpi['Date'] = pd.to_datetime(df_nofpi['Date'])

In [5]:
#dropping columns related to inflation
df_noinf = df_nofpi.drop(columns=['inflation_beans','inflation_cabbage', 'inflation_carrots', 'inflation_eggs', 'inflation_garlic', 'inflation_meat_beef_chops', 'inflation_meat_chicken_whole', 'inflation_meat_pork', 'inflation_onions', 'inflation_potatoes', 'inflation_rice', 'inflation_tomatoes'])

In [6]:
#dropping columns related to trust scores
df_cleaned = df_noinf.drop(columns=['trust_beans','trust_cabbage', 'trust_carrots', 'trust_eggs', 'trust_garlic', 'trust_meat_beef_chops', 'trust_meat_chicken_whole', 'trust_meat_pork', 'trust_onions', 'trust_potatoes', 'trust_rice', 'trust_tomatoes'])

In [7]:
# Reshaping from wide to long format (including year and month as part of the identifier)
df_melted = df_cleaned.melt(id_vars=['Region', 'Date', 'year'], var_name='Food_Items', value_name='Price')

## Start Analysis

### One-way ANOVA

#### Check if one-way ANOVA is appropriate

##### one-way ANOVA can only be used of normality and homogeneity of variances holds. If not, there are two test that can be used, namely, Welch's ANOVA and Kruskall-Wallis Test.
##### * Welch's ANOVA can be used when your data is normally distributed, but the variances are unequal.
##### * Kruskall-Wallis Test, on the other hand,  is used when your data is non-normal, ordinal, or you can’t assume equal variances.


##### 1. Normality

In [8]:
# Grouping by region and year
grouped = df_melted.groupby(['year', 'Food_Items'])['Price'].apply(list)

# Shapiro-Wilk Test for each group
alpha = 0.05  # Significance level
for group, prices in grouped.items():
    stat, p = shapiro(prices)
    result = "Pass" if p > alpha else "Shapiro-Wilk Test Failed"
    print(f'{group}: W-statistic={stat:.4f}, p-value={p:.4f}, Result={result}')

(2007, 'beans'): W-statistic=nan, p-value=nan, Result=Shapiro-Wilk Test Failed
(2007, 'c_beans'): W-statistic=0.9744, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_cabbage'): W-statistic=0.9698, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_carrots'): W-statistic=0.9208, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_eggs'): W-statistic=0.7932, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_garlic'): W-statistic=0.8767, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_meat_beef_chops'): W-statistic=0.9786, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_meat_chicken_whole'): W-statistic=0.9837, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_meat_pork'): W-statistic=0.9850, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_onions'): W-statistic=0.9738, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_potatoes'): W-statistic=0.9533, p-value=0.0000, Result=Shapiro-Wilk Test Failed
(2007, 'c_rice')

##### only few passed, and most failed the test. But to be sure, Homogeneity of variances must be tested.

##### 2. Homogeneity of Variance

In [9]:
# Group the data by year and Food_Items
grouped_data = df_melted.groupby(['year', 'Food_Items'])

for (year, food_item), group in grouped_data:
    # Extract prices for all regions within the group
    region_prices = [group[group['Region'] == region]['Price'].values 
                     for region in group['Region'].unique()] 
    
    if len(region_prices) > 1:  # Ensure there's more than one region
        try:
            stat, p = levene(*region_prices)
            if p > 0.05:  # Levene's test not significant (p > 0.05)
                print(f'{food_item} ({year}): Levene\'s Test passed (p-value: {p:.4f})')
            else:
                print(f'{food_item} ({year}): Levene\'s Test failed (p-value: {p:.4f})')
        except ValueError:
            print(f"{food_item} ({year}): Levene's test failed. "
                  "Possible issue with data (e.g., all values are the same).")
    else:
        print(f'{food_item} ({year}): Only one region, cannot apply Levene\'s test')

beans (2007): Levene's Test failed (p-value: nan)
c_beans (2007): Levene's Test failed (p-value: 0.0000)
c_cabbage (2007): Levene's Test failed (p-value: 0.0000)
c_carrots (2007): Levene's Test failed (p-value: 0.0000)
c_eggs (2007): Levene's Test failed (p-value: 0.0000)
c_garlic (2007): Levene's Test failed (p-value: 0.0000)
c_meat_beef_chops (2007): Levene's Test failed (p-value: 0.0000)
c_meat_chicken_whole (2007): Levene's Test failed (p-value: 0.0000)
c_meat_pork (2007): Levene's Test failed (p-value: 0.0025)
c_onions (2007): Levene's Test failed (p-value: 0.0000)
c_potatoes (2007): Levene's Test failed (p-value: 0.0000)
c_rice (2007): Levene's Test failed (p-value: 0.0004)
c_tomatoes (2007): Levene's Test failed (p-value: 0.0000)
cabbage (2007): Levene's Test failed (p-value: nan)
carrots (2007): Levene's Test failed (p-value: nan)
eggs (2007): Levene's Test failed (p-value: nan)
garlic (2007): Levene's Test failed (p-value: nan)
h_beans (2007): Levene's Test failed (p-value: 0.

using Levene's test, all regions per item and year failed to show homogeneity in their Variances

This means the use of Kruskall-Wallis test for this dataset.

In [10]:
# Group the data by 'year', 'Food_Items', and 'Region' and collect prices into lists
grouped_data = df_melted.groupby(['year', 'Food_Items', 'Region'])['Price'].apply(list).reset_index()

# Get unique combinations of 'year' and 'Food_Items'
unique_combinations = grouped_data.groupby(['year', 'Food_Items'])

# Perform Kruskal-Wallis test for each combination of year and food item across regions
results = []

for (year, food_item), group in unique_combinations:
    # Extract price data for each region
    price_groups = [prices for prices in group['Price']]
    
    # Check if there are at least two regions to compare
    if len(price_groups) > 1:
        # Perform Kruskal-Wallis test
        stat, p_value = kruskal(*price_groups)
        results.append({
            'Year': year,
            'Food_Item': food_item,
            'H-statistic': stat,
            'p-value': p_value,
            'Result': 'Significant' if p_value < 0.05 else 'Not Significant'
        })
    else:
        results.append({
            'Year': year,
            'Food_Item': food_item,
            'H-statistic': None,
            'p-value': None,
            'Result': 'Not enough regions to test'
        })

# Convert results into a DataFrame for better visualization
results_df = pd.DataFrame(results)

# Display the results
results_df.to_csv("hypothesis_testing_result.csv", index=False)

##### upon checking the results, all of the regions per year and item have significant differences, which means that the prices of every item in the region per year are different, which may be due to various reasons like seasonality, transportation, and agricultural reasons.