In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

def clean_data(data):
    """
    Return a cleaned copy of the food-loss data.

    Steps, in order:
      1. Coerce 'loss_percentage' to numeric; unparseable values become NaN.
      2. Drop rows where 'loss_percentage' or 'commodity' is missing.
      3. Drop fully duplicated rows.

    The input frame is not modified.

    Parameters:
        data: DataFrame containing 'loss_percentage' and 'commodity' columns.

    Returns:
        A new DataFrame with a numeric, NaN-free 'loss_percentage' column.

    Raises:
        KeyError: if either required column is absent.
    """
    required_columns = ['loss_percentage', 'commodity']

    # Work on a copy so the caller's frame is never mutated and the column
    # assignment below cannot hit a chained-assignment warning.
    cleaned = data.copy()

    # Coerce BEFORE dropping missing values: coercion turns non-numeric
    # entries into NaN, and those rows must be removed as well.
    cleaned['loss_percentage'] = pd.to_numeric(
        cleaned['loss_percentage'], errors='coerce'
    )
    cleaned = cleaned.dropna(subset=required_columns)
    return cleaned.drop_duplicates()

def chi_square_test(data, categorical_column, value_column):
    """
    Run a chi-square test of independence between two columns.

    The two columns are cross-tabulated into a table of observed counts,
    which is then passed to scipy's chi2_contingency.

    Parameters:
        data: DataFrame holding both columns.
        categorical_column: name of the column used for the table rows.
        value_column: name of the column used for the table columns.

    Returns:
        dict with keys "Chi2" (test statistic), "p-value", "DOF"
        (degrees of freedom) and "Expected Frequencies" (array of expected
        counts, same shape as the observed table).
    """
    row_factor = data[categorical_column]
    col_factor = data[value_column]
    observed = pd.crosstab(row_factor, col_factor)

    statistic, p_value, degrees_of_freedom, expected_counts = chi2_contingency(observed)

    summary = {}
    summary["Chi2"] = statistic
    summary["p-value"] = p_value
    summary["DOF"] = degrees_of_freedom
    summary["Expected Frequencies"] = expected_counts
    return summary

if __name__ == "__main__":
    # Load the food-loss CSV from a fixed local path and clean it.
    data = pd.read_csv(r"C:\Users\dshah\OneDrive\Documents/cleaned_FAO_food_loss.csv")
    cleaned_data = clean_data(data)

    # Bucket the loss percentage into labelled ranges. pd.cut builds
    # right-closed intervals and, by default, leaves the very first left edge
    # open, so a loss of exactly 0 would get no category (NaN) and silently
    # fall out of the analysis. include_lowest=True keeps it in '0-10'.
    cleaned_data["loss_category"] = pd.cut(
        cleaned_data["loss_percentage"],
        bins=[0, 10, 20, 30, 50, 100],
        labels=['0-10', '10-20', '20-30', '30-50', '50+'],
        include_lowest=True,
    )

    # Test whether the loss category is independent of the commodity.
    result = chi_square_test(cleaned_data, "commodity", "loss_category")
    print(result)


{'Chi2': np.float64(791.8564663094187), 'p-value': np.float64(6.859645986722733e-94), 'DOF': 134, 'Expected Frequencies': array([[8.19438445e+00, 8.05615551e-01],
       [1.82097432e+00, 1.79025678e-01],
       [2.54936405e+01, 2.50635949e+00],
       [2.64041277e+01, 2.59587233e+00],
       [7.28389729e+00, 7.16102712e-01],
       [9.10487161e+00, 8.95128390e-01],
       [2.27621790e+01, 2.23782097e+00],
       [2.23979842e+02, 2.20201584e+01],
       [8.92277418e+01, 8.77225822e+00],
       [7.28389729e+00, 7.16102712e-01],
       [1.27468203e+01, 1.25317975e+00],
       [1.18363331e+01, 1.16366691e+00],
       [8.19438445e+00, 8.05615551e-01],
       [4.55243581e+00, 4.47564195e-01],
       [1.72992561e+01, 1.70074394e+00],
       [3.64194864e+00, 3.58051356e-01],
       [1.18363331e+01, 1.16366691e+00],
       [1.54782817e+01, 1.52171826e+00],
       [1.72992561e+01, 1.70074394e+00],
       [7.28389729e+00, 7.16102712e-01],
       [3.18670506e+01, 3.13294936e+00],
       [1.8209743