In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode

In [2]:
# Load dataset
file_path = "../Interim/cleaned_food_prices.csv"
df = pd.read_csv(file_path)

In [3]:
# View the first few rows of the dataset
print(df.head())

       country                            Region Province  City        lat  \
0  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
1  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
2  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
3  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
4  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   

          lon        Date  year  month  beans  ...  l_tomatoes  c_tomatoes  \
0  120.619362  2007-01-01  2007      1    NaN  ...       26.03       27.84   
1  120.619362  2007-02-01  2007      2    NaN  ...       26.31       26.82   
2  120.619362  2007-03-01  2007      3    NaN  ...       24.95       26.21   
3  120.619362  2007-04-01  2007      4    NaN  ...       24.93       26.74   
4  120.619362  2007-05-01  2007      5    NaN  ...       26.30       27.19   

   inflation_tomatoes  trust_tomatoes  o_food_price_index  h_f

In [4]:
# Check the structure and datatypes
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23653 entries, 0 to 23652
Data columns (total 99 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       23653 non-null  object 
 1   Region                        23653 non-null  object 
 2   Province                      23653 non-null  object 
 3   City                          23653 non-null  object 
 4   lat                           23436 non-null  float64
 5   lon                           23436 non-null  float64
 6   Date                          23653 non-null  object 
 7   year                          23653 non-null  int64  
 8   month                         23653 non-null  int64  
 9   beans                         2384 non-null   float64
 10  cabbage                       3696 non-null   float64
 11  carrots                       3747 non-null   float64
 12  eggs                          5391 non-null   float64
 13  g

In [5]:
# Get summary statistics of numerical columns
print(df.describe())

                lat           lon          year         month        beans  \
count  23436.000000  23436.000000  23653.000000  23653.000000  2384.000000   
mean      11.709358    122.876852   2015.543779      6.474654    98.049673   
std        3.640368      1.949177      5.216106      3.464248    15.231500   
min        5.029099    118.735278   2007.000000      1.000000    55.490000   
25%        8.481003    121.086142   2011.000000      3.000000    87.162500   
50%       11.570265    122.851105   2016.000000      6.000000    98.190000   
75%       14.716339    124.667869   2020.000000      9.000000   107.500000   
max       18.194082    126.207645   2025.000000     12.000000   229.540000   

          cabbage      carrots         eggs       garlic  meat_beef_chops  \
count  3696.00000  3747.000000  5391.000000  3020.000000      2702.000000   
mean     72.09049    91.267086     7.008577   169.917825       274.926121   
std      32.25439    37.874833     1.511605    60.216075        70

In [6]:
#dropping columns related to food price index
df_nofpi = df.drop(columns=['o_food_price_index', 'h_food_price_index', 'l_food_price_index', 'c_food_price_index', 'inflation_food_price_index', 'trust_food_price_index'])

In [7]:
# Convert 'Date' column to datetime format
df_nofpi['Date'] = pd.to_datetime(df_nofpi['Date'])

In [8]:
# Check the unique values of the 'Date' column (like ranges)
print(df_nofpi['Date'].min(), df_nofpi['Date'].max())

2007-01-01 00:00:00 2025-01-01 00:00:00


In [9]:
#dropping columns related to inflation
df_noinf = df_nofpi.drop(columns=['inflation_beans','inflation_cabbage', 'inflation_carrots', 'inflation_eggs', 'inflation_garlic', 'inflation_meat_beef_chops', 'inflation_meat_chicken_whole', 'inflation_meat_pork', 'inflation_onions', 'inflation_potatoes', 'inflation_rice', 'inflation_tomatoes'])

In [10]:
#dropping columns related to trust scores
df_cleaned = df_noinf.drop(columns=['trust_beans','trust_cabbage', 'trust_carrots', 'trust_eggs', 'trust_garlic', 'trust_meat_beef_chops', 'trust_meat_chicken_whole', 'trust_meat_pork', 'trust_onions', 'trust_potatoes', 'trust_rice', 'trust_tomatoes'])

In [11]:
# View the dataset after dropping columns
print(df_cleaned.head())

       country                            Region Province  City        lat  \
0  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
1  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
2  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
3  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   
4  Philippines  Cordillera Administrative region     Abra  Abra  17.600651   

          lon       Date  year  month  beans  ...  l_potatoes  c_potatoes  \
0  120.619362 2007-01-01  2007      1    NaN  ...       34.28       35.26   
1  120.619362 2007-02-01  2007      2    NaN  ...       34.21       34.21   
2  120.619362 2007-03-01  2007      3    NaN  ...       33.13       33.83   
3  120.619362 2007-04-01  2007      4    NaN  ...       33.09       33.85   
4  120.619362 2007-05-01  2007      5    NaN  ...       33.44       34.56   

   o_rice  h_rice  l_rice  c_rice  o_tomatoes  h_tomatoes  l_tomatoe

In [12]:
#dropping uneeded columns
df_unneeded = df_cleaned.drop(columns=['country', 'City', 'lat', 'lon'])

# Reshaping from wide to long format (including year and month as part of the identifier)
df_melted = df_unneeded.melt(id_vars=['Region', 'Province', 'Date', 'year', 'month'], var_name='Food_Items', value_name='Price')

In [13]:
# View the dataset after reshaping
print(df_melted.head())

                             Region Province       Date  year  month  \
0  Cordillera Administrative region     Abra 2007-01-01  2007      1   
1  Cordillera Administrative region     Abra 2007-02-01  2007      2   
2  Cordillera Administrative region     Abra 2007-03-01  2007      3   
3  Cordillera Administrative region     Abra 2007-04-01  2007      4   
4  Cordillera Administrative region     Abra 2007-05-01  2007      5   

  Food_Items  Price  
0      beans    NaN  
1      beans    NaN  
2      beans    NaN  
3      beans    NaN  
4      beans    NaN  


## Start of analysis

### 1.Exploratory Data Analysis

In [14]:
print(df_melted['Region'].unique())
print(df_melted['Province'].unique())
print(df_melted['Food_Items'].unique())

['Cordillera Administrative region' 'Region XIII' 'Region VI' 'Region V'
 'Region III' 'Autonomous region in Muslim Mindanao' 'Region IV-A'
 'Region VIII' 'Region VII' 'Region X' 'Region II' 'Region IV-B'
 'Region XII' 'Region XI' 'Region I' 'National Capital region' 'Region IX'
 'Market Average']
['Abra' 'Agusan del Norte' 'Agusan del Sur' 'Aklan' 'Albay' 'Antique'
 'Apayao' 'Aurora' 'Benguet' 'Basilan' 'Bataan' 'Batangas' 'Biliran'
 'Bohol' 'Bukidnon' 'Bulacan' 'Nueva Ecija' 'Cagayan' 'Misamis Oriental'
 'Oriental Mindoro' 'Camarines Norte' 'Masbate' 'Camiguin' 'Capiz'
 'Catanduanes' 'Southern Leyte' 'Cavite' 'Cebu' 'North Cotabato'
 'Maguindanao' 'Davao del Sur' 'Compostela Valley' 'Davao del Norte'
 'Davao Oriental' 'Dinagat Islands' 'Eastern Samar' 'Guimaras' 'Ifugao'
 'Ilocos Norte' 'Ilocos Sur' 'Iloilo' 'Isabela' 'Kalinga' 'South Cotabato'
 'La Union' 'Laguna' 'Lanao del Norte' 'Lanao del Sur' 'Leyte'
 'Pangasinan' 'Marinduque' 'Metropolitan Manila' 'Misamis Occidental'
 'Mounta

#### Measures of Central Tendency

In [15]:
grouped = df_melted.groupby(['Region', 'Province', 'Food_Items', 'year'])
print(grouped)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CD933A4BF0>


##### mean and median

In [16]:
central_tendency = grouped['Price'].agg(['mean', 'median']).reset_index()
print(central_tendency)

                                     Region         Province Food_Items  year  \
0      Autonomous region in Muslim Mindanao          Basilan      beans  2007   
1      Autonomous region in Muslim Mindanao          Basilan      beans  2008   
2      Autonomous region in Muslim Mindanao          Basilan      beans  2009   
3      Autonomous region in Muslim Mindanao          Basilan      beans  2010   
4      Autonomous region in Muslim Mindanao          Basilan      beans  2011   
...                                     ...              ...        ...   ...   
91195                           Region XIII  Surigao del Sur   tomatoes  2021   
91196                           Region XIII  Surigao del Sur   tomatoes  2022   
91197                           Region XIII  Surigao del Sur   tomatoes  2023   
91198                           Region XIII  Surigao del Sur   tomatoes  2024   
91199                           Region XIII  Surigao del Sur   tomatoes  2025   

          mean  median  
0 

##### mode

In [17]:
#calculate mode
def calculate_mode(series):
    # Drop NaN values
    valid_values = series.dropna()
    
    if valid_values.empty:  # If no valid values, return NaN
        return np.nan
    
    # Compute the mode
    result = mode(valid_values)
    
    # Handle cases where result.mode might be scalar
    if isinstance(result.mode, np.ndarray) and len(result.mode) > 0:
        return result.mode[0]
    elif np.isscalar(result.mode):  # For scalar mode
        return result.mode
    else:
        return np.nan

central_tendency['Mode'] = grouped['Price'].transform(calculate_mode)


In [18]:
print(central_tendency)

                                     Region         Province Food_Items  year  \
0      Autonomous region in Muslim Mindanao          Basilan      beans  2007   
1      Autonomous region in Muslim Mindanao          Basilan      beans  2008   
2      Autonomous region in Muslim Mindanao          Basilan      beans  2009   
3      Autonomous region in Muslim Mindanao          Basilan      beans  2010   
4      Autonomous region in Muslim Mindanao          Basilan      beans  2011   
...                                     ...              ...        ...   ...   
91195                           Region XIII  Surigao del Sur   tomatoes  2021   
91196                           Region XIII  Surigao del Sur   tomatoes  2022   
91197                           Region XIII  Surigao del Sur   tomatoes  2023   
91198                           Region XIII  Surigao del Sur   tomatoes  2024   
91199                           Region XIII  Surigao del Sur   tomatoes  2025   

          mean  median  Mod

#### Measures of Dispersion

##### Range

In [19]:
#dropping uneeded columns
df_unneeded = df_cleaned.drop(columns=['country', 'City', 'lat', 'lon', 'Region', 'Province', 'Date', 'month'])

# Reshaping from wide to long format (including year and month as part of the identifier)
df_melted = df_unneeded.melt(id_vars=['year'], var_name='Food_Items', value_name='Price')

grouped_range = df_melted.groupby(['Food_Items', 'year'])
range_df = grouped_range['Price'].agg(lambda x: x.max() - x.min()).reset_index()
range_df.rename(columns={'Price': 'Range'}, inplace=True)


# Save cleaned data to a CSV file
range_df.to_csv("range_results.csv", index=False)

##### Variance and Standard Deviation

In [20]:
dispersion_df = grouped['Price'].agg(['var', 'std']).reset_index()
dispersion_df.rename(columns={'var': 'Variance', 'std': 'Standard Deviation'}, inplace=True)

##### Interquartile Range

In [21]:
def compute_iqr(series):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    return q3 - q1

iqr_df = grouped['Price'].agg(compute_iqr).reset_index()
iqr_df.rename(columns={'Price': 'IQR'}, inplace=True)

##### combine all into one DataFrame

In [23]:
dispersion_merge_df = pd.merge(central_tendency, dispersion_df, on=['Region', 'Province', 'Food_Items', 'year'])
final_eda_df = pd.merge(dispersion_merge_df, iqr_df, on=['Region', 'Province', 'Food_Items', 'year'])

In [24]:
print(final_eda_df)

                                     Region         Province Food_Items  year  \
0      Autonomous region in Muslim Mindanao          Basilan      beans  2007   
1      Autonomous region in Muslim Mindanao          Basilan      beans  2008   
2      Autonomous region in Muslim Mindanao          Basilan      beans  2009   
3      Autonomous region in Muslim Mindanao          Basilan      beans  2010   
4      Autonomous region in Muslim Mindanao          Basilan      beans  2011   
...                                     ...              ...        ...   ...   
91195                           Region XIII  Surigao del Sur   tomatoes  2021   
91196                           Region XIII  Surigao del Sur   tomatoes  2022   
91197                           Region XIII  Surigao del Sur   tomatoes  2023   
91198                           Region XIII  Surigao del Sur   tomatoes  2024   
91199                           Region XIII  Surigao del Sur   tomatoes  2025   

          mean  median  Mod

In [25]:
# Save cleaned data to a CSV file
final_eda_df.to_csv("eda_results.csv", index=False)