In [None]:
# 1. Import Libraries and Functions

import pandas as pd
from functions_clean_data import safe_float, find_continents, replace_none

In [None]:
# 2. Load and Preprocess Raw Data

# Read the main FAOSTAT export (path is relative to the notebook's working directory)
FAOSTAT = pd.read_csv("FAOSTAT_data_2025.csv")


def _coerce_value(raw):
    """Leave float/int cells untouched; send every other cell through safe_float.

    safe_float is a project helper; per the original note it is what deals
    with interval-style entries in the 'Value' column.
    """
    if isinstance(raw, (float, int)):
        return raw
    return safe_float(raw)


# Normalise the 'Value' column cell by cell
FAOSTAT['Value'] = FAOSTAT['Value'].apply(_coerce_value)

In [None]:
# 3. Drop Irrelevant or Redundant Columns

# Code/label columns that are not used by the later cells.
# 'Flag Description' is intentionally kept: the grouping steps below read it.
columns_not_needed = [
    'Domain Code', 'Domain',
    'Element Code', 'Element',
    'Item Code',
    'Year Code',
    'Note', 'Flag',
]

# drop() returns a new frame, so the raw FAOSTAT frame stays untouched
df = FAOSTAT.drop(columns=columns_not_needed)

In [None]:
# 4. Convert and Enrich Data

# All three columns are derived from the frame as it stands before this cell,
# so they can be computed together and attached in a single assign():
#   - 'Value'      : every cell cast with float() for a consistent numeric type
#   - 'continents' : looked up from 'Area' via the project helper find_continents
#   - 'Unit'       : passed through the project helper replace_none
#                    (per the original note, None becomes a default such as "index")
df = df.assign(
    Value=df['Value'].apply(float),
    continents=df['Area'].apply(find_continents),
    Unit=df['Unit'].apply(replace_none),
)

In [None]:
# 5. Group Data for Imputation Preparation

# Collapse to one row per (Year, Item, continent, Area):
# keep the first flag description and unit, average the values.
# NOTE(review): groupby drops rows whose key is NaN by default, so any Area for
# which find_continents yields a missing value would disappear here - confirm
# that every Area maps to a continent.
area_level_aggregations = {
    "Flag Description": "first",
    'Unit': 'first',
    'Value': 'mean',
}

# as_index=False keeps the grouping keys as ordinary columns
df_1 = df.groupby(
    ['Year', 'Item', 'continents', 'Area'],
    as_index=False,
).agg(area_level_aggregations)


In [None]:
# 6. First Stage Imputation (by Year, Item, Continent)

# Items that have at least one missing 'Value'; only these need patch values
items_with_gaps = df_1.loc[df_1['Value'].isna(), 'Item'].unique()

# Continent-level mean per (Item, Year, continent), computed in one grouped
# pass instead of a per-item loop followed by pd.concat. Besides being simpler,
# this also works when nothing is missing: the filter then yields an empty
# frame and the result is an empty table, whereas pd.concat on an empty list
# raises ValueError.
# The mean skips NaN, so a group whose values are all missing stays NaN and is
# left for a later imputation stage.
means_value_item_one_year = (
    df_1[df_1['Item'].isin(items_with_gaps)]
    .groupby(['Item', 'Year', 'continents'])
    .agg({
        'Flag Description': 'first',
        'Unit': 'first',
        'Value': 'mean',
    })
    .reset_index()
)

In [None]:
# 7. Merge First Imputed Values Back

# Left-join the continent-level means onto the area-level rows; the incoming
# mean arrives as 'Value_patch' because of the suffixes. Existing values win,
# the patch only fills gaps, and the helper column is removed afterwards.
df_test = (
    df_1.merge(
        means_value_item_one_year[['Year', 'Item', 'continents', 'Value']],
        how='left',
        on=['Year', 'Item', 'continents'],
        suffixes=('', '_patch'),
    )
    .assign(Value=lambda frame: frame['Value'].fillna(frame['Value_patch']))
    .drop(columns='Value_patch')
)


In [None]:
# 8. Second Stage Imputation (by Country and Year)

# Items that still have at least one missing 'Value' after the first stage
items_still_missing = df_test.loc[df_test['Value'].isna(), 'Item'].unique()

# Aggregations shared by every roll-up level
rollup_spec = {
    'Flag Description': 'first',
    'Unit': 'first',
    'Value': 'mean',
}

# Roll up progressively: area level -> continent level -> one mean per
# (Item, Year), i.e. a mean of continent means rather than of raw rows.
# Done as one grouped chain instead of a per-item loop followed by pd.concat;
# this also works when the first stage filled every gap (the filter yields an
# empty frame and the result is an empty table), whereas pd.concat on an empty
# list raises ValueError.
# The final level keeps a 'continents' column holding the first continent seen
# in each (Item, Year) group, so the table has the same columns as before.
means_value_item_one_year = (
    df_test[df_test['Item'].isin(items_still_missing)]
    .groupby(['Item', 'Area', 'Year', 'continents']).agg(rollup_spec).reset_index()
    .groupby(['Item', 'continents', 'Year']).agg(rollup_spec).reset_index()
    .groupby(['Item', 'Year']).agg({**rollup_spec, 'continents': 'first'}).reset_index()
)

In [None]:
# 9. Merge Secondary Imputed Values into DataFrame

# The second-stage table holds one mean per (Item, Year). Its 'continents'
# column is only the first continent encountered in each group, so joining on
# it would hand the patch to that single continent and leave the gaps of every
# other continent unfilled. Join on Year and Item only, so the item/year mean
# reaches every row that is still missing a value.
df_test = df_test.merge(
    means_value_item_one_year[['Year', 'Item', 'Value']],
    on=['Year', 'Item'],
    how='left',
    suffixes=('', '_patch')
)

# Fill the remaining gaps from the patch, then remove the helper column
df_test['Value'] = df_test['Value'].fillna(df_test['Value_patch'])
df_test = df_test.drop(columns='Value_patch')

# Drop rows that are entirely empty (threshold = 1 non-NA required).
# NOTE(review): the key columns come from a groupby and are never NA, so this
# should not remove anything; if the goal is to discard rows whose 'Value'
# could not be imputed, dropna(subset=['Value']) would be needed - confirm.
df_test = df_test.dropna(axis=0, thresh=1)


In [None]:
# 10. Filter Out Irrelevant Items and Continents

items_to_drop = [
    'Prevalence of exclusive breastfeeding among infants 0-5 months of age (percent)',
    'Number of children under 5 years affected by wasting (million)',
    'Percentage of children under 5 years affected by wasting (percent)'
]
continent_to_drop = ['South_America', 'Europe', 'North_America']

# A row is removed only when BOTH conditions hold: its item is one of the
# listed items AND its continent is one of the listed continents. The listed
# items stay for other continents, and the listed continents keep all of
# their other items.
item_is_listed = df_test['Item'].isin(items_to_drop)
continent_is_listed = df_test['continents'].isin(continent_to_drop)
df_test = df_test[~(item_is_listed & continent_is_listed)]


In [None]:
# 11. Export the Cleaned Dataset

# Write the cleaned frame next to the notebook. The row index is written as the
# first (unnamed) column - pandas' default, spelled out here so that readers of
# the file know to expect it.
df_test.to_csv(path_or_buf='FAOSTAT.csv', index=True)