In [250]:
import pandas as pd

In [251]:
# Load the FAOSTAT export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="FAOSTAT.csv")

df.head(n=5)

Unnamed: 0,Column1,Year,Item,continents,Area,Flag Description,Unit,Value
0,0,2000,Average dietary energy requirement (kcal/cap/day),Africa,Algeria,Estimated value,kcal/cap/d,2308.0
1,1,2000,Average dietary energy requirement (kcal/cap/day),Africa,Angola,Estimated value,kcal/cap/d,2121.0
2,2,2000,Average dietary energy requirement (kcal/cap/day),Africa,Benin,Estimated value,kcal/cap/d,2165.0
3,3,2000,Average dietary energy requirement (kcal/cap/day),Africa,Botswana,Estimated value,kcal/cap/d,2286.0
4,4,2000,Average dietary energy requirement (kcal/cap/day),Africa,Burkina Faso,Estimated value,kcal/cap/d,2183.0


In [252]:
# List the distinct flag labels present in the data (in order of first appearance).
pd.unique(df["Flag Description"])

array(['Estimated value', 'Figure from international organizations',
       'Missing value', 'Missing value; suppressed', 'Official figure'],
      dtype=object)

In [253]:
# Drop rows whose flag description is "Missing value" or
# "Missing value; suppressed"; every other flag is kept.
df = df[~df['Flag Description'].isin(["Missing value", "Missing value; suppressed"])]

In [254]:
# Reshape long -> wide: one column per Item, one row per
# (Year, continents, Area, Flag Description) combination.
# NOTE: pivot_table aggregates duplicate key/Item pairs with its default
# aggfunc (mean).
pivot_df = df.pivot_table(
    values="Value",
    index=["Year", "continents", "Area", "Flag Description"],
    columns="Item",
).reset_index()

In [255]:
# Expand interval years for richer dataset

## Input is a series
def expand_year(row):
    """Expand a row whose ``Year`` is an interval into one row per year.

    Parameters
    ----------
    row : pandas.Series
        One record containing a ``"Year"`` entry. An interval is a string of
        the form ``"<start>-<end>"`` (e.g. ``"2000-2002"``); any other value
        is treated as a single year.

    Returns
    -------
    list of pandas.Series
        For an interval: one copy of ``row`` per year from start to end
        (inclusive), each with ``Year`` set to that year as a string.
        Otherwise a one-element list. Whole-number years (``2001`` or
        ``2001.0``) are converted to strings so they carry the same type as
        the years produced from intervals; any other value is returned
        unchanged.

    Raises
    ------
    ValueError
        If a string containing ``"-"`` does not split into exactly two
        integers.
    """
    y = row["Year"]
    if isinstance(y, str) and "-" in y:
        start, end = map(int, y.split("-"))
        expanded = []
        for yr in range(start, end + 1):
            # Copy so the caller's row is never modified.
            row_copy = row.copy()
            row_copy["Year"] = str(yr)
            expanded.append(row_copy)
        return expanded

    # Single year: interval rows come back with string years, so a numeric
    # 2001 here would not compare equal to the "2001" generated from
    # "2000-2002" and the two would land in different groups downstream.
    is_whole_number = pd.api.types.is_integer(y) or (
        pd.api.types.is_float(y) and float(y).is_integer()
    )
    if is_whole_number:
        # astype(object) returns a new Series that can hold the string year
        # even when the incoming row has a numeric dtype.
        row_copy = row.astype(object)
        row_copy["Year"] = str(int(y))
        return [row_copy]
    return [row]
    
   

In [256]:
# Show what a single record looks like when iterating row by row:
# restrict to the first row instead of breaking out of a full iteration.
for _, row in df.head(1).iterrows():
    print(row)

Column1                                                             0
Year                                                             2000
Item                Average dietary energy requirement (kcal/cap/day)
continents                                                     Africa
Area                                                          Algeria
Flag Description                                      Estimated value
Unit                                                       kcal/cap/d
Value                                                          2308.0
Name: 0, dtype: object


In [257]:
# Run every pivoted row through expand_year and flatten the per-row lists
# into one list of row Series.
expanded_rows = [
    year_row
    for _, row in pivot_df.iterrows()
    for year_row in expand_year(row)
]

# Build the expanded DataFrame from the collected rows in one go.
df_expanded = pd.DataFrame(expanded_rows)

In [258]:
# Columns that together identify one logical record.
key_cols = ["Year", "continents", "Area", "Flag Description"]

# Collapse rows sharing the same key, keeping the first non-null value
# found in each remaining column.
df_merged = df_expanded.groupby(key_cols, as_index=False).first()

In [259]:
# df_merged.to_csv("Pivotted_considering_flag.csv",index=False)

## Break

In [260]:
# Maps each value column that has data under more than one flag to the list
# of flags it appears under.
conflict_columns = {}

# Every column that is not an identifier is a value column.
# 'Flag Rank' is excluded as well: a later cell adds it to df_merged in place,
# so re-running this cell afterwards would otherwise treat the helper column
# as data, flag it as conflicted and duplicate it in the later column selection.
value_columns = [
    col for col in df_merged.columns
    if col not in ['Year', 'Area', 'continents', 'Flag Description', 'Flag Rank']
]

for col in value_columns:
    # Only rows where this column actually holds a value matter.
    sub_df = df_merged[[col, 'Flag Description']].dropna(subset=[col])
    flags_present = sub_df['Flag Description'].unique()

    # More than one flag supplying values for the same column is a conflict.
    if len(flags_present) > 1:
        conflict_columns[col] = flags_present.tolist()

In [261]:
# conflict_columns

In [267]:
# Split the value columns into those reported under several flags and the rest.
conflicted_cols = list(conflict_columns)
non_conflicted_cols = [col for col in value_columns if col not in conflict_columns]

# Flag priority: a lower rank wins when several flags supply a value.
flag_priority = {
    'Official figure': 0,
    'Estimated value': 1,
    # Not really needed because it does not conflict with other flags
    'Figure from international organizations': 2,
}

# Identifier columns of the final table (flag no longer part of the key).
group_keys = ['Year', 'continents', 'Area']

# Attach the numeric rank of each row's flag.
df_merged['Flag Rank'] = df_merged['Flag Description'].map(flag_priority)

# Conflicted columns: order rows by rank within each key so that .first()
# picks, per column, the non-null value from the best-ranked flag.
df_conflicted = df_merged[group_keys + ['Flag Rank'] + conflicted_cols].copy()
df_conflicted_cleaned = (
    df_conflicted
    .sort_values(by=group_keys + ['Flag Rank'])
    .groupby(group_keys, as_index=False)
    .first()
)

# Non-conflicted columns: plain first non-null value per key.
df_non_conflicted = (
    df_merged[group_keys + non_conflicted_cols]
    .groupby(group_keys, as_index=False)
    .first()
)

# Recombine both halves on the key and discard the helper rank column.
df_final = (
    df_non_conflicted
    .merge(df_conflicted_cleaned, on=group_keys, how='outer')
    .drop(columns=['Flag Rank'])
)

# Keys first, then the remaining columns in alphabetical order.
other_cols = sorted(col for col in df_final.columns if col not in group_keys)

df_final1 = df_final[group_keys + other_cols]


In [273]:
# Write the final table. index=False keeps the positional row index out of
# the file; otherwise it is written as an unnamed first column.
df_final1.to_csv("Pivotted_final_updated.csv", index=False)