# Script para extraer los datos útiles de cada tabla directamente desde el archivo zip

In [1]:
import pandas as pd
import numpy as np
import zipfile
import io

# Suppress warnings.
# NOTE(review): no category is given, so this silences every warning,
# including deprecation notices from pandas.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Relative path to the FAOSTAT zip archive to process.
# Forward slashes are used because they are accepted on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
zip_path = './data/FAOSTAT/all_raw/16-sample_Emissions_livestock_E_All_Data_(Normalized).zip'

### Abrir el CSV dentro del archivo zip cuyo nombre incluye el patrón "All_Data_" y guardarlo en un DataFrame

In [3]:
# Load every CSV inside the archive whose name contains "All_Data_" into `df`.
# Frames are collected in a list and concatenated once at the end, instead of
# growing `df` with pd.concat inside the loop (which re-copies the data each time).
matched_frames = []
data_file_names = []

# Open the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List all files in the zip archive
    all_files = zip_ref.namelist()
    print("Files in the zip archive:", all_files)

    # Iterate through the file names in the zip archive
    for file_name in all_files:
        # Only the data table matches; the companion code/flag CSVs are skipped
        if "All_Data_" in file_name and file_name.endswith(".csv"):
            print(f"Found matching file: {file_name}")
            # Read the CSV straight from the archive without extracting it to disk
            with zip_ref.open(file_name) as file:
                matched_frames.append(pd.read_csv(file, encoding = "ISO-8859-1"))
            data_file_names.append(file_name)

if matched_frames:
    df = pd.concat(matched_frames, ignore_index=True)
    # After the loop `file_name` would hold the LAST archive entry, which is
    # usually not the data file. Point it at the data file that was loaded so
    # that output names derived from it refer to the right table.
    file_name = data_file_names[-1]
else:
    # Keep the original fallback (an empty frame) but say so explicitly
    df = pd.DataFrame()
    print('No CSV file matching "All_Data_" was found in the zip archive')

Files in the zip archive: ['Emissions_livestock_E_All_Data_(Normalized).csv', 'Emissions_livestock_E_AreaCodes.csv', 'Emissions_livestock_E_Flags.csv', 'Emissions_livestock_E_ItemCodes.csv']
Found matching file: Emissions_livestock_E_All_Data_(Normalized).csv


### Determinar si es una tabla de PAISES, COMIDAS, MIXTA o OTRAS

(mirando bien creo que no hay tablas solo de comidas)

In [4]:
# Classify the table by the columns and values it contains.

# A country table carries both an area identifier and a year column
condition_paises = all(col in df.columns for col in ['Area Code', 'Year'])

# A food/crop/animal table is recognised by a few well-known values in 'Item'.
# The column is checked first: tables without an 'Item' column (the country-only
# case) would otherwise raise KeyError before the classification is printed.
condition_comida = bool(
    'Item' in df.columns
    and df['Item'].isin(['Meat of chickens; fresh or chilled', 'Tapioca of cassava', 'Wheat', 'Barley', 'Cattle']).any()
)

if condition_paises and not condition_comida:
    print("La tabla parece ser SOLO de PAISES")
elif condition_comida and condition_paises:
    print("La tabla parece ser de COMIDAS (comidas, cultivos o animales) Y PAISES")
else:
    print("Ni idea que es esta tabla")

La tabla parece ser de COMIDAS (comidas, cultivos o animales) Y PAISES


In [5]:
# Show 10 randomly chosen rows to get a feel for the columns and values
df.sample(10)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
54112,3,'008,Albania,1749,'F1749,Sheep and Goats,72300,Manure left on pasture (Emissions N2O),2009,2009,3050,FAO TIER 1,kt,0.8168,E,
1502314,61,'226,Equatorial Guinea,1053,'F1053,"Chickens, broilers",723811,Manure applied to soils that volatilises (N co...,1996,1996,3050,FAO TIER 1,kg,2938.4982,E,
5400524,5204,'013,Central America,976,'02122,Sheep,72254,Enteric fermentation (Emissions CH4),2050,2050,3050,FAO TIER 1,kt,49.1449,F,
696633,29,'108,Burundi,2029,'F2029,Poultry Birds,723812,Manure applied to soils that leaches (N content),1970,1970,3050,FAO TIER 1,kg,52353.702,E,
6071402,5802,'432,Land Locked Developing Countries,1749,'F1749,Sheep and Goats,72360,Manure left on pasture (Indirect emissions N2O),1968,1968,3050,FAO TIER 1,kt,7.8075,E,
4109273,200,'702,Singapore,1053,'F1053,"Chickens, broilers",72431,Livestock total (Emissions N2O),1983,1983,3050,FAO TIER 1,kt,0.002,E,
4288567,38,'144,Sri Lanka,1755,'F1755,All Animals,72306,Manure management (Emissions N2O),1998,1998,3050,FAO TIER 1,kt,0.0654,E,
894381,351,'159,China,960,'F0960,"Cattle, dairy",72360,Manure left on pasture (Indirect emissions N2O),2006,2006,3050,FAO TIER 1,kt,1.1622,E,
3755106,146,'498,Republic of Moldova,961,'F0961,"Cattle, non-dairy",723802,Manure left on pasture that leaches (N content),2009,2009,3050,FAO TIER 1,kg,172127.5625,E,
208211,10,'036,Australia,1079,'02152,Turkeys,72341,Manure applied to soils (Direct emissions N2O),1965,1965,3050,FAO TIER 1,kt,0.0053,E,


Ver las diferentes combinaciones de item y element que hay

In [6]:
# Lift the row-display limit so the whole frequency table is rendered
pd.set_option('display.max_rows', None)
# Count rows per (Item, Element) pair and list the most frequent pairs first
df.groupby('Item')['Element'].value_counts().to_frame().sort_values(by = 'count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Item,Element,Unnamed: 2_level_1
All Animals,Livestock total (Emissions CH4),16208
All Animals,Manure management (Emissions CH4),16123
All Animals,Enteric fermentation (Emissions CH4),16121
All Animals,Livestock total (Emissions N2O),16012
All Animals,Manure management (Emissions N2O),16012
All Animals,Manure applied to soils (Direct emissions N2O),15435
All Animals,Manure applied to soils (N content),15434
All Animals,Manure management (Direct emissions N2O),15422
Poultry Birds,Stocks,15398
All Animals,"Manure management (manure treated, N content)",15372


# Convertir tablas mediante "group by", SUMANDO el valor para cada combinación

### Seleccionando elementos e items especificos

Por ejemplo, para esta tabla solo:

Items: Chickens, Camels

Element: Livestock total (Emissions CH4)

In [15]:
# Elements and Items to keep for this table
selected_Elements = ['Livestock total (Emissions CH4)']
selected_Items = ['Chickens', 'Camels']

# Keep only the rows whose Element and Item are both in the selections
element_mask = df['Element'].isin(selected_Elements)
item_mask = df['Item'].isin(selected_Items)
filtered_df = df[element_mask & item_mask]

# Sum Value for every (area, year, item, element) combination
group_keys = ['Area Code', 'Area', 'Year', 'Item', 'Element']
grouped = filtered_df.groupby(group_keys)['Value'].sum().reset_index()

# Spread each (Item, Element) pair into its own column
pivot_df = grouped.pivot_table(
    index=['Area Code', 'Area', 'Year'],
    columns=['Item', 'Element'],
    values='Value',
    aggfunc='sum',
).reset_index()

# Collapse the two-level column labels into "Item_Element".
# The index columns have an empty second level and keep their plain name.
flat_names = []
for col in pivot_df.columns.values:
    if col[1]:
        flat_names.append('_'.join(col).strip())
    else:
        flat_names.append(col[0])
pivot_df.columns = flat_names

**El código genera una columna nueva para cada combinación de item y element**

In [16]:
# Restore the default row-display limit before showing the result
pd.reset_option('display.max_rows')
# Display the pivoted table
pivot_df

Unnamed: 0,Area Code,Area,Year,Camels_Livestock total (Emissions CH4),Chickens_Livestock total (Emissions CH4)
0,1,Armenia,1992,,0.0893
1,1,Armenia,1993,,0.0266
2,1,Armenia,1994,,0.0269
3,1,Armenia,1995,,0.0271
4,1,Armenia,1996,,0.0272
...,...,...,...,...,...
14139,5873,OECD,2019,0.3455,237.0858
14140,5873,OECD,2020,0.3294,236.4638
14141,5873,OECD,2021,0.3260,235.3482
14142,5873,OECD,2030,0.2962,236.9513


**Guardar la tabla si queremos esta opción**

Si no, continuar

In [9]:
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')

### Seleccionando TODAS las combinaciones de item y element que hay

In [10]:
# Build one column per (Item, Element) combination present in the table.

# Extract all unique items and elements
unique_items = df['Item'].unique()
unique_elements = df['Element'].unique()

# Every row's Item and Element is, by construction, among the unique values
# taken from the same columns, so filtering on them would keep every row and
# only copy the whole frame. Use `df` directly instead.
filtered_df = df

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index into "Item_Element" names; the index
# columns have an empty second level and keep their plain name
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [11]:
# Display the pivoted table with one column per (Item, Element) combination
pivot_df

Unnamed: 0,Area Code,Area,Year,All Animals_Emissions (N2O) (Manure applied),All Animals_Enteric fermentation (Emissions CH4),All Animals_Indirect emissions (N2O that leaches) (Manure applied),All Animals_Indirect emissions (N2O that leaches) (Manure on pasture),All Animals_Indirect emissions (N2O that volatilises) (Manure applied),All Animals_Indirect emissions (N2O that volatilises) (Manure on pasture),All Animals_Livestock total (Emissions CH4),...,Turkeys_Manure left on pasture (Indirect emissions N2O),Turkeys_Manure left on pasture (N content),Turkeys_Manure left on pasture that leaches (N content),Turkeys_Manure left on pasture that volatilises (N content),Turkeys_Manure management (Direct emissions N2O),Turkeys_Manure management (Emissions CH4),Turkeys_Manure management (Emissions N2O),Turkeys_Manure management (Indirect emissions N2O),"Turkeys_Manure management (manure treated, N content)",Turkeys_Stocks
0,1,Armenia,1992,0.1258,26.8300,0.0199,0.1595,0.0177,0.1417,28.0780,...,0.0035,5.216171e+05,1.564851e+05,1.043234e+05,0.0017,0.0040,0.0017,0.0000,2.130549e+05,400000.0
1,1,Armenia,1993,0.0642,23.9849,0.0101,0.1371,0.0090,0.1219,24.9554,...,0.0026,3.912128e+05,1.173639e+05,7.824257e+04,0.0013,0.0030,0.0013,0.0000,1.597912e+05,300000.0
2,1,Armenia,1994,0.0637,23.6415,0.0101,0.1330,0.0089,0.1183,24.6155,...,0.0026,3.912128e+05,1.173639e+05,7.824257e+04,0.0013,0.0030,0.0013,0.0000,1.597912e+05,300000.0
3,1,Armenia,1995,0.0630,23.3360,0.0099,0.1293,0.0088,0.1150,24.3106,...,0.0017,2.608086e+05,7.824257e+04,5.216171e+04,0.0008,0.0020,0.0008,0.0000,1.065274e+05,200000.0
4,1,Armenia,1996,0.0624,23.3053,0.0098,0.1287,0.0088,0.1144,24.2789,...,0.0017,2.608086e+05,7.824257e+04,5.216171e+04,0.0008,0.0020,0.0008,0.0000,1.065274e+05,200000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14212,5873,OECD,2019,190.4058,20860.9811,30.0641,53.0452,26.7236,47.1513,24968.3867,...,0.2596,3.887458e+07,1.166237e+07,7.774916e+06,2.7055,15.9601,2.7567,0.0512,3.498877e+08,211665832.0
14213,5873,OECD,2020,190.1513,20687.2388,30.0239,52.6270,26.6879,46.7796,24786.3914,...,0.2609,3.906094e+07,1.171828e+07,7.812189e+06,2.6847,15.8414,2.7353,0.0506,3.471659e+08,210285394.0
14214,5873,OECD,2021,188.7676,20738.8354,29.8054,53.1869,26.4937,47.2772,24789.9359,...,0.2593,3.883273e+07,1.164982e+07,7.766547e+06,2.6335,15.5198,2.6824,0.0489,3.403803e+08,206466645.0
14215,5873,OECD,2030,195.7446,22540.5992,30.9070,61.0169,27.4729,54.2372,26619.6309,...,0.3457,5.176058e+07,1.552817e+07,1.035212e+07,3.2249,18.8967,3.2933,0.0684,4.184029e+08,255985609.0


# Alternativa para este caso en particular:

La tabla final tendría 252 columnas (252 combinaciones de item y element). No nos interesan tantos datos de cada tabla.

Se pueden seleccionar todos los items pero solo 1 Element:

In [12]:
# Keep every item but restrict the table to the listed element(s).
unique_items = df['Item'].unique()
unique_elements = ['Emissions (CO2eq) (AR5)']

# Report requested elements that this table does not contain: filtering on
# them yields an empty pivot (only the index columns survive) with no error.
# print() is used so the message always appears in the cell output.
available_elements = set(df['Element'].unique())
missing_elements = [element for element in unique_elements if element not in available_elements]
if missing_elements:
    print(f"Elements not present in this table: {missing_elements}")

# Filter the DataFrame by element only; `unique_items` comes from df itself,
# so an additional filter on 'Item' would not exclude any row
filtered_df = df[df['Element'].isin(unique_elements)]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index into "Item_Element" names; the index
# columns have an empty second level and keep their plain name
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [13]:
# Display the pivoted table for the selected element(s)
pivot_df

Unnamed: 0,Area Code,Area,Year


# A partir de aquí se pueden eliminar o retener las columnas que sean útiles

In [14]:
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')