# Script para extraer datos útiles de cada tabla directamente desde el formato zip

In [1]:
import pandas as pd
import numpy as np
import zipfile
import io

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation / FutureWarning messages.
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Location of the FAOSTAT archive, relative to the notebook's working directory.
# Forward slashes are used so the same literal resolves on Windows, Linux and macOS
# (a backslash-separated path is only understood on Windows).
zip_path = './data/FAOSTAT/all_raw/Emissions_Totals_E_All_Data_(Normalized).zip'

### Abrir el CSV dentro del archivo zip cuyo nombre incluye el patrón "All_Data_" y guardarlo en un DataFrame

In [8]:
# Load every CSV in the archive whose name contains "All_Data_" into one DataFrame.
# Frames are collected in a list and concatenated once at the end, rather than growing
# `df` with pd.concat (starting from an empty frame) on every iteration.
matching_frames = []
matching_files = []

# Open the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List all files in the zip archive
    all_files = zip_ref.namelist()
    print("Files in the zip archive:", all_files)

    # Iterate through the file names in the zip archive
    for file_name in all_files:
        # Keep only CSV members whose name contains the pattern "All_Data_"
        if "All_Data_" in file_name and file_name.endswith(".csv"):
            print(f"Found matching file: {file_name}")
            # Read the CSV straight from the archive, without extracting it to disk
            with zip_ref.open(file_name) as file:
                matching_frames.append(pd.read_csv(file, encoding="ISO-8859-1"))
            matching_files.append(file_name)

if matching_frames:
    df = pd.concat(matching_frames, ignore_index=True)
    # Leave `file_name` naming the data CSV that was loaded instead of whichever archive
    # member happened to be listed last; the (commented-out) export cells build the
    # output file name from it.
    file_name = matching_files[-1]
else:
    # Nothing matched: still end up with an empty DataFrame, but say so explicitly
    # instead of failing later with an unrelated error.
    df = pd.DataFrame()
    print("No CSV containing 'All_Data_' was found in the archive")

Files in the zip archive: ['Emissions_Totals_E_All_Data_(Normalized).csv', 'Emissions_Totals_E_AreaCodes.csv', 'Emissions_Totals_E_Flags.csv']
Found matching file: Emissions_Totals_E_All_Data_(Normalized).csv


### Determinar si es una tabla de PAISES, COMIDAS, MIXTA o OTRAS

(mirando bien, creo que no hay tablas solo de comidas)

In [9]:
# Classify the loaded table from its contents:
#   - "countries" table: has both the 'Area Code' and 'Year' columns;
#   - "food" table: its 'Item' column contains at least one of a few known
#     food / crop / animal names.
condition_paises = all(col in df.columns for col in ['Area Code', 'Year'])

# Guard the lookup: a table without an 'Item' column cannot be a food table, and
# indexing df['Item'] directly would raise KeyError instead of reaching the fallback.
condition_comida = 'Item' in df.columns and bool(
    df['Item'].isin(['Meat of chickens; fresh or chilled', 'Tapioca of cassava', 'Wheat', 'Barley', 'Cattle']).any()
)

if condition_paises and not condition_comida:
    print("La tabla parece ser SOLO de PAISES")
elif condition_comida and condition_paises:
    print("La tabla parece ser de COMIDAS (comidas, cultivos o animales) Y PAISES")
else:
    # Also reached by a food-only table (food items but no area/year columns)
    print("Ni idea que es esta tabla")

La tabla parece ser SOLO de PAISES


In [10]:
# Preview the first rows to see which columns the loaded table has
df.head()

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Source Code,Source,Unit,Value,Flag,Note
0,2,'004,Afghanistan,5064,Crop Residues,7234,Direct emissions (N2O),1961,1961,3050,FAO TIER 1,kt,0.8762,E,
1,2,'004,Afghanistan,5064,Crop Residues,7234,Direct emissions (N2O),1962,1962,3050,FAO TIER 1,kt,0.8829,E,
2,2,'004,Afghanistan,5064,Crop Residues,7234,Direct emissions (N2O),1963,1963,3050,FAO TIER 1,kt,0.8236,E,
3,2,'004,Afghanistan,5064,Crop Residues,7234,Direct emissions (N2O),1964,1964,3050,FAO TIER 1,kt,0.8882,E,
4,2,'004,Afghanistan,5064,Crop Residues,7234,Direct emissions (N2O),1965,1965,3050,FAO TIER 1,kt,0.8978,E,


Ver las diferentes combinaciones de item y element que hay

In [17]:
# Lift the row-display limit so that every Item/Element pair is listed
pd.set_option('display.max_rows', None)

# Number of rows per (Item, Element) pair, most frequent first
(
    df.groupby('Item')['Element']
    .value_counts()
    .to_frame()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Item,Element,Unnamed: 2_level_1
IPCC Agriculture,Emissions (CO2eq) (AR5),17095
IPCC Agriculture,Emissions (N2O),17057
IPCC Agriculture,Emissions (CO2eq) from N2O (AR5),17057
IPCC Agriculture,Emissions (CO2eq) from CH4 (AR5),17019
IPCC Agriculture,Emissions (CH4),17019
Agricultural Soils,Emissions (N2O),16808
Agricultural Soils,Emissions (CO2eq) from N2O (AR5),16808
Agricultural Soils,Emissions (CO2eq) (AR5),16808
Manure Management,Emissions (CO2eq) (AR5),16286
Manure Management,Emissions (CO2eq) from CH4 (AR5),16247


# Convertir tablas mediante "group by", SUMANDO el valor para cada combinación

### Seleccionando elementos e items especificos

Por ejemplo para esta tabla solo:

Items: Enteric Fermentation, Agrifood systems

Element: Emissions (CO2eq) (AR5)

In [18]:
# Elements and Items to keep for this table
selected_Elements = ['Emissions (CO2eq) (AR5)']
selected_Items = ['Enteric Fermentation', 'Agrifood systems']

# Keep only the rows whose Element and Item are both among the selections
element_mask = df['Element'].isin(selected_Elements)
item_mask = df['Item'].isin(selected_Items)
filtered_df = df[element_mask & item_mask]

# One summed Value per (area, year, item, element) combination.
# NOTE(review): values are summed regardless of the 'Source' column -- confirm that
# adding up rows from different sources is intended.
group_keys = ['Area Code', 'Area', 'Year', 'Item', 'Element']
grouped = filtered_df.groupby(group_keys)['Value'].sum().reset_index()

# Spread every (Item, Element) pair into its own column: one row per area and year
pivot_df = grouped.pivot_table(
    index=['Area Code', 'Area', 'Year'],
    columns=['Item', 'Element'],
    values='Value',
    aggfunc='sum',
).reset_index()

# Collapse the two-level column labels into plain strings: "Item_Element" for the value
# columns, and the bare first-level name when the second level is empty.
flat_names = []
for col in pivot_df.columns.values:
    if col[1]:
        flat_names.append('_'.join(col).strip())
    else:
        flat_names.append(col[0])
pivot_df.columns = flat_names

**El código genera una columna nueva para cada combinación de item y element**

In [20]:
# Restore the default row-display limit (it was lifted earlier to list every Item/Element pair)
pd.reset_option('display.max_rows')
# Display the pivoted table
pivot_df

Unnamed: 0,Area Code,Area,Year,Agrifood systems_Emissions (CO2eq) (AR5),Enteric Fermentation_Emissions (CO2eq) (AR5)
0,1,Armenia,1992,2.997345e+03,751.2400
1,1,Armenia,1993,1.841726e+03,671.5772
2,1,Armenia,1994,1.848546e+03,661.9620
3,1,Armenia,1995,1.959937e+03,653.4080
4,1,Armenia,1996,1.834244e+03,652.5484
...,...,...,...,...,...
15463,5873,OECD,2019,3.512659e+06,584107.4736
15464,5873,OECD,2020,3.438269e+06,579242.6892
15465,5873,OECD,2021,3.530872e+06,580687.3884
15466,5873,OECD,2030,,631136.7748


**Guardar tabla si queremos esta opcion**

Si no, continuar

In [None]:
# Optional export: uncomment the two lines below to write the pivoted table to CSV.
# NOTE(review): check that `file_name` holds the data CSV's name before using it --
# it is whatever the archive-loading cell last assigned to it.
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')

### Seleccionando TODAS las combinaciones de item y element que hay

In [26]:
# All distinct Items and Elements present in the table (kept for inspection)
unique_items = df['Item'].unique()
unique_elements = df['Element'].unique()

# Every Item and every Element is wanted here, so there is nothing to filter out:
# testing each column against its own set of unique values selects every row and
# only produced a full copy of the table. `filtered_df` therefore refers to `df` itself.
filtered_df = df

# One summed Value per (area, year, item, element) combination.
# NOTE(review): values are summed regardless of the 'Source' column -- confirm that
# adding up rows from different sources is intended.
grouped = (
    filtered_df
    .groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value']
    .sum()
    .reset_index()
)

# Spread every (Item, Element) pair into its own column: one row per area and year.
# `grouped` already holds a single row per combination, so aggfunc='sum' only carries
# those values through.
pivot_df = grouped.pivot_table(
    index=['Area Code', 'Area', 'Year'],
    columns=['Item', 'Element'],
    values='Value',
    aggfunc='sum',
).reset_index()

# Flatten the two-level column labels: "Item_Element" for the value columns, and the
# bare first-level name when the second level is empty.
pivot_df.columns = [
    f'{top}_{sub}'.strip() if sub else top
    for top, sub in pivot_df.columns
]

In [27]:
# Display the wide table: one column per Item/Element combination
pivot_df

Unnamed: 0,Area Code,Area,Year,AFOLU_Direct emissions (N2O),AFOLU_Emissions (CH4),AFOLU_Emissions (CO2),AFOLU_Emissions (CO2eq) (AR5),AFOLU_Emissions (CO2eq) from CH4 (AR5),AFOLU_Emissions (CO2eq) from N2O (AR5),AFOLU_Emissions (N2O),...,Synthetic Fertilizers_Emissions (CO2eq) (AR5),Synthetic Fertilizers_Emissions (CO2eq) from N2O (AR5),Synthetic Fertilizers_Emissions (N2O),Synthetic Fertilizers_Indirect emissions (N2O),Waste_Emissions (CH4),Waste_Emissions (CO2),Waste_Emissions (CO2eq) (AR5),Waste_Emissions (CO2eq) from CH4 (AR5),Waste_Emissions (CO2eq) from N2O (AR5),Waste_Emissions (N2O)
0,1,Armenia,1992,1.7228,28.2523,130.5921,1498.4820,791.0644,576.8255,2.1767,...,82.7595,82.7595,0.3123,0.0766,20.4000,,606.9750,571.2000,35.7750,0.1350
1,1,Armenia,1993,1.4569,25.1645,130.5921,1325.0771,704.6060,489.8790,1.8486,...,82.7595,82.7595,0.3123,0.0766,19.2000,,572.0500,537.6000,34.4500,0.1300
2,1,Armenia,1994,1.3019,24.8122,130.5921,1260.2782,694.7416,434.9445,1.6413,...,38.6370,38.6370,0.1458,0.0357,19.0000,,567.2450,532.0000,35.2450,0.1330
3,1,Armenia,1995,1.2884,24.4821,130.5921,1245.5234,685.4988,429.4325,1.6205,...,38.6370,38.6370,0.1458,0.0357,18.9000,,563.6500,529.2000,34.4500,0.1300
4,1,Armenia,1996,1.3168,24.4132,130.5921,1251.4647,683.5696,437.3030,1.6502,...,44.1490,44.1490,0.1666,0.0409,19.0000,,567.2450,532.0000,35.2450,0.1330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15477,5873,OECD,2019,1259.5546,27422.7672,-431216.1060,847348.4556,767837.4816,510727.0800,1927.2720,...,165811.4275,165811.4275,625.7035,153.4743,18384.0900,12720.0427,557452.2372,514754.5200,29977.6745,113.1233
15478,5873,OECD,2020,1273.4054,26807.1082,-430986.5039,822925.7537,750599.0296,503313.2280,1899.2952,...,170932.3405,170932.3405,645.0277,158.2148,18150.8400,12847.1362,551146.9637,508223.5200,30076.3075,113.4955
15479,5873,OECD,2021,1261.6715,26791.0993,-430986.5039,819435.4370,750150.7804,500271.1605,1887.8157,...,164611.7460,164611.7460,621.1764,152.3640,18239.9298,12733.9931,553793.5025,510718.0357,30341.4737,114.4961
15480,5873,OECD,2030,,,,,,,,...,154542.4350,154542.4350,583.1790,143.0436,,,,,,


# Alternativa para este caso en particular:

La tabla final tendría 252 columnas (252 combinaciones de item y element). No nos interesan tantos datos de cada tabla.

Se pueden seleccionar todos los items pero un solo Element:

In [32]:
# Every Item in the table, but only one hand-picked Element
unique_items = df['Item'].unique()
unique_elements = ['Emissions (CO2eq) (AR5)']

# Only the Element needs filtering: all Items are kept, so testing 'Item' against its
# own unique values would select every row and add nothing to the mask.
filtered_df = df[df['Element'].isin(unique_elements)]

# One summed Value per (area, year, item, element) combination.
# NOTE(review): values are summed regardless of the 'Source' column -- confirm that
# adding up rows from different sources is intended.
grouped = (
    filtered_df
    .groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value']
    .sum()
    .reset_index()
)

# Spread every (Item, Element) pair into its own column: one row per area and year
pivot_df = grouped.pivot_table(
    index=['Area Code', 'Area', 'Year'],
    columns=['Item', 'Element'],
    values='Value',
    aggfunc='sum',
).reset_index()

# Flatten the two-level column labels: "Item_Element" for the value columns, and the
# bare first-level name when the second level is empty.
pivot_df.columns = [
    '_'.join(labels).strip() if labels[1] else labels[0]
    for labels in pivot_df.columns.values
]

In [33]:
# Display the table: one column per Item, all for the single selected Element
pivot_df

Unnamed: 0,Area Code,Area,Year,AFOLU_Emissions (CO2eq) (AR5),Agricultural Soils_Emissions (CO2eq) (AR5),Agrifood Systems Waste Disposal_Emissions (CO2eq) (AR5),Agrifood systems_Emissions (CO2eq) (AR5),All sectors with LULUCF_Emissions (CO2eq) (AR5),All sectors without LULUCF_Emissions (CO2eq) (AR5),Burning - Crop residues_Emissions (CO2eq) (AR5),...,Manure left on Pasture_Emissions (CO2eq) (AR5),Net Forest conversion_Emissions (CO2eq) (AR5),On-farm energy use_Emissions (CO2eq) (AR5),Other_Emissions (CO2eq) (AR5),Pesticides Manufacturing_Emissions (CO2eq) (AR5),Pre- and Post- Production_Emissions (CO2eq) (AR5),Rice Cultivation_Emissions (CO2eq) (AR5),Savanna fires_Emissions (CO2eq) (AR5),Synthetic Fertilizers_Emissions (CO2eq) (AR5),Waste_Emissions (CO2eq) (AR5)
0,1,Armenia,1992,1498.4820,544.8665,250.1941,2.997345e+03,8.467390e+03,8.335908e+03,2.5811,...,402.8795,54.2254,740.2164,44.7850,0.2283,8.359032e+02,,4.2975,82.7595,606.9750
1,1,Armenia,1993,1325.0771,473.5815,266.3624,1.841726e+03,4.911174e+03,4.779692e+03,3.7940,...,346.1695,54.2254,93.0094,22.4720,0.1420,5.008966e+02,,4.2975,82.7595,572.0500
2,1,Armenia,1994,1260.2782,418.9650,282.6547,1.848546e+03,5.031029e+03,4.899547e+03,3.3673,...,340.8430,54.2254,97.4503,7.4200,0.3406,5.680747e+02,,4.2975,38.6370,567.2450
3,1,Armenia,1995,1245.5234,413.6385,296.7981,1.959937e+03,5.648471e+03,5.516989e+03,2.5027,...,334.9335,54.2254,114.9126,8.0295,0.3251,6.767576e+02,,4.2975,38.6370,563.6500
4,1,Armenia,1996,1251.4647,423.1520,301.4296,1.834244e+03,4.824605e+03,4.693762e+03,3.3953,...,334.5890,54.2254,127.6139,5.8565,0.5378,5.317826e+02,,1.6016,44.1490,567.2450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15477,5873,OECD,2019,847348.4556,442412.5445,263902.6370,3.512659e+06,1.491032e+07,1.531582e+07,7190.1476,...,130154.6175,209078.1220,257077.5152,44372.1198,26935.6153,1.535246e+06,29642.7824,36366.6864,165811.4275,557452.2372
15478,5873,OECD,2020,822925.7537,447471.1560,264319.9350,3.438269e+06,1.354679e+07,1.395840e+07,7229.4692,...,129087.5420,209078.1220,248882.1469,43695.7280,16682.2414,1.487048e+06,31723.9664,16128.9383,170932.3405,551146.9637
15479,5873,OECD,2021,819435.4370,443311.3980,264706.0607,3.530872e+06,1.414703e+07,1.456847e+07,7473.0049,...,130200.9925,209078.1220,250185.2035,45359.7342,27791.1224,1.572575e+06,29532.1516,28670.1217,164611.7460,553793.5025
15480,5873,OECD,2030,,,,,,,7997.2677,...,146126.8035,,,,,,36129.8224,,154542.4350,


# A partir de aquí se pueden eliminar o retener las columnas que sean útiles

In [None]:
# Optional export: uncomment the two lines below to write the pivoted table to CSV.
# NOTE(review): check that `file_name` holds the data CSV's name before using it --
# it is whatever the archive-loading cell last assigned to it.
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')