# Script para extraer datos utiles de cada tabla desde el formato zip directamente

In [5]:
import pandas as pd
#import numpy as np
import zipfile
#import io
import glob


#supress warnings
import warnings
warnings.filterwarnings('ignore')

### Abrir el CSV dentro del zip file que incluye el patron "All_Data_" y guardarlo en un dataframe

In [2]:
# zip_path = r'.\data\FAOSTAT\all_raw\Emissions_Totals_E_All_Data_(Normalized).zip'

In [24]:
def read_CSV(numero):
    """Locate the FAOSTAT zip in ``data/all_raw`` whose name starts with a given number.

    Parameters
    ----------
    numero : int or str
        Numeric prefix of the archive name. It is left-padded with zeros to
        two characters, so ``5`` and ``"5"`` both look for ``05-...``.

    Returns
    -------
    str or None
        Path of the first matching ``*All_Data_(Normalized).zip`` file in
        sorted order, or ``None`` (after printing a message) when nothing
        matches.
    """
    # zfill works for both ints and strings; comparing a str with ``< 10``
    # would raise TypeError.
    prefix = str(numero).zfill(2)

    pattern = f"data//all_raw//{prefix}-*All_Data_(Normalized).zip"
    # glob returns matches in arbitrary order; sort so that "the first match"
    # is the same file on every run and every platform.
    matching_files = sorted(glob.glob(pattern))

    if not matching_files:
        print("No existe el archivo")
        return None

    return matching_files[0]

In [27]:
zip_path = read_CSV(26) # pass the number of the file to read from "all_raw"

# read_CSV returns None when no archive matches; stop here with a clear
# message instead of letting zipfile fail on a None path.
if zip_path is None:
    raise FileNotFoundError("No matching zip archive found in data/all_raw")

# Frames read from the archive; concatenated once after the loop
frames = []

# Open the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List all files in the zip archive
    all_files = zip_ref.namelist()
    print("Files in the zip archive:", all_files)

    # Keep only the CSV members whose name contains "All_Data_"
    data_files = [name for name in all_files
                  if "All_Data_" in name and name.endswith(".csv")]

    # Looping over the matching members only leaves `file_name` pointing at
    # the data CSV after the loop, rather than at the last archive member.
    for file_name in data_files:
        print(f"Found matching file: {file_name}")
        # Read the CSV file into a DataFrame
        with zip_ref.open(file_name) as file:
            temp_df = pd.read_csv(file, encoding = "ISO-8859-1")
            frames.append(temp_df)

# A single concat avoids repeatedly copying the data and avoids concatenating
# with an empty DataFrame; fall back to an empty frame when nothing matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Files in the zip archive: ['Environment_LandUse_E_All_Data_(Normalized).csv', 'Environment_LandUse_E_AreaCodes.csv', 'Environment_LandUse_E_Flags.csv', 'Environment_LandUse_E_ItemCodes.csv']
Found matching file: Environment_LandUse_E_All_Data_(Normalized).csv


### Determinar si es una tabla de PAISES, COMIDAS, MIXTA o OTRAS

(mirando bien creo que no hay tablas solo de comidas)

In [28]:
# A table is treated as a country table when it has both of these columns
condition_paises = all(col in df.columns for col in ['Area Code', 'Year'])

# A table is treated as a food table when its Item column contains at least one
# of these sample food/crop/animal items. Tables without an 'Item' column
# count as non-food instead of raising KeyError.
food_items = ['Meat of chickens; fresh or chilled', 'Tapioca of cassava', 'Wheat', 'Barley', 'Cattle']
condition_comida = 'Item' in df.columns and bool(df['Item'].isin(food_items).any())

if condition_paises and not condition_comida:
    print("La tabla parece ser SOLO de PAISES")
elif condition_comida and condition_paises:
    print("La tabla parece ser de COMIDAS (comidas, cultivos o animales) Y PAISES")
else:
    # Reached when the country columns are missing, whether or not food items
    # were found.
    print("Ni idea que es esta tabla")

La tabla parece ser SOLO de PAISES


In [40]:
# Show 10 randomly chosen rows; no random_state is set, so the rows differ on each run
df.sample(10)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
8993,13,'048,Bahrain,6655,Land under perm. meadows and pastures,7208,Share in Agricultural land,1963,1963,%,57.14,E
57039,91,'328,Guyana,6655,Land under perm. meadows and pastures,7209,Share in Land area,2020,2020,%,3.97,E
99874,169,'600,Paraguay,6717,Naturally regenerating forest,7210,Share in Forest land,1995,1995,%,99.92,E
6126,10,'036,Australia,6610,Agricultural land,7209,Share in Land area,1986,1986,%,60.99,E
133533,231,'840,United States of America,6610,Agricultural land,7209,Share in Land area,1967,1967,%,47.53,E
20935,115,'116,Cambodia,6650,Land under permanent crops,7208,Share in Agricultural land,1970,1970,%,4.27,E
133447,215,'834,United Republic of Tanzania,6646,Forest land,7209,Share in Land area,2019,2019,%,52.17,E
134590,240,'850,United States Virgin Islands,6620,Cropland,7277,Area per capita,2010,2010,ha/cap,0.0189,E
39344,58,'218,Ecuador,6650,Land under permanent crops,7208,Share in Agricultural land,1997,1997,%,17.41,E
81893,270,'175,Mayotte,6655,Land under perm. meadows and pastures,7208,Share in Agricultural land,1971,1971,%,0.12,E


Ver las diferentes combinaciones de item y element que hay

In [46]:
# Lift the row display limit so the full listing below is shown without truncation
pd.set_option('display.max_rows', None)
# Count rows per (Item, Unit, Element) combination and list the most frequent first
df.groupby(['Item',"Unit"])['Element'].value_counts().to_frame().sort_values(by = 'count', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Item,Unit,Element,Unnamed: 3_level_1
Agricultural land,%,Share in Land area,14892
Cropland,%,Share in Agricultural land,14704
Cropland,%,Share in Land area,14704
Cropland,ha/cap,Area per capita,14644
Arable land,%,Share in Agricultural land,14524
Land under perm. meadows and pastures,%,Share in Agricultural land,13835
Land under perm. meadows and pastures,%,Share in Land area,13835
Land under permanent crops,%,Share in Agricultural land,13668
Land area equipped for irrigation,%,Share in Agricultural land,11972
Land area equipped for irrigation,%,Share in Cropland,11972


# Convertir tablas mediante "group by" y SUMANDO el valor para cada combinacion

### Seleccionando elementos e items especificos

Por ejemplo para esta tabla solo:

Items: Cropland

Elements: Share in Agricultural land, Share in Land area, Area per capita

In [54]:
# Define selected Elements and selected Items
selected_Items = ['Cropland']
selected_Elements = ["Share in Agricultural land", "Share in Land area", "Area per capita"]


# Filter the DataFrame
filtered_df = df[(df['Element'].isin(selected_Elements)) & (df['Item'].isin(selected_Items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element',"Unit"])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element',"Unit"], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

**El código genera una columna nueva para cada combinación de Item, Element y Unit**

In [55]:
# Restore the default row display limit, then show the pivoted table
pd.reset_option('display.max_rows')
pivot_df

Unnamed: 0,Area Code,Area,Year,Cropland_Area per capita_ha/cap,Cropland_Share in Agricultural land_%,Cropland_Share in Land area_%
0,1,Armenia,1992,0.1403,41.28,16.97
1,1,Armenia,1993,0.1466,41.74,17.32
2,1,Armenia,1994,0.1523,41.67,17.56
3,1,Armenia,1995,0.1535,39.71,17.35
4,1,Armenia,1996,0.1597,38.74,17.77
...,...,...,...,...,...,...
14699,5873,OECD,2016,,34.14,11.34
14700,5873,OECD,2017,,33.28,11.33
14701,5873,OECD,2018,,33.63,11.30
14702,5873,OECD,2019,,33.45,11.28


**Guardar tabla si queremos esta opcion**

Sino continuar

In [None]:
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')

### Seleccionando TODAS las combinaciones de item y element que hay

In [35]:
# Extract all unique items and elements
unique_items = df['Item'].unique()
unique_elements = df['Element'].unique()

# Filter the DataFrame
filtered_df = df[(df['Element'].isin(unique_elements)) & (df['Item'].isin(unique_items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [38]:
# (rows, columns) of the pivoted table
pivot_df.shape

(14985, 18)

In [36]:
# Display the pivoted table with one column per Item/Element combination
pivot_df

Unnamed: 0,Area Code,Area,Year,Agricultural land_Share in Land area,Agriculture area actually irrigated_Share in Agricultural land,Agriculture area under organic agric._Share in Agricultural land,Arable land_Share in Agricultural land,Cropland_Area per capita,Cropland_Share in Agricultural land,Cropland_Share in Land area,Forest land_Share in Land area,Land area equipped for irrigation_Share in Agricultural land,Land area equipped for irrigation_Share in Cropland,Land under perm. meadows and pastures_Share in Agricultural land,Land under perm. meadows and pastures_Share in Land area,Land under permanent crops_Share in Agricultural land,Naturally regenerating forest_Share in Forest land,Planted Forest_Share in Forest land
0,1,Armenia,1992,41.10,,,36.15,0.1403,41.28,16.97,11.74,23.59,57.14,58.72,24.13,5.13,96.08,3.92
1,1,Armenia,1993,41.48,,,36.66,0.1466,41.74,17.32,11.74,23.52,56.35,58.26,24.17,5.08,96.17,3.83
2,1,Armenia,1994,42.15,,,36.25,0.1523,41.67,17.56,11.73,23.30,55.92,58.33,24.59,5.42,96.26,3.74
3,1,Armenia,1995,43.70,,,34.97,0.1535,39.71,17.35,11.72,22.62,56.96,60.29,26.34,4.74,96.35,3.65
4,1,Armenia,1996,45.87,,,34.46,0.1597,38.74,17.77,11.71,21.68,55.97,61.26,28.10,4.29,96.44,3.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14980,5873,OECD,2016,33.21,,3.78,32.26,,34.14,11.34,32.03,5.50,16.12,65.86,21.87,1.88,89.37,10.63
14981,5873,OECD,2017,34.03,,4.48,31.44,,33.28,11.33,32.02,5.38,16.18,66.72,22.71,1.85,89.20,10.80
14982,5873,OECD,2018,33.62,,4.59,31.74,,33.63,11.30,32.03,5.47,16.28,66.37,22.31,1.89,89.14,10.86
14983,5873,OECD,2019,33.73,,4.67,31.55,,33.45,11.28,32.04,5.48,16.38,66.55,22.44,1.90,89.08,10.92


# Alternativa para este caso en particular:

La tabla final tendría 252 columnas (252 combinaciones de item y element). No nos interesan tantos datos de cada tabla.

Pueden seleccionar todos los items pero solo 1 Element--->

In [32]:
# Extract all unique items and elements
unique_items = df['Item'].unique()
unique_elements = ['Emissions (CO2eq) (AR5)']

# Filter the DataFrame
filtered_df = df[(df['Element'].isin(unique_elements)) & (df['Item'].isin(unique_items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [33]:
# Display the pivoted table built from the single selected Element
pivot_df

Unnamed: 0,Area Code,Area,Year,AFOLU_Emissions (CO2eq) (AR5),Agricultural Soils_Emissions (CO2eq) (AR5),Agrifood Systems Waste Disposal_Emissions (CO2eq) (AR5),Agrifood systems_Emissions (CO2eq) (AR5),All sectors with LULUCF_Emissions (CO2eq) (AR5),All sectors without LULUCF_Emissions (CO2eq) (AR5),Burning - Crop residues_Emissions (CO2eq) (AR5),...,Manure left on Pasture_Emissions (CO2eq) (AR5),Net Forest conversion_Emissions (CO2eq) (AR5),On-farm energy use_Emissions (CO2eq) (AR5),Other_Emissions (CO2eq) (AR5),Pesticides Manufacturing_Emissions (CO2eq) (AR5),Pre- and Post- Production_Emissions (CO2eq) (AR5),Rice Cultivation_Emissions (CO2eq) (AR5),Savanna fires_Emissions (CO2eq) (AR5),Synthetic Fertilizers_Emissions (CO2eq) (AR5),Waste_Emissions (CO2eq) (AR5)
0,1,Armenia,1992,1498.4820,544.8665,250.1941,2.997345e+03,8.467390e+03,8.335908e+03,2.5811,...,402.8795,54.2254,740.2164,44.7850,0.2283,8.359032e+02,,4.2975,82.7595,606.9750
1,1,Armenia,1993,1325.0771,473.5815,266.3624,1.841726e+03,4.911174e+03,4.779692e+03,3.7940,...,346.1695,54.2254,93.0094,22.4720,0.1420,5.008966e+02,,4.2975,82.7595,572.0500
2,1,Armenia,1994,1260.2782,418.9650,282.6547,1.848546e+03,5.031029e+03,4.899547e+03,3.3673,...,340.8430,54.2254,97.4503,7.4200,0.3406,5.680747e+02,,4.2975,38.6370,567.2450
3,1,Armenia,1995,1245.5234,413.6385,296.7981,1.959937e+03,5.648471e+03,5.516989e+03,2.5027,...,334.9335,54.2254,114.9126,8.0295,0.3251,6.767576e+02,,4.2975,38.6370,563.6500
4,1,Armenia,1996,1251.4647,423.1520,301.4296,1.834244e+03,4.824605e+03,4.693762e+03,3.3953,...,334.5890,54.2254,127.6139,5.8565,0.5378,5.317826e+02,,1.6016,44.1490,567.2450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15477,5873,OECD,2019,847348.4556,442412.5445,263902.6370,3.512659e+06,1.491032e+07,1.531582e+07,7190.1476,...,130154.6175,209078.1220,257077.5152,44372.1198,26935.6153,1.535246e+06,29642.7824,36366.6864,165811.4275,557452.2372
15478,5873,OECD,2020,822925.7537,447471.1560,264319.9350,3.438269e+06,1.354679e+07,1.395840e+07,7229.4692,...,129087.5420,209078.1220,248882.1469,43695.7280,16682.2414,1.487048e+06,31723.9664,16128.9383,170932.3405,551146.9637
15479,5873,OECD,2021,819435.4370,443311.3980,264706.0607,3.530872e+06,1.414703e+07,1.456847e+07,7473.0049,...,130200.9925,209078.1220,250185.2035,45359.7342,27791.1224,1.572575e+06,29532.1516,28670.1217,164611.7460,553793.5025
15480,5873,OECD,2030,,,,,,,7997.2677,...,146126.8035,,,,,,36129.8224,,154542.4350,


# A partir de aqui se pueden eliminar o retener las columnas que sean utiles

In [None]:
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')