# Script para extraer datos utiles de cada tabla desde el formato zip directamente

In [2]:
import pandas as pd
#import numpy as np
import zipfile
#import io
import glob


#supress warnings
import warnings
warnings.filterwarnings('ignore')

### Abrir el CSV dentro del zip file que incluye el patron "All_Data_" y guardarlo en un dataframe

In [3]:
# se comenta porque toma el zip de la funcion read_CSV
# zip_path = r'.\data\FAOSTAT\all_raw\Emissions_Totals_E_All_Data_(Normalized).zip'

In [7]:
def read_CSV (numero):
    if numero < 10:
        numero = "0" + str(numero) # Añade un cero al principio si el número es menor a 10
    
    pattern = f"data//FAOSTAT//all_raw//{str(numero)}-*All_Data_(Normalized).zip"
    matching_files = glob.glob(pattern)
    
    if not matching_files:
        print("No existe el archivo")
        return None
 
    return matching_files[0] # Selecciona el primer archivo que coincida con el patrón

In [121]:
zip_path = read_CSV(17) # pasar el número de archivo que se quiere leer de "all_raw"    

# Initialize an empty DataFrame
df = pd.DataFrame()

# Open the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # List all files in the zip archive
    all_files = zip_ref.namelist()
    print("Files in the zip archive:", all_files)
    
    # Iterate through the file names in the zip archive
    for file_name in all_files:
        # Check if the file name contains the pattern "All_Data_"
        if "All_Data_" in file_name and file_name.endswith(".csv"):
            print(f"Found matching file: {file_name}")
            # Read the CSV file into a DataFrame
            with zip_ref.open(file_name) as file:
                temp_df = pd.read_csv(file, encoding = "ISO-8859-1")
                # Concatenate the data from this CSV to the main DataFrame
                df = pd.concat([df, temp_df], ignore_index=True)

Files in the zip archive: ['Emissions_Pre_Post_Production_E_All_Data_(Normalized).csv', 'Emissions_Pre_Post_Production_E_AreaCodes.csv', 'Emissions_Pre_Post_Production_E_Flags.csv']
Found matching file: Emissions_Pre_Post_Production_E_All_Data_(Normalized).csv


### Determinar si es una tabla de PAISES, COMIDAS, MIXTA o OTRAS

(mirando bien creo que no hay tablas solo de comidas)

In [122]:
condition_paises = all(col in df.columns for col in ['Area Code', 'Year'])
condition_comida = df['Item'].isin(['Meat of chickens; fresh or chilled', 'Tapioca of cassava', 'Wheat', 'Barley', 'Cattle']).any()

if condition_paises and not condition_comida:
    print("La tabla parece ser SOLO de PAISES")
elif condition_comida and condition_paises:
    print("La tabla parece ser de COMIDAS (comidas, cultivos o animales) Y PAISES")
else:
    print("Ni idea que es esta tabla")

La tabla parece ser SOLO de PAISES


In [123]:
df.sample(2)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
452627,5400,'150,Europe,6506,Food Packaging,7273,Emissions (CO2),1991,1991,kt,43882.504845,E
122448,61,'226,Equatorial Guinea,6500,Energy Use (Pre- and Post-Production),723113,Emissions (CO2eq) (AR5),2007,2007,kt,74.418882,E


Ver las diferentes combinaciones de item y element que hay

In [124]:
pd.set_option('display.max_rows', None)
df.groupby(['Item',"Element"])["Unit"].value_counts().to_frame().sort_values(by = 'Item', ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Item,Element,Unit,Unnamed: 3_level_1
Agrifood Systems Waste Disposal,Emissions (CH4),kt,8407
Agrifood Systems Waste Disposal,Emissions (CO2),kt,4589
Agrifood Systems Waste Disposal,Emissions (CO2eq) (AR5),kt,8407
Agrifood Systems Waste Disposal,Emissions (N2O),kt,8014
Cold Chain F-Gas,Emissions (CO2eq) from F-gases (AR5),kt,7472
Domestic Wastewater,Emissions (CH4),kt,7797
Domestic Wastewater,Emissions (CO2eq) (AR5),kt,7893
Domestic Wastewater,Emissions (N2O),kt,6650
Energy Use (Pre- and Post-Production),Energy Use (Total),TJ,7299
Energy Use (Pre- and Post-Production),"Energy Use (Natural Gas, including LNG)",TJ,4158


In [125]:
pd.set_option('display.max_rows', None)
df.groupby(["Element",'Item'])["Unit"].value_counts().to_frame().sort_values(by = ['Element','Item'], ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
Element,Item,Unit,Unnamed: 3_level_1
Emissions (CH4),Agrifood Systems Waste Disposal,kt,8407
Emissions (CH4),Domestic Wastewater,kt,7797
Emissions (CH4),Energy Use (Pre- and Post-Production),kt,8435
Emissions (CH4),Food Household Consumption,kt,7835
Emissions (CH4),Food Packaging,kt,5037
Emissions (CH4),Food Processing,kt,3949
Emissions (CH4),Food Retail,kt,7314
Emissions (CH4),Food Transport,kt,8130
Emissions (CH4),Industrial Wastewater,kt,8056
Emissions (CH4),Pesticides Manufacturing,kt,7730


# Convertir tablas mediante "group by" y SUMANDO el valor para cada combinacion

### Seleccionando elementos e items especificos

Por ejemplo para esta tabla solo:

Items: Enteric Fermentation, Agrifood systems

Element: Emissions (CO2eq) (AR5)

In [126]:
# Define selected Elements and selected Items
selected_Items = ['Total Energy']
selected_Elements = ["Emissions (CO2)", "Emissions (CH4)", "Emissions (N2O)", "Energy use in agriculture"]

# Filter the DataFrame
filtered_df = df[(df['Element'].isin(selected_Elements)) & (df['Item'].isin(selected_Items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element',"Unit"])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element',"Unit"], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

**El codigo genera una columna nueva para cada combinacion de item y element**

In [127]:
pd.reset_option('display.max_rows')
pivot_df.sample(5)

ValueError: a must be greater than 0 unless no samples are taken

**Guardar tabla si queremos esta opcion**

Sino continuar

In [None]:
new_file_name = file_name.split('.')[0]
print(new_file_name)
# pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')

Emissions_Agriculture_Energy_E_Flags


### Seleccionando TODOS las combinaciones de item y element que hay

In [None]:
# Extract all unique items and elements
unique_items = df['Item'].unique()
unique_elements = df['Element'].unique()

# Filter the DataFrame
filtered_df = df[(df['Element'].isin(unique_elements)) & (df['Item'].isin(unique_items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [None]:
pivot_df.head(2)
pivot_df[pivot_df["Area Code"]==5000].to_csv("prueba_tabla_18.csv")

PermissionError: [Errno 13] Permission denied: 'prueba_tabla_18.csv'

# Alternativa para este caso en particular:

La tabla final tendria 252 columnas (252 combinaciones de item y element). No nos interesa tantos datos de cada tabla.

Pueden seleccionar todos los items pero solo 1 Element--->

In [None]:
# Extract all unique items and elements
unique_items = df['Item'].unique()
unique_elements = ['Emissions (CO2eq) (AR5)']

# Filter the DataFrame
filtered_df = df[(df['Element'].isin(unique_elements)) & (df['Item'].isin(unique_items))]

# Group by Area, Year, Item, and Element, then sum the values
grouped = filtered_df.groupby(['Area Code', 'Area', 'Year', 'Item', 'Element'])['Value'].sum().reset_index()

# Pivot the table to have combinations of Items and Elements as columns
pivot_df = grouped.pivot_table(index=['Area Code', 'Area', 'Year'], 
                               columns=['Item', 'Element'], 
                               values='Value', 
                               aggfunc='sum').reset_index()

# Flatten the multi-level column index and rename columns
pivot_df.columns = ['_'.join(col).strip() if col[1] else col[0] for col in pivot_df.columns.values]

In [None]:
pivot_df

Unnamed: 0,Area Code,Area,Year


# A partir de aqui se pueden eliminar o retener las columnas que sean utiles

In [None]:
#new_file_name = file_name.split('.')[0]
#pivot_df.to_csv(rf'.\data\FAOSTAT\clean_data\country_tables\{new_file_name}_clean_groupby_country_year.csv')

In [None]:
# TODO mandar la tabla con el numero de encabezado a la carpeta de "clean_data" con el nombre de "numero_clean_groupby_country_year.csv"