# Identify tables with country, country and food, or other data

The code goes through all the sample tables, determines the format of each one, and creates a DataFrame with the table type, table number, and table name.

In [2]:
import os
import pandas as pd

In [3]:
# Directory containing the sample CSV files.
# Built with os.path.join so the relative path resolves on POSIX as well as on
# Windows; a raw string with backslashes is a single, odd file name elsewhere.
directory = os.path.join('..', 'data', 'FAOSTAT', 'sample_data')

In [4]:
def process_csv_file(path):
    """Classify one sample CSV table by its columns and the food items it lists.

    The table number and table name are parsed from the file name: the number
    is its first two characters, the name is the 2nd to 4th underscore-separated
    pieces joined back with underscores.

    Parameters
    ----------
    path : str
        Path to the CSV file to inspect.

    Returns
    -------
    dict or None
        A dict with keys ``'table_type'``, ``'table_nr'`` and ``'table_name'``.
        ``'table_type'`` is ``"Paises y comida"`` when the table has the
        required columns and at least one of the reference food items,
        ``"Paises"`` when it has the required columns only, and
        ``"Otro/no_util"`` when a required column is missing or the file cannot
        be decoded. ``None`` is returned when any other error occurs while
        reading or classifying the file.
    """
    # Derive the identifiers from the file name BEFORE opening the file, so
    # that every error handler below can safely reference them. Previously
    # they were assigned after pd.read_csv, which made the UnicodeDecodeError
    # handler fail with UnboundLocalError instead of returning its fallback.
    filename = os.path.basename(path)
    table_nr = filename[:2]
    table_name_parts = filename.split('_')[1:4]
    table_name = '_'.join(table_name_parts)

    try:
        # Replace undecodable bytes instead of aborting on encoding problems.
        df = pd.read_csv(path, encoding_errors='replace')

        required_columns = ['Year', 'Area Code', 'Item']
        if not all(col in df.columns for col in required_columns):
            print(f"La tabla {filename} no tiene todas las columnas requeridas: {', '.join(required_columns)}")
            return {'table_type': "Otro/no_util", 'table_nr': table_nr, 'table_name': table_name}

        # Country-level table: identified by the area code and year columns.
        condition_paises = all(col in df.columns for col in ['Area Code', 'Year'])
        # Food table: at least one of a handful of reference food items appears.
        condition_comida = df['Item'].isin(['Meat of chickens; fresh or chilled', 'Tapioca of cassava', 'Wheat', 'Barley', 'Cattle']).any()

        # NOTE(review): after the required-columns guard above, condition_paises
        # is always True, so the "Solo comida" and final "Otro/no_util" branches
        # cannot be reached. They are kept so the classification stays correct
        # if the guard is ever relaxed.
        if condition_paises and condition_comida:
            table_type = "Paises y comida"
        elif condition_paises:
            table_type = "Paises"
        elif condition_comida:
            table_type = "Solo comida"
        else:
            table_type = "Otro/no_util"

        return {'table_type': table_type, 'table_nr': table_nr, 'table_name': table_name}

    except UnicodeDecodeError:
        # The file could not be decoded: record it as not usable.
        print(f"Error de codificación al leer la tabla {filename}. No se puede abrir.")
        return {'table_type': "Otro/no_util", 'table_nr': table_nr, 'table_name': table_name}

    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print(f"Error general al procesar la tabla {filename}: {e}")
        return None

# Lazily yield the full path of every CSV file found anywhere under `directory`.
csv_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(directory)
    for name in names
    if name.endswith('.csv')
)

# Classify each file; process_csv_file returns None for files it could not
# handle, and those are left out of the result.
categories_rows = [
    row for row in map(process_csv_file, csv_paths) if row is not None
]

# Build the final DataFrame once from the collected rows.
categories_df = pd.DataFrame(
    categories_rows,
    columns=['table_type', 'table_nr', 'table_name'],
)

La tabla 10-sample_Development_Assistance_to_Agriculture_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 19-sample_Employment_Indicators_Agriculture_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 20-sample_Employment_Indicators_Rural_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 31-sample_Environment_Temperature_change_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 32-sample_Exchange_rate_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 35-sample_Food_Aid_Shipments_WFP_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 36-sample_Food_and_Diet_Individual_Quantitative_Dietary_Data_E_All_Data_(Normalized).csv no tiene todas las columnas requeridas: Year, Area Code, Item
La tabla 39-sample

In [5]:
# Lift the row display limit so the whole classification table is rendered.
pd.options.display.max_rows = None
categories_df

Unnamed: 0,table_type,table_nr,table_name
0,Paises,1,ASTI_Expenditures_E
1,Paises,2,ASTI_Researchers_E
2,Paises,3,Climate_change_Emissions
3,Paises,4,CommodityBalances_(non-food)_(-2013
4,Paises,5,CommodityBalances_(non-food)_(2010-)
5,Paises,6,CommodityBalances_(non-food)_E
6,Paises,7,ConsumerPriceIndices_E_All
7,Paises,8,Cost_Affordability_Healthy
8,Paises,9,Deflators_E_All
9,Otro/no_util,10,Development_Assistance_to


In [6]:
# Save the classification table.
# The path is built with os.path.join so it resolves on POSIX as well as on
# Windows, and the output folder is created if it does not exist yet.
output_dir = os.path.join('..', 'data', 'FAOSTAT', 'clean_data', 'all_tables_processed')
os.makedirs(output_dir, exist_ok=True)
categories_df.to_csv(os.path.join(output_dir, 'all_tables_types.csv'), index=False)

# Inspect only emission tables with food items

We need to know which FOOD ITEMS to focus on.

In [10]:
# Keep the country-and-food tables whose name mentions emissions or environment.
is_country_and_food = categories_df['table_type'] == "Paises y comida"
is_emission_related = categories_df['table_name'].str.contains("Emissions|Environment")
categories_df[is_country_and_food & is_emission_related]

Unnamed: 0,table_type,table_nr,table_name
11,Paises y comida,12,Emissions_crops_E
15,Paises y comida,16,Emissions_livestock_E
26,Paises y comida,27,Environment_LivestockManure_E
27,Paises y comida,28,Environment_LivestockPatterns_E
