# Prepare data for simple flask application

In [1]:
import pandas as pd
import numpy as np

In [2]:
trade_mx = pd.read_parquet(r'..\data\FAOSTAT\clean_data\food_trading\trade_matrix_carbon_footprint_v4.parquet')
trade_mx = trade_mx[trade_mx['Year'] == 2021]

In [3]:
cols_trade = ['Reporter Country Code',
 'Partner Country Code',
 'Item Code',
 'Value_tons',
 'distance_in_km',
 'same_continent',
 'share_border',
 'transportation_method',
 'kgCO2eq_tkm']

trade_mx = trade_mx[cols_trade]
trade_mx['by_air'] = np.where(trade_mx['transportation_method'] == 'Air', 1, 0)
trade_mx.drop('transportation_method', axis=1, inplace=True)

In [4]:
coords = pd.read_csv(r"..\data\FAOSTAT\clean_data\food_trading\country_to_continent_codes.csv")
coords.head()

Unnamed: 0,Area Code,Continent,Country,Capital,Latitude,Longitude
0,1,europe,armenia,yerevan,40.1833,44.5167
1,2,asia,afghanistan,kabul,34.526,69.181
2,3,europe,albania,tirana,41.3275,19.8189
3,4,africa,algeria,algiers,36.737232,3.086472
4,7,africa,angola,luanda,-8.839988,13.289437


In [5]:
new_items_df = pd.read_csv(r'..\data\FAOSTAT\clean_data\food_codes_categorized_final.csv', encoding = 'ISO-8859-1')

In [6]:
trade_mx = pd.merge(trade_mx, new_items_df[['Item_Code', 'Item']], 
                               left_on='Item Code', right_on='Item_Code', how='inner')

In [7]:
coors_cols = ['Area Code', 'Continent', 'Country']

trade_mx = trade_mx.merge(coords[coors_cols], left_on='Partner Country Code', right_on='Area Code', how='left')
trade_mx.drop('Area Code', axis=1, inplace=True)
trade_mx.rename(columns={'Continent': 'Continent_source',
                         'Country': 'Country_source'}, inplace=True)

In [8]:
trade_mx = trade_mx.merge(coords[coors_cols], left_on='Reporter Country Code', right_on='Area Code', how='left')
trade_mx.drop('Area Code', axis=1, inplace=True)
trade_mx.rename(columns={'Continent': 'Continent_target',
                            'Country': 'Country_target'}, inplace=True)

In [15]:
trade_mx.Item.nunique()

437

# Export top items list for homepage selection list

In [26]:
top_items = trade_mx.groupby(['Country_target', 'Item'])['kgCO2eq_tkm'].max().reset_index().\
    sort_values('kgCO2eq_tkm', ascending=False)

food_items = top_items.head(440)['Item'].unique().tolist()
print(len(food_items))
print(food_items)

100
['bananas', 'soya beans', 'cassava; dry', 'tomatoes', 'palm oil', 'wheat', 'oranges', 'plantains and cooking bananas', 'grapes', 'pineapples', 'maize (corn)', 'beer of barley; malted', 'avocados', 'chillies and peppers; green (capsicum spp. and pimenta spp.)', 'other tropical fruits; n.e.c.', 'tangerines; mandarins; clementines', 'starch of cassava', 'cucumbers and gherkins', 'meat of chickens; fresh or chilled', 'mangoes; guavas and mangosteens', 'lemons and limes', 'pomelos and grapefruits', 'watermelons', 'meat of pig with the bone; fresh or chilled', 'cheese from whole cow milk', 'cake of soya beans', 'other fruits; n.e.c.', 'juice of pineapples; concentrated', 'pineapple juice', 'rice; broken', 'raw cane or beet sugar (centrifugal only)', 'pumpkins; squash and gourds', 'sorghum', 'blueberries', 'cabbages', 'onions and shallots; dry (excluding dehydrated)', 'barley', 'asparagus', 'lettuce and chicory', 'vegetables frozen', 'rice; milled', 'cranberries', 'cake of palm kernel', '

In [27]:
# Convert to desired format
import json
json_data = {
    "food_items": [
        {"value": item.lower(), "label": item.title()} for item in food_items
    ]
}

# Export to JSON file (e.g., "food_items.json")
with open("food_items.json", "w") as json_file:
    json.dump(json_data, json_file, indent=4)  # indent for readability

# Export flask app data

In [11]:
trade_mx = trade_mx.iloc[:,3:]

In [13]:
trade_mx['Item'] = trade_mx['Item'].apply(lambda x: x.lower())

In [14]:
trade_mx.to_parquet('../data/trade_mx_app.parquet', index=False)

In [161]:
item = 'Soya beans'
country_target = 'argentina'

# Filter the DataFrame for the specified country and item
filtered_df = trade_mx[(trade_mx['Country_target'] == country_target) & (trade_mx['Item'] == item)]
filtered_df.sort_values('Value_tons', ascending=False, inplace=True)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.sort_values('Value_tons', ascending=False, inplace=True)


Unnamed: 0,Value_tons,distance_in_km,same_continent,share_border,kgCO2eq_tkm,by_air,Item_Code,Item,Continent_source,Country_source,Continent_target,Country_target
17976,4601769.12,1039.568365,1,1.0,956770700.0,0,236,Soya beans,south america,paraguay,south america,argentina
7135,218176.89,2336.787789,1,1.0,101966600.0,0,236,Soya beans,south america,brazil,south america,argentina
4321,28068.79,1859.291672,1,1.0,10437610.0,0,236,Soya beans,south america,bolivia,south america,argentina
20883,16050.9,208.300837,1,1.0,668683.2,0,236,Soya beans,south america,uruguay,south america,argentina
20407,1263.52,8396.571025,0,0.0,106092.4,0,236,Soya beans,north america,united states of america,south america,argentina
8617,0.83,9067.594487,0,0.0,75.26103,0,236,Soya beans,north america,canada,south america,argentina
15893,0.56,11154.520218,0,0.0,62.46531,0,236,Soya beans,europe,italy,south america,argentina
10282,0.07,1135.709448,1,1.0,15.89993,0,236,Soya beans,south america,chile,south america,argentina


In [171]:
def find_top_sources(df, country_target, item):
    # Filter the DataFrame for the specified country and item
    filtered_df = df[(df['Country_target'] == country_target) & (df['Item'] == item)]
    
    if filtered_df.empty:
        return None, 0, 0

    # Calculate the total weight of the item imported from each source country
    total_weight = filtered_df['Value_tons'].sum()
    
    # Calculate the total weight of the item imported from each source country
    source_weights = filtered_df.groupby('Country_source')['Value_tons'].sum().sort_values(ascending=False)
    
    # Calculate the probability for each Country_source based on the weight
    source_probabilities = source_weights / total_weight
    
    # Filter for countries with individual probability higher than 20%
    top_sources = source_probabilities[source_probabilities > 0.20].head(3)
    
    if top_sources.empty:
        return None, 0, 0

    # Check if all distances for the top sources are less than 2000 km
    top_countries = top_sources.index
    top_distances = filtered_df[filtered_df['Country_source'].isin(top_countries)]['distance_in_km']
    
    if (top_distances < 2000).all() or country_target in top_countries:
        print("GOOD! You are eating local")
        return None, 0, 0

    # Calculate the summed probability of the selected countries
    summed_probability = top_sources.sum()
    
    # Calculate the mean CO2 emissions for the selected countries
    mean_co2_emissions = filtered_df[filtered_df['Country_source'].isin(top_countries)]['kgCO2eq_tkm'].mean()
    
    return top_sources, summed_probability, mean_co2_emissions

# Example usage:
# Assuming you have a DataFrame named `trade_mx`
country_target = 'argentina'
item = 'Soya beans'
top_sources, summed_probability, mean_co2 = find_top_sources(trade_mx, country_target, item)

if top_sources is not None:
    if len(top_sources) == 3:
        sources_text = f"{top_sources.index[0]}, {top_sources.index[1]} or {top_sources.index[2]}"
    elif len(top_sources) == 2:
        sources_text = f"{top_sources.index[0]} or {top_sources.index[1]}"
    else:  # len(top_sources) == 1
        sources_text = top_sources.index[0]
    
    sources_text = sources_text.upper()
    # Print the source(s), probability, and formatted CO2 emissions
    print(f"Look out! With a probability of {summed_probability*100:.2f}%, your food is probably coming from\n{sources_text}.")

    # Format CO2 emissions in millions of kg
    mean_co2_million_kg = mean_co2 / 1e6  
    print(f"That generates approximately {mean_co2_million_kg:.2f} million kg of CO2!!")


GOOD! You are eating local
