In [None]:
# Import libraries
import os
import pandas as pd
from sqlalchemy import create_engine, inspect, MetaData, Table, select, func
from local_settings import postgresql as settings

In [None]:
# Define a function for creating the sqlalchemy engine
def get_engine(user, passwd, host, port, db):
    """Build a SQLAlchemy engine for a PostgreSQL database (psycopg2 driver).

    Args:
        user: Database user name, given raw (not percent-encoded).
        passwd: Password for ``user``, given raw (not percent-encoded).
        host: Database server host.
        port: Database server port.
        db: Name of the database to connect to.

    Returns:
        The engine returned by ``create_engine`` for the assembled URL,
        created with ``pool_size=50`` and ``echo=False``.
    """
    # Local import so this cell does not depend on changes to the import cell.
    from urllib.parse import quote

    # Percent-encode the credentials: a raw '@', ':', '/' or '%' in the user
    # name or password would otherwise be read as a URL delimiter and the
    # connection URL would be parsed incorrectly. `safe=""` also encodes '/'.
    safe_user = quote(str(user), safe="")
    safe_passwd = quote(str(passwd), safe="")

    url = f"postgresql+psycopg2://{safe_user}:{safe_passwd}@{host}:{port}/{db}"
    return create_engine(url, pool_size=50, echo=False)

In [None]:
# Build the engine from the PostgreSQL settings, then open a connection on it

# Maps each get_engine() argument to the settings key that supplies it.
engine_args = {
    'user': 'pguser',
    'passwd': 'pgpasswd',
    'host': 'pghost',
    'port': 'pgport',
    'db': 'pgdb',
}
engine = get_engine(**{arg: settings[key] for arg, key in engine_args.items()})

connection = engine.connect()

Once the engine is connected to the remote database, we can use the `inspect` function from SQLAlchemy to examine the contents of the database, i.e. its tables.

In [None]:
# Inspect the database through the engine and list the tables it contains
inspector = inspect(engine)
table_names = inspector.get_table_names()

print(table_names)

[]


In [46]:
# Read the semicolon-separated CSV located in the current working directory.
# os.path.join builds the path with the platform's separator.
path = os.path.join(os.getcwd(), 'food_carbon_footprint_2018_long.csv')
foodprint_2018 = pd.read_csv(path, sep=';')

# Write through the engine rather than the shared `connection`: pandas does not
# commit when handed a Connection that is already inside a transaction, which
# happens once any query has been executed on `connection`. With an Engine,
# pandas opens its own connection and commits the write, so re-running this
# cell in any order still persists the table.
foodprint_2018.to_sql(name='foodprint', con=engine, if_exists='replace', index=False)

250

In [4]:
# Re-inspect the database to list the tables now present
inspector = inspect(engine)
table_names = inspector.get_table_names()

print(table_names)

['foodprint']


In [5]:
# Reflect the `foodprint` table definition from the database into `metadata`
metadata = MetaData()
foodprint = Table(
    'foodprint',
    metadata,
    autoload_with=engine,
    autoload_replace=True,
)

In [6]:
# Select every column of the reflected table
stmt = select(foodprint)

# Show the generated SQL, then the first five rows of the result
print(stmt, '\n')
rows = connection.execute(stmt).fetchall()
preview = pd.DataFrame(rows).head(5)
print(preview)


SELECT foodprint.country, foodprint.product, foodprint.metric, foodprint.value 
FROM foodprint 

       country product                                     metric  value
0    Argentina    Pork  Supplied for Consumption (kg/person/year)  10.51
1    Australia    Pork  Supplied for Consumption (kg/person/year)  24.14
2      Albania    Pork  Supplied for Consumption (kg/person/year)  10.88
3      Iceland    Pork  Supplied for Consumption (kg/person/year)  21.69
4  New Zealand    Pork  Supplied for Consumption (kg/person/year)  22.29


In [7]:
# Print the name and SQL type of every column in the reflected table
for column in foodprint.columns:
    print(column.key, column.type, sep=": ")

country: TEXT
product: TEXT
metric: TEXT
value: DOUBLE PRECISION


In [28]:
# Count the NULL entries of each column, one COUNT query per column
print("Missing values: \n")
for column in foodprint.columns:
    is_null = column.is_(None)
    stmt = select(func.count()).where(is_null)
    result = connection.execute(stmt).scalar()
    print(column.key, result, sep=": ")

Missing values: 

country: 0
product: 0
metric: 0
value: 0


In [9]:
# Count the country rows available for every (product, metric) pair

query = """
    SELECT product,
           metric,
           COUNT(country)
        FROM foodprint
        GROUP BY product, metric
        ORDER BY product, metric
"""

# The resulting frame is the cell's last expression, so it is displayed
pd.read_sql_query(sql=query, con=connection)


Unnamed: 0,product,metric,count
0,Animal products,Total Kg CO2/year/person,130
1,Beef,Kg CO2/person/year,130
2,Beef,Supplied for Consumption (kg/person/year),130
3,Difference Animal Non-Animals,Products/person/year in kg,130
4,Eggs,Kg CO2/person/year,130
5,Eggs,Supplied for Consumption (kg/person/year),130
6,Fish,Kg CO2/person/year,130
7,Fish,Supplied for Consumption (kg/person/year),130
8,Lamb & Goat,Kg CO2/person/year,130
9,Lamb & Goat,Supplied for Consumption (kg/person/year),130


In the `product` and `metric` fields, there are a few values that do not correspond directly to individual items in these categories, as they aggregate information at a global level. Specifically, the values `Animal products` and `Non-animal products` appear in the `product` field, and both are associated with the metric `Total Kg CO₂/year/person` in the `metric` field.

To handle this feature of the dataset, we can transform it by filtering out these aggregate values and then creating a new categorical field that distinguishes between the `Animal` and `Non-animal` product categories.