In [64]:
from sqlalchemy import create_engine
from local_settings import postgresql as settings

In [65]:
def get_engine(user, passwd, host, port, db):
    url = f"mssql+pyodbc://{user}:{passwd}@{host},{port}/{db}?driver=ODBC+Driver+17+for+SQL+Server"
    engine = create_engine(url, pool_size= 50, echo= False)

    return engine

In [66]:
engine = get_engine(
    user= settings['pguser'],
    passwd=settings['pgpasswd'],
    host=settings['pghost'],
    port=settings['pgport'],
    db=settings['pgdb']
)

connection = engine.connect()

In [67]:
from sqlalchemy import inspect, MetaData, Table, func


inspector = inspect(engine)
print(inspector.get_table_names())

['foodprint_2018']


In [68]:
metadata = MetaData()

foodprint_2018 = Table('foodprint_2018', metadata, autoload= True, autoload_with= engine)

print(foodprint_2018.columns.keys())

['Id', 'Country', 'Product', 'Metric', 'Value']


In [69]:
from sqlalchemy import select
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


def sql_query(query):
    ### Read SQL queries
    return pd.read_sql_query(query, connection)


# Print the first 10 rows 
query = """
    SELECT TOP 10 * 
        FROM foodprint_2018
    """

sql_query(query)

Unnamed: 0,Id,Country,Product,Metric,Value
0,1,Argentina,Pork,Supplied for Consumption (kg/person/year),10.51
1,2,Australia,Pork,Supplied for Consumption (kg/person/year),24.14
2,3,Albania,Pork,Supplied for Consumption (kg/person/year),10.88
3,4,Iceland,Pork,Supplied for Consumption (kg/person/year),21.69
4,5,New Zealand,Pork,Supplied for Consumption (kg/person/year),22.29
5,6,USA,Pork,Supplied for Consumption (kg/person/year),27.64
6,7,Uruguay,Pork,Supplied for Consumption (kg/person/year),16.84
7,8,Luxembourg,Pork,Supplied for Consumption (kg/person/year),43.58
8,9,Brazil,Pork,Supplied for Consumption (kg/person/year),12.6
9,10,Kazakhstan,Pork,Supplied for Consumption (kg/person/year),10.36


##### Exploratory Data Analysis

* Are there missing values in the dataset?

In [75]:
for col in foodprint_2018.columns:
    stmt = select(func.count()).where(col.is_(None))
    result = connection.execute(stmt).scalar()
    print(f"Null values in {col.name}: {result}")

Null values in Id: 0
Null values in Country: 0
Null values in Product: 0
Null values in Metric: 0
Null values in Value: 0


* What are the Products and Metrics under analysis?

In [25]:
stmt = select(foodprint_2018.columns.Product.distinct(), foodprint_2018.columns.Metric)
stmt = stmt.order_by(foodprint_2018.columns.Product, foodprint_2018.columns.Metric)

pd.DataFrame(connection.execute(stmt), 
             columns= [foodprint_2018.columns.keys()[2], foodprint_2018.columns.keys()[3]])

Unnamed: 0,Product,Metric
0,Animal products,Total Kg CO2/year/person
1,Beef,Kg CO2/person/year
2,Beef,Supplied for Consumption (kg/person/year)
3,Difference Animal Non-Animals,Products/person/year in kg
4,Eggs,Kg CO2/person/year
5,Eggs,Supplied for Consumption (kg/person/year)
6,Fish,Kg CO2/person/year
7,Fish,Supplied for Consumption (kg/person/year)
8,Lamb & Goat,Kg CO2/person/year
9,Lamb & Goat,Supplied for Consumption (kg/person/year)


* Number of countries analysed for each product

In [9]:
stmt = select(foodprint_2018.columns.Product,
              func.count(foodprint_2018.columns.Country.distinct()).label('Number of countries'))

stmt = stmt.group_by(foodprint_2018.columns.Product)
stmt = stmt.order_by(foodprint_2018.columns.Product)

print(stmt, '\n')

pd.DataFrame(connection.execute(stmt).fetchall(), columns= ['Product', 'Number of countries'])

SELECT foodprint_2018."Product", count(DISTINCT foodprint_2018."Country") AS "Number of countries" 
FROM foodprint_2018 GROUP BY foodprint_2018."Product" ORDER BY foodprint_2018."Product" 



Unnamed: 0,Product,Number of countries
0,Animal products,130
1,Beef,130
2,Difference Animal Non-Animals,130
3,Eggs,130
4,Fish,130
5,Lamb & Goat,130
6,Milk - inc. Cheese,130
7,Non-animal products,130
8,Nuts inc. Peanut Butter,130
9,Pork,130


* Distribution of both metrics

In [55]:
fp_df.head()

Unnamed: 0,index,Country,Product,Metric,Value
0,0,Argentina,Pork,Supplied for Consumption (kg/person/year),10.51
1,1,Australia,Pork,Supplied for Consumption (kg/person/year),24.14
2,2,Albania,Pork,Supplied for Consumption (kg/person/year),10.88
3,3,Iceland,Pork,Supplied for Consumption (kg/person/year),21.69
4,4,New Zealand,Pork,Supplied for Consumption (kg/person/year),22.29


In [85]:
stmt = select(foodprint_2018)
stmt = stmt.where(foodprint_2018.columns.Metric.like('Kg%'))
cols = foodprint_2018.columns.keys()

fp_df = pd.DataFrame(connection.execute(stmt).fetchall(), columns= cols)
del fp_df['Id']

fp_df['Value'] = fp_df['Value'].map(lambda x: float(x))

fig = px.histogram(
    fp_df,
    x= "Value"
)
fig.show()

In [86]:
stmt = select(foodprint_2018)
stmt = stmt.where(foodprint_2018.columns.Metric.like('Supplied%'))
cols = foodprint_2018.columns.keys()

fp_df = pd.DataFrame(connection.execute(stmt).fetchall(), columns= cols)
del fp_df['Id']

fp_df['Value'] = fp_df['Value'].map(lambda x: float(x))

fig = px.histogram(
    fp_df,
    x= "Value"
)
fig.show()