# Section 0 Defining modules/libraries/functions

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import plot 
import plotly.io as io
#io.renderers.default='browser'
#import plotly.io as pio
#pio.renderers.default='svg'
from pingouin import kruskal
#Declare FilePaths
from pathlib import Path

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show e.g. both `.shape` and `.head()` from a single cell.
InteractiveShell.ast_node_interactivity = "all"

# Raise the pandas row-display cap to 4000 so long tables are not truncated.
pd.options.display.max_rows = 4000


In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line message (total column count, and how many columns have
    missing values) and returns the per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``'Missing Values'`` (null count)
        and ``'% of Total Values'`` (share of rows that are null, rounded to one
        decimal). Only columns with at least one missing value are included,
        sorted by percentage in descending order.
    """
    missing_count = df.isnull().sum()
    n_rows = len(df)
    if n_rows:
        missing_pct = 100 * missing_count / n_rows
    else:
        # A frame with no rows has nothing missing; avoid 0/0 -> NaN percentages.
        missing_pct = missing_count.astype(float)

    summary = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Filter on the count rather than the percentage: the two agree whenever the
    # frame has rows, and the count stays well-defined when it has none.
    summary = (
        summary[summary['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    return summary


# Section 1 Importing Data

In [None]:
# Eurostat code dictionaries: tab-separated files without a header row,
# so the column names are supplied explicitly.
cropsCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/crops.dic",
    sep='\t',
    header=None,
    names=['crops', 'crop_name'],
)
strucproCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/strucpro.dic",
    sep='\t',
    header=None,
    names=['code', 'units'],
)
unitCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/unit.dic",
    sep='\t',
    header=None,
    names=['code', 'units'],
)

In [None]:

# Root folders of the local Eurostat extracts. Every input path below is built
# from these, so a relocated data folder only needs to be changed here.
EUROSTAT_DIR = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat")
CROPS_DIR = EUROSTAT_DIR / "Agricultural Production" / "Crops"
RESOURCE_DIR = EUROSTAT_DIR / "Resource Usage"
ECONOMICS_DIR = EUROSTAT_DIR / "Economics"

# Crop production data (standard production, organic tonnage, organic area)
stdProduction_path = CROPS_DIR / "apro_cpsh1_linear.csv"
orgTonne_path = CROPS_DIR / "org_croppro_linear.csv"
orgArea_path = CROPS_DIR / "org_cropar_linear.csv"

# Organic area utilisation data
orgAreaUtil_path = CROPS_DIR / "Organic Area" / "sdg_02_40_linear.csv"

# Organic processors data
orgProcessors_path = CROPS_DIR / "Organic Processors" / "org_cpreact_linear.csv"

# Harmonised Risk Indicator 1 data
# (folder name spelling kept exactly as in the original path)
hriPath = CROPS_DIR / "Pesiticide Use Risk Indicator" / "aei_hri_linear.csv"

# N and P fertilizer data
# (folder name spelling kept exactly as in the original path)
fertUse_path = CROPS_DIR / "Fertlizer Use" / "aei_fm_usefert_linear.csv"

# Waste generation data
wasteGeneration_path = RESOURCE_DIR / "cei_pc030_linear.csv"

# National productivity data
productivityIndex_Path = RESOURCE_DIR / "cei_pc034_linear.csv"

# Country Gini data
countryGini_path = EUROSTAT_DIR / "Quality of Life" / "tessi190_linear.csv"

# National employment rate data (was assigned twice with the same value; declared once here)
employmentRate_Path = ECONOMICS_DIR / "tesem010_linear.csv"

# Median/mean income data
income_Path = ECONOMICS_DIR / "ilc_di03_linear.csv"

# Biodiversity of birds data
birdBiodiversity_Path = EUROSTAT_DIR / "Biodiversity Index" / "env_bio2_linear.csv"

In [None]:
# Load the organic-processor data, keep only rows with nace_r2 code 'C103'
# and unit 'NR', then drop the bookkeeping columns and name the value column.
orgProcessors = (
    pd.read_csv(orgProcessors_path)
    .loc[lambda d: d['nace_r2'].isin(['C103'])]
    .loc[lambda d: d['unit'].isin(['NR'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "numOrganicProcessors"})
)

# Number of rows available for each year.
print(orgProcessors.groupby('TIME_PERIOD')['geo'].size())
orgProcessors.shape
orgProcessors.head(10)

In [None]:
# Load the organic area utilisation data, keeping only the columns that were
# not dropped and renaming the observation to a descriptive name.
orgAreaUtil = (
    pd.read_csv(orgAreaUtil_path)
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'crops', 'freq', 'agprdmet', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "areaUsedForOrganic"})
)
orgAreaUtil.shape
orgAreaUtil.head()

In [None]:
# Load the harmonised risk indicator data: keep the 'HRI1' substance category
# and exclude the EU-level aggregate rows so only individual geo codes remain.
hriPesticide = (
    pd.read_csv(hriPath)
    .loc[lambda d: d['subst_cat'].isin(['HRI1'])]
    .loc[lambda d: ~d['geo'].isin(['EU', 'EU27_2020', 'EU28'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "harmRiskInd"})
)
hriPesticide.shape

A useful shortcut can be applied here: unwanted rows can be excluded later by means of a left join.

The dataset imported and pivoted/transposed below contains more than just the required countries: it also contains subdivisions, indicated by numbers beside the country name. Instead of manually writing code to exclude these cases, this dataset will be left-joined onto the main table later. Because a left join keeps only the keys present in the left-hand table, the subdivision rows are excluded automatically.

In [None]:
# Load fertilizer use and reshape it from long to wide: one row per
# (geo, TIME_PERIOD) with one column per nutrient value.
fertUse = (
    pd.read_csv(fertUse_path)
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .pivot(index=['geo', 'TIME_PERIOD'], columns='nutrient', values='OBS_VALUE')
    .reset_index()
    .rename(columns={"N": "N_use_tonne", "P": "P_use_tonne"})
)
fertUse.shape
fertUse.head()

In [None]:
# Load the Gini data, drop the bookkeeping columns and name the value column.
countryGini = (
    pd.read_csv(countryGini_path)
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_il', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "gini"})
)
countryGini.head()

In [None]:
#Standard Crop Production Import
stdProduction_lin = pd.read_csv(stdProduction_path)

# Discard observations flagged 'c' or 'n' before anything else is derived.
stdProduction_lin = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c', 'n'])]

# Take the yield rows BEFORE narrowing to area/production. Previously this
# selection ran after the frame had been reduced to 'AR'/'PR_HU_EU', so it
# could never match and was always empty.
stdProduction_lin_yield = stdProduction_lin[stdProduction_lin['strucpro'].isin(['YI_HU_EU'])]
stdProduction_lin = stdProduction_lin[stdProduction_lin['strucpro'].isin(['AR', 'PR_HU_EU'])]

# Long -> wide: one row per (crops, geo, TIME_PERIOD) with 'AR' and 'PR_HU_EU' columns.
stdProduction = (
    stdProduction_lin
    .pivot(index=['crops', 'geo', 'TIME_PERIOD'], columns='strucpro', values='OBS_VALUE')
    .reset_index()
    # Keep only rows where both area and production are present.
    .dropna(subset=['AR', 'PR_HU_EU'])
    # Drop zero-area rows; they would cause division by zero when yields are computed.
    .loc[lambda d: d['AR'] != 0]
    # Scale both measures by 1000 into the area_HA / tonnes columns
    # (presumably the source reports them in thousands -- TODO confirm).
    # .assign builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
    .assign(
        area_HA=lambda d: d['AR'] * 1000,
        tonnes=lambda d: d['PR_HU_EU'] * 1000,
    )
)

In [None]:
# Data-quality overview of the pivoted standard-production table:
# missing values, geo coverage overall and per year, then summary statistics.
missing_values_table(stdProduction)
print(f"Unique Geo:{stdProduction['geo'].nunique()}")
print(stdProduction.groupby('TIME_PERIOD')['geo'].nunique())
stdProduction.describe()
stdProduction.info()

In [None]:
# The long-format intermediates are no longer needed once stdProduction exists.
del stdProduction_lin, stdProduction_lin_yield


In [None]:
#Organic Crop Production Import

# --- Organic area: keep the 'TOTAL' production method measured in hectares ---
orgArea_all = pd.read_csv(orgArea_path)
orgArea_total = (
    orgArea_all
    .loc[(orgArea_all['agprdmet'] == 'TOTAL') & (orgArea_all['unit'] == 'HA')]
    .rename(columns={"OBS_VALUE": "area_HA"})
)
orgArea_total.info()

orgArea_total = (
    orgArea_total
    .loc[lambda d: ~d['OBS_FLAG'].isin(['c', 'n'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'agprdmet', 'OBS_FLAG'])
    # Rows without an area cannot be used.
    .dropna(subset=['area_HA'])
    # Zero areas would produce inf when tonnes are divided by area.
    .loc[lambda d: d['area_HA'] != 0]
)
missing_values_table(orgArea_total)

# orgArea_all is deliberately kept: a later cell prints its geo codes.

# --- Organic tonnage ---
orgTonne = pd.read_csv(orgTonne_path).rename(columns={"OBS_VALUE": "tonnes"})
orgTonne.info()

orgTonne = (
    orgTonne
    .loc[lambda d: ~d['OBS_FLAG'].isin(['c', 'n'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
)
missing_values_table(orgTonne)

# --- Combine: only (crops, geo, TIME_PERIOD) keys present in both tables survive ---
orgProduction = orgArea_total.merge(
    orgTonne,
    on=['crops', 'geo', 'TIME_PERIOD'],
    how='inner',
    suffixes=('_A', '_T'),
)
print(orgProduction.groupby('TIME_PERIOD')['geo'].nunique())
orgProduction.describe()
orgProduction.info()


In [None]:
# Pair standard and organic production on (crops, geo, TIME_PERIOD); shared
# measure columns get the '_std' / '_org' suffixes. Then attach the crop names.
# Both joins are inner joins, so unmatched keys and unknown crop codes are dropped.
cropProd = (
    pd.merge(
        stdProduction,
        orgProduction,
        on=['crops', 'geo', 'TIME_PERIOD'],
        how='inner',
        suffixes=('_std', '_org'),
    )
    .merge(cropsCodes, on=['crops'], how='inner')
)
cropProd['geo'] = cropProd['geo'].astype('str')

# Geo coverage per year, then an overview of the merged table.
print(cropProd.groupby('TIME_PERIOD').geo.nunique())
# Show the shape of the frame built in this cell (this line previously displayed
# hriPesticide.shape, which is unrelated to the merge being inspected).
cropProd.shape
cropProd.describe()
cropProd.info()


In [None]:

# Compare which geo codes are present in each table.
for _geo_source in (orgProduction, stdProduction, orgArea_all, orgTonne):
    print(_geo_source.geo.unique())

# Variable Creation

In [None]:
def _sum_numeric_by(frame, keys):
    """Sum every numeric column of ``frame`` within each ``keys`` group.

    The grouping keys are returned as ordinary columns (index reset).
    """
    return frame.groupby(keys).sum(numeric_only=True).reset_index()


# Tonnes produced per hectare for organic and standard production, and the
# ratio of the organic figure to the standard one.
cropProd['tonne_per_HA_org'] = cropProd['tonnes_org'].div(cropProd['area_HA_org'])
cropProd['tonne_per_HA_std'] = cropProd['tonnes_std'].div(cropProd['area_HA_std'])
cropProd['util_ratio'] = cropProd['tonne_per_HA_org'].div(cropProd['tonne_per_HA_std'])

# Rows whose crop code contains "0000".
# presumably these are the top-level crop aggregates -- TODO confirm against crops.dic
cropProdTotals = cropProd.loc[cropProd['crops'].str.contains("0000")]

# NOTE(review): the aggregates below sum ALL numeric columns, which includes the
# per-row ratio columns created above -- confirm that summed ratios are intended.
_geo_year_keys = ['geo', 'TIME_PERIOD']
_geo_keys = ['geo']
_crop_year_keys = ['crops', 'crop_name', 'TIME_PERIOD']
_crop_keys = ['crops', 'crop_name']

# Aggregates over every crop row.
cropProd_Geo_Y = _sum_numeric_by(cropProd, _geo_year_keys)
cropProd_Geo = _sum_numeric_by(cropProd, _geo_keys)
cropProd_crop_Y = _sum_numeric_by(cropProd, _crop_year_keys)
cropProd_crop = _sum_numeric_by(cropProd, _crop_keys)

# The same aggregates restricted to the "0000" rows.
cropProdTotals_Geo_Y = _sum_numeric_by(cropProdTotals, _geo_year_keys)
cropProdTotals_Geo = _sum_numeric_by(cropProdTotals, _geo_keys)
cropProdTotals_crop_Y = _sum_numeric_by(cropProdTotals, _crop_year_keys)
cropProdTotals_crop = _sum_numeric_by(cropProdTotals, _crop_keys)

cropProdTotals.shape
#cropProdTotals.head(10)
#print(cropProdTotals.groupby(['TIME_PERIOD', 'geo']).size())

In [None]:
import functools as ft
def _left_join_geo_year(left, right):
    """Left-join ``right`` onto ``left`` using the geo and TIME_PERIOD keys."""
    return pd.merge(left, right, how='left', on=['geo', 'TIME_PERIOD'])


# cropProdTotals comes first so it is the left-hand side of every join: its rows
# are always kept, and each extra table only contributes where the keys match.
extraVars = [cropProdTotals, orgProcessors, orgAreaUtil, hriPesticide, countryGini]
cropProdTotals_exp = ft.reduce(_left_join_geo_year, extraVars)
cropProdTotals_exp.shape
cropProdTotals_exp.head(10)
# TODO: NaN values will likely be kept and used as a category when clustering
# to create a scorecard.
missing_values_table(cropProdTotals_exp)

In [None]:
#cropProdTotals_exp.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_exp.csv")

In [None]:
#cropProdTotals_Geo_Y.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_Geo_Y.csv")
#cropProdTotals_crop_Y.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_Crop_Y.csv")