# Section 0 Defining modules/libraries/functions

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import plot 
import plotly.io as io
#io.renderers.default='browser'
#import plotly.io as pio
#pio.renderers.default='svg'
from pingouin import kruskal
#Declare FilePaths
from pathlib import Path

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_rows = 4000


In [2]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns


# Section 1 Importing Data

In [3]:
cropsCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/crops.dic", sep='\t',names=['crops', 'crop_name'], header = None)
strucproCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/strucpro.dic",sep='\t',names=['code', 'units'], header = None)
unitCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/unit.dic",sep='\t',names=['code', 'units'], header = None)

In [4]:

#Organic Production Data
stdProduction_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\apro_cpsh1_linear.csv")
orgTonne_path= Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_croppro_linear.csv")
orgArea_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_cropar_linear.csv")

#Harmonised Risk Index 1 Data
hriPath = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Pesiticide Use Risk Indicator\aei_hri_linear.csv")

#Organic Processors Data
orgProcessors_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Processors\org_cpreact_linear.csv")

#Organic Area Utilisation Data
orgAreaUtil_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Area\sdg_02_40_linear.csv")

#N and P Fertilizer Data
fertUse_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Fertlizer Use\aei_fm_usefert_linear.csv")

#Waste Generation Data
wasteGeneration_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Resource Usage\cei_pc034_linear.csv")

#National Productivity Data
productivityIndex_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Resource Usage\cei_pc030_linear.csv")

#Country Gini Data
countryGini_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Quality of Life\tessi190_linear.csv")

#Employment Rate Data
employmentRate_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\tesem010_linear.csv")

#Median/Mean Income Data
income_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\ilc_di03_linear.csv")

#Biodiversity of Birds Data
birdBiodiversity_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Biodiversity Index\env_bio2_linear.csv")


In [5]:
#Harmonised Risk Index 1 Data
hriPesticide = pd.read_csv(hriPath)
hriPesticide = hriPesticide[hriPesticide['subst_cat'].isin(['HRI1'])]
hriPesticide = hriPesticide[~hriPesticide['geo'].isin(['EU', 'EU27_2020', 'EU28'])]
hriPesticide = hriPesticide.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
hriPesticide = hriPesticide.rename(columns={"OBS_VALUE":"harmRiskInd"})
#print(hriPesticide.groupby('TIME_PERIOD').geo.nunique())
hriPesticide.shape
hriPesticide.info()

(279, 4)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279 entries, 638 to 945
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subst_cat    279 non-null    object
 1   geo          279 non-null    object
 2   TIME_PERIOD  279 non-null    int64 
 3   harmRiskInd  279 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 10.9+ KB


In [6]:
#Organic Processors
orgProcessors = pd.read_csv(orgProcessors_path)
orgProcessors  = orgProcessors[orgProcessors['nace_r2'].isin(['C103'])]
orgProcessors  = orgProcessors[orgProcessors['unit'].isin(['NR'])]
orgProcessors = orgProcessors.drop(columns = ['DATAFLOW', 'LAST UPDATE','freq', 'unit', 'OBS_FLAG'])
orgProcessors = orgProcessors.rename(columns={"OBS_VALUE":"numOrganicProcessors"})
#print(orgProcessors.groupby('TIME_PERIOD').geo.size())
#print(orgProcessors.groupby('TIME_PERIOD').geo.nunique())
#print(orgProcessors.groupby(['TIME_PERIOD', 'geo']).geo.size())
orgProcessors.shape
orgProcessors.info()
#orgProcessors.head()

(262, 4)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262 entries, 899 to 1160
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   nace_r2               262 non-null    object 
 1   geo                   262 non-null    object 
 2   TIME_PERIOD           262 non-null    int64  
 3   numOrganicProcessors  262 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 10.2+ KB


In [7]:
#Organic Area Utilisation Data
orgAreaUtil = pd.read_csv(orgAreaUtil_path)
orgAreaUtil = orgAreaUtil.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'crops','freq', 'agprdmet', 'unit', 'OBS_FLAG'])
orgAreaUtil = orgAreaUtil.rename(columns={"OBS_VALUE":"areaUsedForOrganic"})
orgAreaUtil.shape
orgAreaUtil.info()
#orgAreaUtil.head(10)

(623, 3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   geo                 623 non-null    object 
 1   TIME_PERIOD         623 non-null    int64  
 2   areaUsedForOrganic  623 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 14.7+ KB


Can apply an interesting step here by using left joins to exclude unwanted data later.

The dataset imported and pivoted/transposed below contains more than just the Countries required, it contains subdivisions indicated by numbers beside the country name. Instead of manually writing code that excludes these cases, this set will be left joined later, which will exclude the data automatically.

In [8]:
#N and P Fertilizer Data
fertUse = pd.read_csv(fertUse_path)
fertUse = fertUse.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
fertUse = fertUse.pivot(index=['geo', 'TIME_PERIOD'], columns='nutrient', values='OBS_VALUE').reset_index() 
fertUse = fertUse.rename(columns={"N":"N_use_tonne",
                                 "P":"P_use_tonne"})
#print(fertUseGroup3.groupby('TIME_PERIOD').geo.nunique())
fertUse.shape
fertUse.info()

(2231, 4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231 entries, 0 to 2230
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   geo          2231 non-null   object 
 1   TIME_PERIOD  2231 non-null   int64  
 2   N_use_tonne  2231 non-null   float64
 3   P_use_tonne  2225 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 69.8+ KB


Below I am imputing the missing odd years by averaging the two around it. By treating the numbers as a trend.

In [9]:
#Waste Generation Data
wasteGeneration = pd.read_csv(wasteGeneration_path)
wasteGeneration = wasteGeneration.drop(columns = ['DATAFLOW', 'LAST UPDATE','nace_r2', 'unit','freq', 'OBS_FLAG', 'hazard', 'waste'])
wasteGeneration["waste_unit"] = "KG per Capita"
wasteGeneration = wasteGeneration.rename(columns={"OBS_VALUE":"waste"})

wasteGeneration_lag = wasteGeneration
wasteGeneration_lag['TIME_LAG'] = wasteGeneration_lag.groupby(['geo'])['TIME_PERIOD'].shift(1)
wasteGeneration_lag['waste_lag'] = wasteGeneration_lag.groupby(['geo'])['waste'].shift(1)

wasteGeneration_lag['waste_temp'] = ((wasteGeneration_lag['waste'] + wasteGeneration_lag['waste_lag'])/2)
wasteGeneration_lag['TIME_PERIOD_temp'] = ((wasteGeneration_lag['TIME_PERIOD'] + wasteGeneration_lag['TIME_LAG'])/2)
wasteGeneration_lag = wasteGeneration_lag.drop(columns = ['TIME_PERIOD', 'TIME_LAG','waste', 'waste_lag'])
wasteGeneration_lag = wasteGeneration_lag.dropna()
wasteGeneration_lag = wasteGeneration_lag.rename(columns={"waste_temp":"waste",
                                                           "TIME_PERIOD_temp":"TIME_PERIOD"})
wasteGeneration_lag['TIME_PERIOD'] = wasteGeneration_lag['TIME_PERIOD'].astype('int')
wasteGeneration = pd.concat([wasteGeneration, wasteGeneration_lag], join='inner', ignore_index=True)

wasteGeneration.shape
wasteGeneration.info()

(609, 4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609 entries, 0 to 608
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   geo          609 non-null    object 
 1   TIME_PERIOD  609 non-null    int64  
 2   waste        609 non-null    float64
 3   waste_unit   609 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 19.2+ KB


In [10]:
#National Productivity Data
productivityIndex = pd.read_csv(productivityIndex_Path)
productivityIndex  = productivityIndex[productivityIndex['unit'].isin(['PPS_KG'])]
productivityIndex = productivityIndex.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'unit','freq', 'OBS_FLAG'])
productivityIndex["productivity_unit"] = "Purchase Power Standard Per KG"
productivityIndex = productivityIndex.rename(columns={"OBS_VALUE":"productivity"})
productivityIndex.shape
productivityIndex.info()
productivityIndex.head(20)

(760, 4)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760 entries, 1458 to 2217
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   geo                760 non-null    object 
 1   TIME_PERIOD        760 non-null    int64  
 2   productivity       760 non-null    float64
 3   productivity_unit  760 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 29.7+ KB


Unnamed: 0,geo,TIME_PERIOD,productivity,productivity_unit
1458,AL,2010,1.0386,Purchase Power Standard Per KG
1459,AL,2011,1.0644,Purchase Power Standard Per KG
1460,AL,2012,1.0762,Purchase Power Standard Per KG
1461,AL,2013,0.9575,Purchase Power Standard Per KG
1462,AL,2014,1.0276,Purchase Power Standard Per KG
1463,AL,2015,0.9241,Purchase Power Standard Per KG
1464,AL,2016,0.8441,Purchase Power Standard Per KG
1465,AL,2017,1.1586,Purchase Power Standard Per KG
1466,AL,2018,1.1441,Purchase Power Standard Per KG
1467,AL,2019,1.212,Purchase Power Standard Per KG


In [11]:
#GINI index data
countryGini = pd.read_csv(countryGini_path)

countryGini = countryGini.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_il', 'OBS_FLAG'])
countryGini = countryGini.rename(columns={"OBS_VALUE":"gini"})

countryGini.shape
countryGini.info()

(458, 3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   geo          458 non-null    object 
 1   TIME_PERIOD  458 non-null    int64  
 2   gini         458 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 10.9+ KB


In [12]:
#Employment Rate Data
emplyomentRate = pd.read_csv(employmentRate_Path)
emplyomentRate = emplyomentRate.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'age', 'unit', 'indic_em'])
emplyomentRate = emplyomentRate.pivot(index=['geo', 'TIME_PERIOD'], columns='sex', values='OBS_VALUE').reset_index() 
emplyomentRate = emplyomentRate.rename(columns={"T":"emplyomentRate_T",
                                               "M":"emplyomentRate_M",
                                               "F":"emplyomentRate_F"})
emplyomentRate.shape
emplyomentRate.info()

(465, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465 entries, 0 to 464
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   geo               465 non-null    object 
 1   TIME_PERIOD       465 non-null    int64  
 2   emplyomentRate_F  465 non-null    float64
 3   emplyomentRate_M  465 non-null    float64
 4   emplyomentRate_T  465 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 18.3+ KB


In [13]:
#Median/Mean Income Data
income = pd.read_csv(income_Path)
income  = income[income['unit'].isin(['EUR'])]
income  = income[income['age'].isin(['TOTAL'])]
income = income.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'age', 'unit'])
income = income.pivot(index=['geo', 'TIME_PERIOD'], columns=['indic_il', 'sex'], values='OBS_VALUE').reset_index() 
income.shape
income.info()

(767, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   (geo, )          767 non-null    object
 1   (TIME_PERIOD, )  767 non-null    int64 
 2   (MED_E, F)       767 non-null    int64 
 3   (MEI_E, F)       767 non-null    int64 
 4   (MED_E, M)       767 non-null    int64 
 5   (MEI_E, M)       767 non-null    int64 
 6   (MED_E, T)       767 non-null    int64 
 7   (MEI_E, T)       767 non-null    int64 
dtypes: int64(7), object(1)
memory usage: 48.1+ KB


In [14]:
#Bird Biodiversity
birdBiodiversity = pd.read_csv(birdBiodiversity_Path)
birdBiodiversity = birdBiodiversity.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'unit'])
birdBiodiversity = birdBiodiversity.rename(columns={"OBS_VALUE":"birdBiodiversityIndex"})

birdBiodiversity.head()

Unnamed: 0,geo,TIME_PERIOD,birdBiodiversityIndex
0,AT,1998,100.0
1,AT,1999,102.3
2,AT,2000,98.5
3,AT,2001,91.3
4,AT,2002,92.7


In [15]:
#Standard Crop Production Import
stdProduction_lin = pd.read_csv(stdProduction_path)
stdProduction_lin = stdProduction_lin[stdProduction_lin['crops'].str.contains("0000")]
stdProduction_lin = stdProduction_lin[stdProduction_lin['strucpro'].isin(['AR', 'PR_HU_EU']) ]
stdProduction_lin  = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c','n'])]
stdProduction_lin_yield = stdProduction_lin[stdProduction_lin['strucpro'].isin(['YI_HU_EU']) ]
stdProduction = stdProduction_lin.pivot(index=['crops', 'geo', 'TIME_PERIOD'], columns='strucpro', values='OBS_VALUE').reset_index() 
stdProduction = stdProduction.dropna(subset=['AR', 'PR_HU_EU']) #Removes last NAN value in SET
#stdProduction = stdProduction[(stdProduction['AR'] != 0)] #Removes last NAN value in SET
stdProduction['area_HA'] = stdProduction['AR']*1000
stdProduction['tonnes'] = stdProduction['PR_HU_EU']*1000

In [16]:
missing_values_table(stdProduction)
print('Unique Geo:' + str(stdProduction.geo.nunique()))
print(stdProduction.groupby('TIME_PERIOD').geo.nunique())
stdProduction.describe()
stdProduction.info()

Your selected dataframe has 7 columns.
There are 0 columns that have missing values.


Unnamed: 0_level_0,Missing Values,% of Total Values
strucpro,Unnamed: 1_level_1,Unnamed: 2_level_1


Unique Geo:41
TIME_PERIOD
2000    39
2001    39
2002    39
2003    39
2004    39
2005    39
2006    40
2007    41
2008    41
2009    39
2010    41
2011    41
2012    41
2013    39
2014    41
2015    41
2016    41
2017    41
2018    41
2019    41
2020    40
2021    39
2022    31
Name: geo, dtype: int64


strucpro,TIME_PERIOD,AR,PR_HU_EU,area_HA,tonnes
count,7643.0,7643.0,7643.0,7643.0,7643.0
mean,2011.01714,841.903312,7012.652864,841903.3,7012653.0
std,6.362807,5007.016665,30225.550547,5007017.0,30225550.0
min,2000.0,0.0,0.0,0.0,0.0
25%,2006.0,2.24,16.2,2240.0,16200.0
50%,2011.0,24.4,248.51,24400.0,248510.0
75%,2017.0,207.85,2573.95,207850.0,2573950.0
max,2022.0,62062.44,326925.01,62062440.0,326925000.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7643 entries, 0 to 9402
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   crops        7643 non-null   object 
 1   geo          7643 non-null   object 
 2   TIME_PERIOD  7643 non-null   int64  
 3   AR           7643 non-null   float64
 4   PR_HU_EU     7643 non-null   float64
 5   area_HA      7643 non-null   float64
 6   tonnes       7643 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 477.7+ KB


In [17]:
del stdProduction_lin 
del stdProduction_lin_yield


In [18]:
#Organic Crop Production Import
orgArea_all = pd.read_csv(orgArea_path)
orgArea_total = orgArea_all[(orgArea_all['agprdmet'] == 'TOTAL') & (orgArea_all['unit']=='HA' ) & orgArea_all['crops'].str.contains("0000")]
orgArea_total = orgArea_total.rename(columns={"OBS_VALUE":"area_HA"})
orgArea_total.info()
orgArea_total = orgArea_total[~orgArea_total['OBS_FLAG'].isin(['c','n'])]
orgArea_total = orgArea_total.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'agprdmet', 'OBS_FLAG'])
orgArea_total = orgArea_total.dropna(subset=['area_HA']) #Removes last NAN value in SET
orgArea_total = orgArea_total[(orgArea_total['area_HA'] != 0)] #Removes 0 area values in SET which cause inf
missing_values_table(orgArea_total)
orgArea_total.head(25)

#del orgArea_all

orgTonne = pd.read_csv(orgTonne_path)
orgTonne = orgTonne[orgTonne['crops'].str.contains("0000")]
orgTonne = orgTonne.rename(columns={"OBS_VALUE":"tonnes"})
orgTonne.info()
orgTonne = orgTonne[~orgTonne['OBS_FLAG'].isin(['c','n'])]
orgTonne = orgTonne.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
missing_values_table(orgTonne)

orgProduction = pd.merge(orgArea_total, orgTonne, on=['crops', 'geo', 'TIME_PERIOD'], how='inner', suffixes=('_A','_T'))
orgProduction.head(100)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4169 entries, 1984 to 48046
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     4169 non-null   object 
 1   LAST UPDATE  4169 non-null   object 
 2   freq         4169 non-null   object 
 3   unit         4169 non-null   object 
 4   crops        4169 non-null   object 
 5   agprdmet     4169 non-null   object 
 6   geo          4169 non-null   object 
 7   TIME_PERIOD  4169 non-null   int64  
 8   area_HA      4152 non-null   float64
 9   OBS_FLAG     115 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 358.3+ KB
Your selected dataframe has 4 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


Unnamed: 0,crops,geo,TIME_PERIOD,area_HA
1984,C0000,AT,2012,97178.0
1985,C0000,AT,2013,100854.0
1986,C0000,AT,2014,97784.0
1987,C0000,AT,2015,95742.0
1988,C0000,AT,2016,102336.0
1989,C0000,AT,2017,114691.0
1990,C0000,AT,2018,123314.0
1991,C0000,AT,2019,137105.0
1992,C0000,AT,2020,133100.0
1993,C0000,BE,2012,4265.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2258 entries, 0 to 10855
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     2258 non-null   object 
 1   LAST UPDATE  2258 non-null   object 
 2   freq         2258 non-null   object 
 3   crops        2258 non-null   object 
 4   unit         2258 non-null   object 
 5   geo          2258 non-null   object 
 6   TIME_PERIOD  2258 non-null   int64  
 7   tonnes       2210 non-null   float64
 8   OBS_FLAG     112 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 176.4+ KB
Your selected dataframe has 4 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


Unnamed: 0,crops,geo,TIME_PERIOD,area_HA,tonnes
0,C0000,BE,2016,10314.0,32001.0
1,C0000,BE,2017,11474.0,43500.0
2,C0000,BE,2018,11651.0,46188.0
3,C0000,BE,2019,12199.0,46441.0
4,C0000,BE,2020,13179.0,54492.0
5,C0000,BG,2013,7669.0,3548.0
6,C0000,BG,2014,10795.0,7671.0
7,C0000,BG,2015,22191.0,5619.0
8,C0000,BG,2016,30940.0,5943.0
9,C0000,BG,2017,16602.0,16152.0


In [19]:
cropProd = pd.merge( stdProduction, orgProduction, on=['crops', 'geo', 'TIME_PERIOD'], how='inner', suffixes=('_std','_org'))
cropProd = pd.merge( cropProd, cropsCodes, on=['crops'], how='inner')
cropProd['geo'] = cropProd['geo'].astype('str') 
print(cropProd.groupby('TIME_PERIOD').geo.nunique())
hriPesticide.shape
cropProd.describe()
cropProd.info()
cropProd.head()

TIME_PERIOD
2012    14
2013    21
2014    23
2015    25
2016    26
2017    27
2018    25
2019    26
2020    27
Name: geo, dtype: int64


(279, 4)

Unnamed: 0,TIME_PERIOD,AR,PR_HU_EU,area_HA_std,tonnes_std,area_HA_org,tonnes_org
count,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0
mean,2016.53381,476.026821,3975.070143,476026.8,3975070.0,22389.996099,76349.14
std,2.361139,1356.307197,8976.976669,1356307.0,8976977.0,61974.583104,226199.4
min,2012.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,2015.0,6.55,81.635,6550.0,81635.0,248.5,1149.0
50%,2017.0,46.06,555.26,46060.0,555260.0,1840.0,5919.5
75%,2019.0,286.775,3531.51,286775.0,3531510.0,13830.25,42594.5
max,2020.0,11727.0,125594.63,11727000.0,125594600.0,637852.0,3193410.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1538 entries, 0 to 1537
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   crops        1538 non-null   object 
 1   geo          1538 non-null   object 
 2   TIME_PERIOD  1538 non-null   int64  
 3   AR           1538 non-null   float64
 4   PR_HU_EU     1538 non-null   float64
 5   area_HA_std  1538 non-null   float64
 6   tonnes_std   1538 non-null   float64
 7   area_HA_org  1538 non-null   float64
 8   tonnes_org   1538 non-null   float64
 9   crop_name    1538 non-null   object 
dtypes: float64(6), int64(1), object(3)
memory usage: 132.2+ KB


Unnamed: 0,crops,geo,TIME_PERIOD,AR,PR_HU_EU,area_HA_std,tonnes_std,area_HA_org,tonnes_org,crop_name
0,C0000,BE,2016,337.01,2228.79,337010.0,2228790.0,10314.0,32001.0,Cereals for the production of grain (including...
1,C0000,BE,2017,305.44,2642.25,305440.0,2642250.0,11474.0,43500.0,Cereals for the production of grain (including...
2,C0000,BE,2018,304.52,2431.04,304520.0,2431040.0,11651.0,46188.0,Cereals for the production of grain (including...
3,C0000,BE,2019,313.11,2816.04,313110.0,2816040.0,12199.0,46441.0,Cereals for the production of grain (including...
4,C0000,BE,2020,304.34,2565.7,304340.0,2565700.0,13179.0,54492.0,Cereals for the production of grain (including...


In [20]:

print(orgProduction.geo.unique())
print(stdProduction.geo.unique())
print(orgArea_all.geo.unique())
print(orgTonne.geo.unique())

['BE' 'BG' 'CY' 'CZ' 'EE' 'EL' 'ES' 'FI' 'FR' 'HR' 'HU' 'IE' 'IT' 'LT'
 'LU' 'LV' 'MT' 'NL' 'PL' 'RO' 'RS' 'SE' 'SI' 'SK' 'TR' 'UK' 'IS' 'DE']
['AL' 'AT' 'BA' 'BE' 'BG' 'CH' 'CY' 'CZ' 'DE' 'DK' 'EE' 'EL' 'ES' 'EU'
 'EU27_2020' 'EU28' 'FI' 'FR' 'HR' 'HU' 'IE' 'IS' 'IT' 'LT' 'LU' 'LV' 'ME'
 'MK' 'MT' 'NL' 'NO' 'PL' 'PT' 'RO' 'RS' 'SE' 'SI' 'SK' 'TR' 'UK' 'XK']
['AL' 'BE' 'BG' 'CY' 'CZ' 'DK' 'EE' 'EL' 'ES' 'FI' 'FR' 'HR' 'HU' 'IE'
 'IS' 'IT' 'LT' 'LU' 'LV' 'ME' 'MK' 'MT' 'NL' 'NO' 'PL' 'PT' 'RO' 'RS'
 'SE' 'SI' 'SK' 'TR' 'UK' 'AT' 'CH' 'DE' 'EU' 'EU27_2020' 'EU28']
['AL' 'BE' 'BG' 'CY' 'CZ' 'EE' 'EL' 'ES' 'FI' 'FR' 'HR' 'HU' 'IE' 'IT'
 'LT' 'LU' 'LV' 'ME' 'MK' 'MT' 'NL' 'PL' 'RO' 'RS' 'SE' 'SI' 'SK' 'TR'
 'UK' 'IS' 'DE' 'NO']


# Variable Creation

In [21]:
cropProd['tonne_per_HA_org'] = cropProd['tonnes_org']/cropProd['area_HA_org']
cropProd['tonne_per_HA_std'] = cropProd['tonnes_std']/cropProd['area_HA_std']
cropProd['util_ratio'] = cropProd['tonne_per_HA_org']/cropProd['tonne_per_HA_std']

cropProdTotals = cropProd[cropProd['crops'].str.contains("0000")]

cropProdTotals_Geo_Y= cropProdTotals.groupby(['geo', 'TIME_PERIOD']).sum(numeric_only = True).reset_index()
#cropProdTotals_Geo= cropProdTotals.groupby(['geo']).sum(numeric_only = True).reset_index()
#cropProdTotals_crop_Y= cropProdTotals.groupby(['crops', 'crop_name', 'TIME_PERIOD']).sum(numeric_only = True).reset_index()
#cropProdTotals_crop= cropProdTotals.groupby(['crops', 'crop_name']).sum(numeric_only = True).reset_index()

cropProdTotals_Geo_Y.shape
cropProdTotals_Geo_Y.head(10)
#print(cropProdTotals.groupby(['TIME_PERIOD', 'geo']).size())

(214, 11)

Unnamed: 0,geo,TIME_PERIOD,AR,PR_HU_EU,area_HA_std,tonnes_std,area_HA_org,tonnes_org,tonne_per_HA_org,tonne_per_HA_std,util_ratio
0,BE,2016,499.46,7483.71,499460.0,7483710.0,15308.0,90535.0,79.683771,162.655464,4.119228
1,BE,2017,637.99,18600.16,637990.0,18600160.0,17927.0,146702.0,112.527456,214.319434,4.589758
2,BE,2018,640.27,16426.22,640270.0,16426220.0,20192.0,187259.0,112.700447,220.25916,4.525321
3,BE,2019,653.07,18076.98,653070.0,18076980.0,21896.0,246886.0,132.99886,228.08827,5.104597
4,BE,2020,487.07,8198.58,487070.0,8198580.0,22025.0,225947.0,98.99638,166.582988,4.507809
5,BG,2013,3233.5,13662.96,3233500.0,13662960.0,22052.0,17684.0,12.525268,74.08986,0.869984
6,BG,2014,3285.79,14743.33,3285790.0,14743330.0,34754.0,49209.0,28.855741,83.517525,3.329841
7,BG,2015,2129.13,11937.95,2129130.0,11937950.0,83218.0,58319.0,19.862904,88.428932,1.940963
8,BG,2016,2180.51,12913.34,2180510.0,12913340.0,114613.0,78194.0,13.386667,87.760039,1.324553
9,BG,2017,2129.44,13818.48,2129440.0,13818480.0,95178.0,82254.0,17.621542,98.963822,1.938072


In [22]:
#%whos DataFrame
hriPesticide.head()
hriPesticide.shape

Unnamed: 0,subst_cat,geo,TIME_PERIOD,harmRiskInd
638,HRI1,AT,2011,102
639,HRI1,AT,2012,106
640,HRI1,AT,2013,92
641,HRI1,AT,2014,95
642,HRI1,AT,2015,101


(279, 4)

In [23]:
import functools as ft
extraVars = [hriPesticide, orgProcessors, orgAreaUtil, countryGini, cropProdTotals_Geo_Y, birdBiodiversity, emplyomentRate, income, fertUse, productivityIndex, wasteGeneration]
rds = ft.reduce(lambda left, right: pd.merge(left,right, how='left', on=['geo', 'TIME_PERIOD']), extraVars)
rds.shape
rds.head(20)
#print(extraVars_df.groupby(['TIME_PERIOD', 'geo']).size())
missing_values_table(rds)#Will likely keep NAN values and use as a category when clustered to create scorecard perhaps?


merging between different levels is deprecated and will be removed in a future version. (1 levels on the left, 2 on the right)


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



(279, 33)

Unnamed: 0,subst_cat,geo,TIME_PERIOD,harmRiskInd,nace_r2,numOrganicProcessors,areaUsedForOrganic,gini,AR,PR_HU_EU,...,"(MED_E, M)","(MEI_E, M)","(MED_E, T)","(MEI_E, T)",N_use_tonne,P_use_tonne,productivity,productivity_unit,waste,waste_unit
0,HRI1,AT,2011,102,,,19.6,27.4,,,...,21884.0,24422.0,21463.0,23922.0,98161.0,9947.0,1.6311,Purchase Power Standard Per KG,5647.5,KG per Capita
1,HRI1,AT,2012,106,,,18.62,27.6,,,...,22519.0,25112.0,21807.0,24423.0,107895.0,12301.0,1.7368,Purchase Power Standard Per KG,5699.0,KG per Capita
2,HRI1,AT,2013,92,,,18.4,27.0,,,...,22750.0,25000.0,22073.0,24366.0,110626.0,14234.0,1.7932,Purchase Power Standard Per KG,6118.0,KG per Capita
3,HRI1,AT,2014,95,,,19.35,27.6,,,...,23741.0,26649.0,23211.0,26080.0,121562.0,14110.0,1.826,Purchase Power Standard Per KG,6537.0,KG per Capita
4,HRI1,AT,2015,101,,,20.3,27.2,,,...,23856.0,26715.0,23260.0,25958.0,124078.0,12495.0,1.9358,Purchase Power Standard Per KG,6772.5,KG per Capita
5,HRI1,AT,2016,115,,,21.25,27.2,,,...,24111.0,26676.0,23694.0,26054.0,132031.0,15451.0,1.8986,Purchase Power Standard Per KG,7008.0,KG per Capita
6,HRI1,AT,2017,116,,,23.37,27.9,,,...,25225.0,28139.0,24752.0,27629.0,111884.0,11789.0,1.9554,Purchase Power Standard Per KG,7218.0,KG per Capita
7,HRI1,AT,2018,129,,,24.08,26.8,,,...,25922.0,28381.0,25176.0,27804.0,100096.0,13110.0,2.0265,Purchase Power Standard Per KG,7428.0,KG per Capita
8,HRI1,AT,2019,121,,,25.33,27.5,,,...,26155.0,29082.0,25729.0,28568.0,102812.0,13238.0,2.0765,Purchase Power Standard Per KG,7578.0,KG per Capita
9,HRI1,AT,2020,135,,,25.69,27.0,,,...,26958.0,29963.0,26555.0,29503.0,117321.0,12208.0,1.9472,Purchase Power Standard Per KG,7728.0,KG per Capita


Your selected dataframe has 33 columns.
There are 27 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
tonne_per_HA_std,82,29.4
util_ratio,82,29.4
AR,82,29.4
PR_HU_EU,82,29.4
area_HA_std,82,29.4
tonnes_std,82,29.4
area_HA_org,82,29.4
tonnes_org,82,29.4
tonne_per_HA_org,82,29.4
nace_r2,71,25.4


In [24]:
#rds.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\referenceDataSet.csv")

In [25]:
#cropProdTotals_Geo_Y.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_Geo_Y.csv")
#cropProdTotals_crop_Y.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_Crop_Y.csv")