# Section 0 Defining modules/libraries/functions

In [1]:
import glob
import pandas as pd
import numpy as np
from pathlib import Path
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100


In [2]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns


# Section 1 Importing Data

In [3]:
cropsCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/crops.dic", sep='\t',names=['crops', 'crop_name'], header = None)
strucproCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/strucpro.dic",sep='\t',names=['code', 'units'], header = None)
unitCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/unit.dic",sep='\t',names=['code', 'units'], header = None)
fordCodes = pd.read_csv("C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/ford.dic",sep='\t',names=['ford', 'units'], header = None)

In [4]:

#Organic Production Data
stdProduction_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\apro_cpsh1_linear.csv")
orgTonne_path= Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_croppro_linear.csv")
orgArea_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_cropar_linear.csv")

#Harmonised Risk Index 1 Data
hriPath = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Pesiticide Use Risk Indicator\aei_hri_linear.csv")

#Organic Processors Data
orgProcessors_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Processors\org_cpreact_linear.csv")

#Organic Area Utilisation Data
orgAreaUtil_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Area\sdg_02_40_linear.csv")

#N and P Fertilizer Data
fertUse_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Fertlizer Use\aei_fm_usefert_linear.csv")

#Waste Generation Data
wasteGeneration_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Resource Usage\cei_pc034_linear.csv")

#National Productivity Data
productivityIndex_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Resource Usage\cei_pc030_linear.csv")

#Country Gini Data
countryGini_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Quality of Life\tessi190_linear.csv")

#Employment Rate Data
employmentRate_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\tesem010_linear.csv")

#Median/Mean Income Data
income_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\ilc_di03_linear.csv")

#Biodiversity of Birds Data
birdBiodiversity_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Biodiversity Index\env_bio2_linear.csv")

#Pesticide Sales
pestSales_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Pesticide Sales\aei_fm_salpest09_linear.csv")

#Pesticide Use
pestUse_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\aei_pestuse_linear.csv")

#Farm Structure
farmStructure_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Farm Structure\ef_lac_main_linear.csv");

#Higher Education Spending
higherEdu_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\rd_e_gerdtot_linear.csv");

#R&D Spending
research_Path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Economics\rd_e_gerdsc_linear.csv");

In [5]:
#Harmonised Risk Index 1 Data
hriPesticide = pd.read_csv(hriPath)
hriPesticide = hriPesticide[hriPesticide['subst_cat'].isin(['HRI1'])]
hriPesticide = hriPesticide[~hriPesticide['geo'].isin(['EU', 'EU27_2020', 'EU28', 'UK'])]
hriPesticide = hriPesticide.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
hriPesticide = hriPesticide.rename(columns={"OBS_VALUE":"harmRiskInd"})
#print(hriPesticide.groupby('TIME_PERIOD').geo.nunique())
#hriPesticide.shape
#hriPesticide.info()
hriPesticide.describe()
#hriPesticide.head(25)

Unnamed: 0,TIME_PERIOD,harmRiskInd
count,270.0,270.0
mean,2015.5,91.403704
std,2.877615,26.279665
min,2011.0,17.0
25%,2013.0,77.0
50%,2015.5,93.0
75%,2018.0,103.0
max,2020.0,242.0


In [6]:
#Organic Processors
orgProcessors = pd.read_csv(orgProcessors_path)
orgProcessors  = orgProcessors[orgProcessors['nace_r2'].isin(['C103','C101', 'C102', 'C104', 'C105', 'C109' 'C106'])]
orgProcessors  = orgProcessors[orgProcessors['unit'].isin(['NR'])]
orgProcessors = orgProcessors.drop(columns = ['DATAFLOW', 'LAST UPDATE','freq', 'unit', 'OBS_FLAG'])
orgProcessors = orgProcessors.pivot(index=['geo', 'TIME_PERIOD'], columns='nace_r2', values='OBS_VALUE').reset_index() 
orgProcessors = orgProcessors.rename(columns={"OBS_VALUE":"numOrganicProcessors"})
#print(orgProcessors.groupby('TIME_PERIOD').geo.size())
#print(orgProcessors.groupby('TIME_PERIOD').geo.nunique())
#print(orgProcessors.groupby(['TIME_PERIOD', 'geo']).geo.size())
orgProcessors.shape
#orgProcessors.info()
orgProcessors.head()

nace_r2,geo,TIME_PERIOD,C101,C102,C103,C104,C105
0,BE,2012,99.0,11.0,192.0,34.0,79.0
1,BE,2013,71.0,7.0,113.0,22.0,58.0
2,BE,2014,76.0,8.0,116.0,24.0,64.0
3,BE,2015,90.0,10.0,134.0,29.0,83.0
4,BE,2016,93.0,9.0,161.0,29.0,67.0


In [7]:
#Organic Area Utilisation Data
orgAreaUtil = pd.read_csv(orgAreaUtil_path)
orgAreaUtil = orgAreaUtil.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'crops','freq', 'agprdmet', 'unit', 'OBS_FLAG'])
orgAreaUtil = orgAreaUtil.rename(columns={"OBS_VALUE":"areaUsedForOrganic_PCT"})
#orgAreaUtil.shape
#orgAreaUtil.info()
#orgAreaUtil.head(10)

Can apply an interesting step here by using left joins to exclude unwanted data later.

The dataset imported and pivoted/transposed below contains more than just the Countries required, it contains subdivisions indicated by numbers beside the country name. Instead of manually writing code that excludes these cases, this set will be left joined later, which will exclude the data automatically.

In [8]:
#N and P Fertilizer Data
fertUse = pd.read_csv(fertUse_path)
fertUse = fertUse.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
fertUse = fertUse.pivot(index=['geo', 'TIME_PERIOD'], columns='nutrient', values='OBS_VALUE').reset_index() 
fertUse = fertUse.rename(columns={"N":"N_use_tonne",
                                 "P":"P_use_tonne"})
#print(fertUseGroup3.groupby('TIME_PERIOD').geo.nunique())
#fertUse.shape
#fertUse.info()

Below I am imputing the missing odd years by averaging the two around it. By treating the numbers as a trend.

In [9]:
#Waste Generation Data
wasteGeneration = pd.read_csv(wasteGeneration_path)
wasteGeneration = wasteGeneration.drop(columns = ['DATAFLOW', 'LAST UPDATE','nace_r2', 'unit','freq', 'OBS_FLAG', 'hazard', 'waste'])
wasteGeneration["waste_unit"] = "KG per Capita"
wasteGeneration = wasteGeneration.rename(columns={"OBS_VALUE":"waste"})

wasteGeneration_lag = wasteGeneration
wasteGeneration_lag['TIME_LAG'] = wasteGeneration_lag.groupby(['geo'])['TIME_PERIOD'].shift(1)
wasteGeneration_lag['waste_lag'] = wasteGeneration_lag.groupby(['geo'])['waste'].shift(1)

wasteGeneration_lag['waste_temp'] = ((wasteGeneration_lag['waste'] + wasteGeneration_lag['waste_lag'])/2)
wasteGeneration_lag['TIME_PERIOD_temp'] = ((wasteGeneration_lag['TIME_PERIOD'] + wasteGeneration_lag['TIME_LAG'])/2)
wasteGeneration_lag = wasteGeneration_lag.drop(columns = ['TIME_PERIOD', 'TIME_LAG','waste', 'waste_lag'])
wasteGeneration_lag = wasteGeneration_lag.dropna()
wasteGeneration_lag = wasteGeneration_lag.rename(columns={"waste_temp":"waste",
                                                           "TIME_PERIOD_temp":"TIME_PERIOD"})
wasteGeneration_lag['TIME_PERIOD'] = wasteGeneration_lag['TIME_PERIOD'].astype('int')
wasteGeneration = pd.concat([wasteGeneration, wasteGeneration_lag], join='inner', ignore_index=True)

#wasteGeneration.shape
#wasteGeneration.info()

In [10]:
#National Productivity Data
productivityIndex = pd.read_csv(productivityIndex_Path)
productivityIndex  = productivityIndex[productivityIndex['unit'].isin(['PPS_KG'])]
productivityIndex = productivityIndex.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'unit','freq', 'OBS_FLAG'])
productivityIndex["productivity_unit"] = "Purchase Power Standard Per KG"
productivityIndex = productivityIndex.rename(columns={"OBS_VALUE":"productivity"})
#productivityIndex.shape
#productivityIndex.info()
#productivityIndex.head(20)

In [11]:
#GINI index data
countryGini = pd.read_csv(countryGini_path)

countryGini = countryGini.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'indic_il', 'OBS_FLAG'])
countryGini = countryGini.rename(columns={"OBS_VALUE":"gini"})

#countryGini.shape
#countryGini.info()

In [12]:
#Employment Rate Data
emplyomentRate = pd.read_csv(employmentRate_Path)
emplyomentRate = emplyomentRate.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'age', 'unit', 'indic_em'])
emplyomentRate = emplyomentRate.pivot(index=['geo', 'TIME_PERIOD'], columns='sex', values='OBS_VALUE').reset_index() 
emplyomentRate = emplyomentRate.rename(columns={"T":"emplyomentRate_T",
                                               "M":"emplyomentRate_M",
                                               "F":"emplyomentRate_F"})
#emplyomentRate.shape
#emplyomentRate.info()

In [13]:
#Median/Mean Income Data
income = pd.read_csv(income_Path)
income  = income[income['unit'].isin(['EUR'])]
income  = income[income['age'].isin(['TOTAL'])]
income = income.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'age', 'unit'])
income = income.pivot(index=['geo', 'TIME_PERIOD'], columns=['indic_il', 'sex'], values='OBS_VALUE').reset_index() 
income.columns = [''.join(col) for col in income.columns.values]
#income.head()
#income.shape
#income.info()


In [14]:
#Bird Biodiversity
birdBiodiversity = pd.read_csv(birdBiodiversity_Path)
birdBiodiversity = birdBiodiversity.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'unit'])
birdBiodiversity = birdBiodiversity.rename(columns={"OBS_VALUE":"birdBiodiversityIndex"})

#birdBiodiversity.head()

In [15]:
# Pest Use
pestUse= pd.read_csv(pestUse_Path)
#pestUse = pestUse.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG'])
#pestSales = pestSales[(pestSales['TIME_PERIOD'].isin([2016,2018]))]

pestUse['TIME_PERIOD'] = pestUse['TIME_PERIOD'].apply(lambda x: str(x)+ '_y')
pestUse = pestUse[(pestUse['unit'].isin(['KG']))]
pestUse = pestUse[(pestUse['pesticid'].str.contains("_"))]
pestUse  = pestUse[~pestUse['OBS_FLAG'].isin(['c'])]
pestUse['OBS_VALUE'] = np.where(pestUse.OBS_FLAG == 'n', 0, pestUse.OBS_VALUE)
#pestSales = pestSales[(pestSales['pesticid'].str.isdigit()==True)]

#print(pestSales.pesticid.unique())
#pestSales.head()

pestUse = pestUse.pivot(index=['pesticid','crops'], columns =['geo', 'TIME_PERIOD'], values='OBS_VALUE').reset_index() 
pestUse.columns = ['_'.join(col) for col in pestUse.columns.values]

#pestUse.head(30)
#pestUse.shape
#missing_values_table(pestUse)


In [16]:
# Pest Sales for Stats Analysis
pestSales= pd.read_csv(pestSales_Path)
pestSales['mainFert'] = pestSales['pesticid'].str.contains(r'[0-9]')
pestSales = pestSales.dropna(subset=['OBS_VALUE'])
pestSales = pestSales.drop(pestSales[pestSales['mainFert'] == True].index)
pestSales = pestSales.drop(pestSales[pestSales['pesticid'] == 'TOTAL'].index)
pestSales = pestSales.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'unit', 'mainFert'])

pestSales = pestSales.sort_values('OBS_VALUE').drop_duplicates(['geo', 'TIME_PERIOD'], keep='last')
pestSales = pestSales.rename(columns={"OBS_VALUE":"pest_KG",
                                     "pesticid" : "mostFrequentPest"})
pestSales.shape
#pestSales.head(300)


#pestSales = pestSales.pivot(index=['pesticid'], columns =['TIME_PERIOD', 'geo'], values='OBS_VALUE').reset_index() 
#pestSales.columns = ['_'.join(col) for col in pestSales.columns.values]

#pestSales.head(30)


(307, 4)

In [17]:
orgTonne = pd.read_csv(orgTonne_path)
orgTonne = orgTonne[orgTonne['crops'].str.contains("0000")]
orgTonne = orgTonne.drop(orgTonne[orgTonne['OBS_VALUE'] == 0].index)
orgTonne = orgTonne.rename(columns={"OBS_VALUE":"orgTonnes",
                                   "crops":"mostGrownOrganic"})
#orgTonne.info()
orgTonne = orgTonne[~orgTonne['OBS_FLAG'].isin(['c','n'])]
orgTonne = orgTonne.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
orgTonne = orgTonne.sort_values('orgTonnes').drop_duplicates(['geo', 'TIME_PERIOD'], keep='last')

#print(orgTonne.head(10))
#missing_values_table(orgTonne)

In [18]:
stdProduction_lin = pd.read_csv(stdProduction_path)
stdProduction_lin = stdProduction_lin[stdProduction_lin['crops'].str.contains("0000")]
stdProduction_lin = stdProduction_lin[stdProduction_lin['strucpro'].isin(['PR_HU_EU']) ]
stdProduction_lin  = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c','n'])]
stdProduction_lin = stdProduction_lin.drop(stdProduction_lin[stdProduction_lin['OBS_VALUE'] == 0].index)
stdProduction_lin = stdProduction_lin.dropna(subset = ['OBS_VALUE'])
stdProduction_lin['OBS_VALUE'] = stdProduction_lin['OBS_VALUE']*1000
stdProduction_lin = stdProduction_lin.rename(columns={"OBS_VALUE":"stdTonnes",
                                                      "crops":"mostGrownStd"})
#orgTonne.info()
stdProduction_lin = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c','n'])]
stdProduction_lin = stdProduction_lin.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'strucpro'])
stdProduction_lin = stdProduction_lin.sort_values('stdTonnes').drop_duplicates(['geo', 'TIME_PERIOD'], keep='last')

#print(stdProduction_lin.head(300))
#missing_values_table(orgTonne)

In [19]:
higherEdu = pd.read_csv(higherEdu_Path)

higherEdu = higherEdu[higherEdu['sectperf'].isin(['HES']) ]
higherEdu = higherEdu[higherEdu['unit'].isin(['EUR_HAB']) ]
higherEdu = higherEdu.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'OBS_FLAG', 'unit', 'sectperf'])
higherEdu = higherEdu.rename(columns={"OBS_VALUE":"eduSpend_eur_hab"})

#print(higherEdu.groupby('TIME_PERIOD').geo.nunique())

In [43]:
research = pd.read_csv(research_Path)
research = pd.merge(research, fordCodes, on=['ford'], how='inner')

research = research[research['sectperf'].isin(['GOV']) ]
research = research[research['ford'].isin(['FORD1','FORD2','FORD3','FORD4', 'FORD5', 'FORD6', 'FORD401', 'FORD402', 'FORD403', 'FORD404', 'FORD405', 'FORD504', 'FORD303', 'FORD208']) ]
research = research[research['unit'].isin(['EUR_HAB']) ]
research = research.pivot(index=['geo', 'TIME_PERIOD'], columns='units', values='OBS_VALUE').reset_index() 

print(missing_values_table(research))

#Unfortunately, the data on specific agricultural details was not readily avialable and had too many nulls, will delete columns with greater than 10% nulls as they are effectively useless.
researchColumns = list(research.drop(research.filter(regex=r'geo|TIME_PERIOD').columns, axis=1))
research_null = research[researchColumns].isnull().sum() / len(research)
missing_features_research = list(research_null[research_null > 0.1].index)
research = research.drop(missing_features_research, axis=1)
research = research.fillna(0)

print(missing_values_table(research))
print(research.shape)

Your selected dataframe has 16 columns.
There are 13 columns that have missing values.
                                      Missing Values  % of Total Values
units                                                                  
Environmental biotechnology                      667               96.2
Veterinary science                               657               94.8
Animal and dairy science                         654               94.4
Health sciences                                  652               94.1
Other agricultural sciences                      649               93.7
Agricultural biotechnology                       648               93.5
Agriculture, forestry, and fisheries             638               92.1
Sociology                                        637               91.9
Humanities                                        37                5.3
Agricultural sciences                             18                2.6
Medical and health sciences                      

In [49]:
#Standard Crop Production Import
stdProduction_lin = pd.read_csv(stdProduction_path)
stdProduction_lin = stdProduction_lin[stdProduction_lin['crops'].str.contains("0000")]
stdProduction_lin = stdProduction_lin[stdProduction_lin['strucpro'].isin(['AR', 'PR_HU_EU']) ]
stdProduction_lin  = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c','n'])]
stdProduction_lin_yield = stdProduction_lin[stdProduction_lin['strucpro'].isin(['YI_HU_EU']) ]
stdProduction = stdProduction_lin.pivot(index=['crops', 'geo', 'TIME_PERIOD'], columns='strucpro', values='OBS_VALUE').reset_index() 
stdProduction = stdProduction.dropna(subset=['AR', 'PR_HU_EU']) #Removes last NAN value in SET
#stdProduction = stdProduction[(stdProduction['AR'] != 0)] #Removes last NAN value in SET
stdProduction['area_HA'] = stdProduction['AR']*1000
stdProduction['tonnes'] = stdProduction['PR_HU_EU']*1000

In [50]:
missing_values_table(stdProduction)
print('Unique Geo:' + str(stdProduction.geo.nunique()))
print(stdProduction.groupby('TIME_PERIOD').geo.nunique())
stdProduction.describe()
stdProduction.info()

Your selected dataframe has 7 columns.
There are 0 columns that have missing values.
Unique Geo:41
TIME_PERIOD
2000    39
2001    39
2002    39
2003    39
2004    39
2005    39
2006    40
2007    41
2008    41
2009    39
2010    41
2011    41
2012    41
2013    39
2014    41
2015    41
2016    41
2017    41
2018    41
2019    41
2020    40
2021    39
2022    31
Name: geo, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7643 entries, 0 to 9402
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   crops        7643 non-null   object 
 1   geo          7643 non-null   object 
 2   TIME_PERIOD  7643 non-null   int64  
 3   AR           7643 non-null   float64
 4   PR_HU_EU     7643 non-null   float64
 5   area_HA      7643 non-null   float64
 6   tonnes       7643 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 477.7+ KB


In [51]:
#Organic Crop Production Import
orgArea_all = pd.read_csv(orgArea_path)
orgArea_total = orgArea_all[(orgArea_all['agprdmet'] == 'TOTAL') & (orgArea_all['unit']=='HA' ) & orgArea_all['crops'].str.contains("0000")]
orgArea_total = orgArea_total.rename(columns={"OBS_VALUE":"area_HA"})
orgArea_total.info()
orgArea_total = orgArea_total[~orgArea_total['OBS_FLAG'].isin(['c','n'])]
orgArea_total = orgArea_total.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'agprdmet', 'OBS_FLAG'])
orgArea_total = orgArea_total.dropna(subset=['area_HA']) #Removes last NAN value in SET
orgArea_total = orgArea_total[(orgArea_total['area_HA'] != 0)] #Removes 0 area values in SET which cause inf
missing_values_table(orgArea_total)
orgArea_total.head(25)

#del orgArea_all

orgTonne = pd.read_csv(orgTonne_path)
orgTonne = orgTonne[orgTonne['crops'].str.contains("0000")]
orgTonne = orgTonne.rename(columns={"OBS_VALUE":"tonnes"})
orgTonne.info()
orgTonne = orgTonne[~orgTonne['OBS_FLAG'].isin(['c','n'])]
orgTonne = orgTonne.drop(columns = ['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
missing_values_table(orgTonne)

orgProduction = pd.merge(orgArea_total, orgTonne, on=['crops', 'geo', 'TIME_PERIOD'], how='inner', suffixes=('_A','_T'))
orgProduction.head(100)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4169 entries, 1984 to 48046
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     4169 non-null   object 
 1   LAST UPDATE  4169 non-null   object 
 2   freq         4169 non-null   object 
 3   unit         4169 non-null   object 
 4   crops        4169 non-null   object 
 5   agprdmet     4169 non-null   object 
 6   geo          4169 non-null   object 
 7   TIME_PERIOD  4169 non-null   int64  
 8   area_HA      4152 non-null   float64
 9   OBS_FLAG     115 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 358.3+ KB
Your selected dataframe has 4 columns.
There are 0 columns that have missing values.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2258 entries, 0 to 10855
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   DATAFLOW     2258 non-null   object 
 1 

Unnamed: 0,crops,geo,TIME_PERIOD,area_HA,tonnes
0,C0000,BE,2016,10314.0,32001.0
1,C0000,BE,2017,11474.0,43500.0
2,C0000,BE,2018,11651.0,46188.0
3,C0000,BE,2019,12199.0,46441.0
4,C0000,BE,2020,13179.0,54492.0
5,C0000,BG,2013,7669.0,3548.0
6,C0000,BG,2014,10795.0,7671.0
7,C0000,BG,2015,22191.0,5619.0
8,C0000,BG,2016,30940.0,5943.0
9,C0000,BG,2017,16602.0,16152.0


In [None]:
cropProd = pd.merge( stdProduction, orgProduction, on=['crops', 'geo', 'TIME_PERIOD'], how='inner', suffixes=('_std','_org'))
cropProd = pd.merge( cropProd, cropsCodes, on=['crops'], how='inner')
cropProd['geo'] = cropProd['geo'].astype('str') 
print(cropProd.groupby('TIME_PERIOD').geo.nunique())
hriPesticide.shape
cropProd.describe()
cropProd.info()
cropProd.head()

In [None]:
cropProd['tonne_per_HA_org'] = cropProd['tonnes_org']/cropProd['area_HA_org']
cropProd['tonne_per_HA_std'] = cropProd['tonnes_std']/cropProd['area_HA_std']
cropProd['util_ratio'] = cropProd['tonne_per_HA_org']/cropProd['tonne_per_HA_std']

cropProdTotals = cropProd[cropProd['crops'].str.contains("0000")]

cropProdTotals_Geo_Y= cropProdTotals.groupby(['geo', 'TIME_PERIOD']).sum(numeric_only = True).reset_index()
#cropProdTotals_Geo= cropProdTotals.groupby(['geo']).sum(numeric_only = True).reset_index()
#cropProdTotals_crop_Y= cropProdTotals.groupby(['crops', 'crop_name', 'TIME_PERIOD']).sum(numeric_only = True).reset_index()
#cropProdTotals_crop= cropProdTotals.groupby(['crops', 'crop_name']).sum(numeric_only = True).reset_index()

cropProdTotals_Geo_Y.shape
cropProdTotals_Geo_Y.head(10)
#print(cropProdTotals.groupby(['TIME_PERIOD', 'geo']).size())

# Variable Creation

In [21]:
#%whos DataFrame
#hriPesticide.head()
hriPesticide.shape

(270, 4)

In [39]:
import functools as ft
#extraVars = [hriPesticide, orgProcessors, orgAreaUtil, countryGini, cropProdTotals_Geo_Y, birdBiodiversity, emplyomentRate, income, fertUse, productivityIndex, wasteGeneration]
extraVars = [hriPesticide, orgProcessors, orgAreaUtil, countryGini, birdBiodiversity, emplyomentRate, income, fertUse, productivityIndex, wasteGeneration, pestSales, orgTonne, stdProduction_lin, higherEdu, research]
rds = ft.reduce(lambda left, right: pd.merge(left,right, how='left', on=['geo', 'TIME_PERIOD']), extraVars)

#rds.shape
#rds.head(20)
#print(extraVars_df.groupby(['TIME_PERIOD', 'geo']).size())
#missing_values_table(rds)#Will likely keep NAN values and use as a category when clustered to create scorecard perhaps?

In [40]:

rds["mostGrownOrganic"] = rds["mostGrownOrganic"].fillna("NoOrganic")
rds["orgTonnes"] = rds["orgTonnes"].fillna(0)
rds["mostGrownStd"] = rds["mostGrownStd"].fillna("NoStd")
rds["stdTonnes"] = rds["stdTonnes"].fillna(0)
rds["mostFrequentPest"] = rds["mostFrequentPest"].fillna("NoPest")
rds["C101"] = rds["C101"].fillna(0)
rds["C102"] = rds["C102"].fillna(0)
rds["C103"] = rds["C103"].fillna(0)
rds["C104"] = rds["C104"].fillna(0)
rds["C105"] = rds["C105"].fillna(0)


organicGrown = pd.get_dummies(rds["mostGrownOrganic"], prefix='org_')
stdGrown = pd.get_dummies(rds["mostGrownStd"], prefix='std_')
pestMax = pd.get_dummies(rds["mostFrequentPest"])

rds=pd.merge(rds,organicGrown, left_index=True, right_index=True)
rds=pd.merge(rds,stdGrown, left_index=True, right_index=True)
rds=pd.merge(rds,pestMax, left_index=True, right_index=True)

#rds.head(20)

In [47]:
missing_values_table(rds)

rdsColumns = list(rds.drop(rds.filter(regex=r'geo|TIME_PERIOD').columns, axis=1))
rds_null = rds[rdsColumns].isnull().sum() / len(rds)
missing_features_rds = list(rds_null[rds_null > 0.1].index)
rds = rds.drop(missing_features_rds, axis=1)

rds['N_use_tonne'] = rds['N_use_tonne'].fillna(rds.groupby('geo')['N_use_tonne'].transform('mean'))
rds['P_use_tonne'] = rds['P_use_tonne'].fillna(rds.groupby('geo')['P_use_tonne'].transform('mean'))
rds['pest_KG'] = rds['pest_KG'].fillna(rds.groupby('geo')['pest_KG'].transform('mean'))
rds['waste'] = rds['waste'].fillna(rds.groupby('geo')['waste'].transform('mean'))
rds['areaUsedForOrganic_PCT'] = rds['areaUsedForOrganic_PCT'].fillna(rds.groupby('geo')['areaUsedForOrganic_PCT'].transform('mean'))

missing_values_table(rds)

Your selected dataframe has 50 columns.
There are 6 columns that have missing values.
Your selected dataframe has 50 columns.
There are 1 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
waste_unit,2,0.7


In [48]:
rds.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\referenceDataSet.csv")

In [26]:
#cropProdTotals_Geo_Y.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\cropProdTotals_Geo_Y.csv")
#pestSales.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\pestSales.csv")
#pestUse.to_csv(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Datasets\pestUse.csv")