# Section 0: Importing modules/libraries and defining helper functions

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import plot 
import plotly.io as io
#io.renderers.default='browser'
#import plotly.io as pio
#pio.renderers.default='svg'
from pingouin import kruskal
#Declare FilePaths
from pathlib import Path

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line message giving the total number of columns in ``df``
    and how many of them contain at least one null.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with one row per column that has
        at least one null. It has two columns: ``'Missing Values'`` (null
        count) and ``'% of Total Values'`` (nulls as a percentage of the row
        count, rounded to one decimal). Rows are sorted by percentage,
        largest first.
    """
    missing_count = df.isnull().sum()
    row_count = len(df)
    if row_count:
        missing_percent = 100 * missing_count / row_count
    else:
        # A frame with no rows has no missing cells. Dividing by zero rows
        # would give NaN percentages, which compare unequal to 0 and would
        # make every column look incomplete.
        missing_percent = missing_count.astype(float)

    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Select on the integer count so the filter does not depend on the
    # floating-point percentage.
    summary = (
        summary[summary['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )
    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {summary.shape[0]} columns that have missing values."
    )
    return summary


# Section 1 Importing Data

In [None]:
# Eurostat code dictionaries: tab-separated files without a header row, so the
# column names are supplied here.
cropsCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/crops.dic",
    header=None,
    names=['crops', 'crop_name'],
    sep='\t',
)
strucproCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/strucpro.dic",
    header=None,
    names=['code', 'units'],
    sep='\t',
)
unitCodes = pd.read_csv(
    "C:/Users/cianw/Documents/dataAnalytics/CA2/Data/Eurostat/Code Dictionary/unit.dic",
    header=None,
    names=['code', 'units'],
    sep='\t',
)

In [45]:
# Absolute locations of the Eurostat "linear" CSV extracts read by the cells
# below. The paths are machine-specific (Windows user profile) and must be
# edited before running the notebook elsewhere.

# Standard crop production (apro_cpsh1).
stdProduction_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\apro_cpsh1_linear.csv")
# Organic crop production (org_croppro) and organic crop area (org_cropar).
orgTonne_path= Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_croppro_linear.csv")
orgArea_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\org_cropar_linear.csv")
# NOTE(review): the three "fert*" names suggest fertiliser, but the folder and
# file names they point to refer to pesticides - confirm the intended naming.
fertRisk_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Pesiticide Use Risk Indicator\aei_hri_linear.csv")
# fertUse_path is not referenced in the visible part of the notebook.
fertUse_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\aei_pestuse_linear.csv")
fertUseGroup3_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Use of more hazardous pesticides\sdg_02_52_linear.csv")
# Organic area (sdg_02_40) and organic processors (org_cpreact).
orgAreaUtil_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Area\sdg_02_40_linear.csv")
orgProcessors_path = Path(r"C:\Users\cianw\Documents\dataAnalytics\CA2\Data\Eurostat\Agricultural Production\Crops\Organic Processors\org_cpreact_linear.csv")

In [51]:
# Organic processors: keep only NACE activity code 'C103', drop the
# bookkeeping columns and give the observation column a descriptive name.
# NOTE(review): the value column gets the same name ("areUsedForOrganic") as
# the one in orgAreaUtil, so merging the two yields "_x"/"_y" suffixed columns.
# NOTE(review): 'unit' is dropped without being filtered first; if the extract
# holds several units, rows per geo/year are no longer distinguishable.
orgProcessors = (
    pd.read_csv(orgProcessors_path)
    .loc[lambda frame: frame['nace_r2'].isin(['C103'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "areUsedForOrganic"})
)

In [63]:
# Organic area extract: load it, then strip every column except the
# geo / TIME_PERIOD keys and the observation value.
orgAreaUtil = pd.read_csv(orgAreaUtil_path)
orgAreaUtil.head()  # preview; not the last expression of the cell, so nothing is displayed
orgAreaUtil = (
    orgAreaUtil
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'crops', 'freq', 'agprdmet', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "areUsedForOrganic"})
)

In [53]:
# Risk indicator extract: keep only the 'HRI1' substance category and expose
# the observation as "harmRiskInd".
# NOTE(review): the variable says "Fertiliser" but the source folder is named
# after pesticides - confirm the naming.
hriFertiliser = (
    pd.read_csv(fertRisk_path)
    .loc[lambda frame: frame['subst_cat'].isin(['HRI1'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "harmRiskInd"})
)

In [54]:
# sdg_02_52 extract: drop the bookkeeping columns (including 'subst_cat') and
# expose the observation as "grp3FertUse".
fertUseGroup3 = (
    pd.read_csv(fertUseGroup3_path)
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'subst_cat', 'unit', 'OBS_FLAG'])
    .rename(columns={"OBS_VALUE": "grp3FertUse"})
)

In [57]:
import functools as ft


def _merge_on_geo_year(left, right):
    """Join two frames on their shared 'geo' and 'TIME_PERIOD' columns."""
    return pd.merge(left, right, on=['geo', 'TIME_PERIOD'])


# Fold the four auxiliary tables into one frame, joining left to right.
# NOTE(review): orgProcessors and orgAreaUtil both carry a column called
# "areUsedForOrganic", which is why the result shows "_x"/"_y" suffixes.
extraVars = [orgProcessors, orgAreaUtil, hriFertiliser, fertUseGroup3]
extraVars_df = ft.reduce(_merge_on_geo_year, extraVars)
extraVars_df.head()

Unnamed: 0,nace_r2,geo,TIME_PERIOD,areUsedForOrganic_x,areUsedForOrganic_y,subst_cat,harmRiskInd,grp3FertUse
0,C103,BG,2018,2015877.0,2.56,HRI1,83,168
1,C103,BG,2018,85.0,2.56,HRI1,83,168
2,C103,BG,2018,6162.0,2.56,HRI1,83,168
3,C103,BG,2019,8205201.0,2.34,HRI1,98,129
4,C103,BG,2019,76.0,2.34,HRI1,98,129


In [None]:
# Standard crop production import.
stdProduction_lin = pd.read_csv(stdProduction_path)
# Drop observations flagged 'c' or 'n' before splitting by structure code.
# (Presumably Eurostat's confidential / not-significant flags - confirm.)
stdProduction_lin = stdProduction_lin[~stdProduction_lin['OBS_FLAG'].isin(['c', 'n'])]
# Take the yield rows BEFORE narrowing to area/production. Selecting
# 'YI_HU_EU' from a frame already restricted to 'AR'/'PR_HU_EU' can only
# ever return an empty frame.
stdProduction_lin_yield = stdProduction_lin[stdProduction_lin['strucpro'].isin(['YI_HU_EU'])]
stdProduction_lin = stdProduction_lin[stdProduction_lin['strucpro'].isin(['AR', 'PR_HU_EU'])]

# One row per crop / country / year, with area and production side by side.
stdProduction = stdProduction_lin.pivot(
    index=['crops', 'geo', 'TIME_PERIOD'],
    columns='strucpro',
    values='OBS_VALUE',
).reset_index()
# Keep only rows where both area and production were reported.
stdProduction = stdProduction.dropna(subset=['AR', 'PR_HU_EU'])
# Discard zero-area rows.
stdProduction = stdProduction[stdProduction['AR'] != 0]
# Rescale by 1000 into the columns named area_HA and tonnes.
# NOTE(review): assumes AR and PR_HU_EU are reported in thousands
# (1000 ha / 1000 t) - confirm against the dataset metadata.
stdProduction['area_HA'] = stdProduction['AR'] * 1000
stdProduction['tonnes'] = stdProduction['PR_HU_EU'] * 1000

In [None]:
# Inspect the cleaned standard-production table. A notebook cell only echoes
# its last expression, so the missing-value table and the describe() summary
# are printed explicitly; info() writes its own report.
print(missing_values_table(stdProduction))
print(stdProduction.describe())
stdProduction.info()

In [None]:
# The long-format intermediates are no longer needed once stdProduction exists.
del stdProduction_lin, stdProduction_lin_yield


In [None]:
# Organic crop area import: keep rows with agprdmet == 'TOTAL' measured in
# hectares ('HA') and expose the observation as "area_HA".
orgArea_all = pd.read_csv(orgArea_path)
orgArea_total = orgArea_all.loc[
    lambda frame: (frame['agprdmet'] == 'TOTAL') & (frame['unit'] == 'HA')
].rename(columns={"OBS_VALUE": "area_HA"})
orgArea_total.info()
orgArea_total = (
    orgArea_total
    # discard observations flagged 'c' or 'n'
    .loc[lambda frame: ~frame['OBS_FLAG'].isin(['c', 'n'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'agprdmet', 'OBS_FLAG'])
    # rows without an area value are dropped
    .dropna(subset=['area_HA'])
    # zero-area rows are dropped
    .loc[lambda frame: frame['area_HA'] != 0]
)
missing_values_table(orgArea_total)

# The raw extract is no longer needed.
del orgArea_all

# Organic crop production import: expose the observation as "tonnes", report
# the raw structure, then drop flagged rows and bookkeeping columns.
# NOTE(review): 'unit' is dropped without being filtered first - confirm the
# extract holds a single unit.
orgTonne = pd.read_csv(orgTonne_path).rename(columns={"OBS_VALUE": "tonnes"})
orgTonne.info()
orgTonne = (
    orgTonne
    .loc[lambda frame: ~frame['OBS_FLAG'].isin(['c', 'n'])]
    .drop(columns=['DATAFLOW', 'LAST UPDATE', 'freq', 'unit', 'OBS_FLAG'])
)
missing_values_table(orgTonne)

# Pair each organic area record with the production record for the same
# crop, country and year; unmatched records on either side are dropped.
orgProduction = orgArea_total.merge(
    orgTonne,
    how='inner',
    on=['crops', 'geo', 'TIME_PERIOD'],
    suffixes=('_A', '_T'),
)
orgProduction.describe()  # not the last expression of the cell, so nothing is displayed
orgProduction.info()


In [None]:
# Combine standard and organic production for matching crop / country / year.
# The value columns shared by both tables (area_HA, tonnes) receive the
# '_std' / '_org' suffixes. The crop dictionary then adds the crop name.
cropProd = (
    stdProduction
    .merge(
        orgProduction,
        how='inner',
        on=['crops', 'geo', 'TIME_PERIOD'],
        suffixes=('_std', '_org'),
    )
    .merge(cropsCodes, how='inner', on=['crops'])
)
cropProd['geo'] = cropProd['geo'].astype('str')
cropProd.describe()  # not the last expression of the cell, so nothing is displayed
cropProd.info()


# Variable Creation

In [None]:
def _summed_by(frame, keys):
    """Sum every numeric column of ``frame`` within each group of ``keys``."""
    return frame.groupby(keys).sum(numeric_only=True).reset_index()


# Yield in tonnes per hectare for the organic and the standard figures.
cropProd['tonne_per_HA_org'] = cropProd['tonnes_org'] / cropProd['area_HA_org']
cropProd['tonne_per_HA_std'] = cropProd['tonnes_std'] / cropProd['area_HA_std']

# Rows whose crop code contains "0000".
cropProdTotals = cropProd[cropProd['crops'].str.contains("0000")]

# NOTE(review): the sums below include every numeric column, so the
# tonne_per_HA_* ratios are added up as well - confirm that downstream code
# recomputes yields from the summed tonnes and areas.

# Aggregates over all crop rows.
cropProd_Geo_Y = _summed_by(cropProd, ['geo', 'TIME_PERIOD'])
cropProd_Geo = _summed_by(cropProd, ['geo'])
cropProd_crop_Y = _summed_by(cropProd, ['crops', 'crop_name', 'TIME_PERIOD'])
cropProd_crop = _summed_by(cropProd, ['crops', 'crop_name'])

# The same aggregates restricted to the "0000" crop codes.
cropProdTotals_Geo_Y = _summed_by(cropProdTotals, ['geo', 'TIME_PERIOD'])
cropProdTotals_Geo = _summed_by(cropProdTotals, ['geo'])
cropProdTotals_crop_Y = _summed_by(cropProdTotals, ['crops', 'crop_name', 'TIME_PERIOD'])
cropProdTotals_crop = _summed_by(cropProdTotals, ['crops', 'crop_name'])


In [None]:
import ipywidgets as widgets  # the widgets namespace was used below but never imported
from ipywidgets import Layout

# Country codes available in the aggregated totals. The frame is grouped by
# 'geo'; it has no 'PdDistrict' column.
unique_geo = cropProdTotals_Geo_Y['geo'].unique()

# Multi-select country picker. The preselected entries must be members of
# `options`, so default to the first two available codes instead of
# hard-coded labels that are not in this dataset.
country = widgets.SelectMultiple(
    options=unique_geo.tolist(),
    value=unique_geo.tolist()[:2],
    description='Country',
    disabled=False,
    layout=Layout(width='50%', height='80px', display='flex')
)