In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import numpy as np
import nltk
import math
import os

In [123]:
# Initial question: can you predict crop yield 
# by year based on the total exploitable water resources a country has available?


def convert_to_pandas(filename):
    """
    Convert dataset stored in data/ dir with a filename descriptor into a pandas df
    
    filename - name of file in data/{filename}: string
    """
    if "xlsx" in filename:
        return pd.read_excel(f"data/{filename}")
    if "csv" in filename:
        return pd.read_csv(f"data/{filename}")
    
def crop_yield_specialized_preprocessing(df, total_yield_output=True):
    """
    Performs crop yield specialized preprocessing
    Our class column dataset requires special preprocessing to convert the attainable yields and yield gaps
    into crop yield
    
    df - crop yield input dataframe: DataFrame
    """
    attainable_cols = [col for col in df.columns if "attainable" in col]
    gap_cols = [col for col in df.columns if "gap" in col]
    crop_names = [name.split("_")[0] for name in attainable_cols]
    new_col_names = list()
    
    for crop in crop_names:
        attainable_col_name = [col for col in attainable_cols if crop in col][0]
        gap_col_name = [col for col in gap_cols if crop in col][0]

        new_col_name = f"{crop}_crop_yield"
        new_col_names.append(new_col_name)
        df[new_col_name] = df[attainable_col_name] - df[gap_col_name]
    
    df = df.fillna(0)
    df.rename(columns = {'Entity':'Country'}, inplace = True)
    
    if total_yield_output:
        df['total_yield'] = df[new_col_names].sum(axis=1)
        df = strip_df(df, ["Country", "Year", "total_yield"]) 
        return df
            
    df = strip_df(df, ["Country", "Year"] + new_col_names)
    return df


def aquastat_feature_rename(df, fn, aquastat_feature):
    new_column_value = fn.split(".xlsx")[0]
    df.rename(columns = {aquastat_feature: new_column_value}, inplace = True)
    return df
    
    
    
def strip_df(df, keep_features_list):
    """
    Strips a dataframe of anything except whats included in the keep_features_list
    
    df - input df: DataFrame
    keep_features_list - list of features you want to keep as column name : list[str]
    """
    del_columns = [column for column in df.columns if column not in keep_features_list]
    for col in del_columns:
        df = df.drop([col], axis=1)
    return df
    
def combine_feature_dfs(dfs, feature_merge_list):
    """
    Merges two dataframes based on the column name values provided in the feature_merge_list
    
    df1 - list of input dfs: List[DataFrame]
    feature_merge_list - list of features you want to merge by (Year and Country usually) : list[str]
    """
    
    aquastat_df = dfs[0]
    for aqua_df in dfs[1:]:
        aquastat_df = aquastat_df.merge(aqua_df, on=feature_merge_list)
    
    return aquastat_df


def pipeline(aquastat_filenames, crop_yield_filename, aquastat_feature):
    """
    Runs the aquastat and crop yield pipeline
    """
    shared_attributes = ["Country", "Year"]
    crop_df = crop_yield_specialized_preprocessing(convert_to_pandas(crop_yield_filename))
    dfs = []
    for fn in aquastat_filenames:
        temp_df = convert_to_pandas(fn)
        strip_temp_df = strip_df(temp_df, shared_attributes + [aquastat_feature])
        renamed_temp_df = aquastat_feature_rename(strip_temp_df, fn, aquastat_feature)
        dfs.append(strip_temp_df)
    
    dfs.append(crop_df)
    combined_df = combine_feature_dfs(dfs, shared_attributes) 
    
    return combined_df


In [127]:
# Combining total exploitable water resources (10^9 m3/year) 
# with the various crop yeilds by country and by year

aquastat_file_names = [fn for fn in os.listdir("data/") if ".xlsx" in fn]

aquastat_crop_yeild = pipeline(aquastat_file_names, "Attainable_yields.csv", "value")

aquastat_crop_yeild

Unnamed: 0,Country,Year,total_renewable_water_resources,agricultural_value_added_percent_of_gdp,flood_occurence,total_exploitable_water_resources,area_salinized_by_irrigation,precipitation_index,gdp_per_capita,dam_capacity,total_population_with_access_to_safe_drinking_water,total_internal_renewable_water_resource_per_capita,arable_land_area,total_population,total_yield
0,Albania,2018,30.200,18.429476,2.7,13.000,0.000000,1136.0,5257.650696,4.030000,95.1,9331.399988,611.346,2882.740,87.14
1,Albania,2017,30.200,19.022127,2.7,13.000,0.000000,1136.0,4514.204908,4.030000,95.1,9326.776621,612.000,2884.169,69.81
2,Albania,2016,30.200,19.849993,2.7,13.000,0.480000,1136.0,4109.340457,4.030000,95.1,9319.444935,620.300,2886.438,68.46
3,Albania,2015,30.200,19.780225,2.7,13.000,0.960000,1136.0,3939.413126,4.030000,95.1,9306.306528,615.100,2890.513,71.04
4,Albania,2014,30.200,19.990153,2.7,13.000,1.440000,1136.0,4567.281443,4.030000,95.1,9287.695875,615.600,2896.305,68.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Tunisia,2017,4.615,9.685052,2.6,3.625,193.294118,326.1,3481.203612,2.691378,97.7,366.906102,2607.000,11433.443,90.76
108,Tunisia,2016,4.615,9.391980,2.6,3.625,186.588235,326.1,3697.880363,2.691378,97.7,371.109378,2564.000,11303.945,85.10
109,Tunisia,2015,4.615,10.282704,2.6,3.625,179.882353,326.1,3861.643875,2.691378,97.7,375.225325,2570.000,11179.949,83.86
110,Tunisia,2014,4.615,9.153284,2.6,3.625,173.176471,326.1,4305.543424,2.691378,97.7,379.185012,2588.000,11063.201,74.09
