In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import numpy as np
import nltk
import math

In [81]:
# Initial question: can you predict crop yield 
# by year based on the total exploitable water resources a country has available?


def convert_to_pandas(filename):
    """
    Load a dataset stored in the data/ dir into a pandas df

    The reader is picked from the file's extension(s), compared case-insensitively:
    ".xlsx" is read with pd.read_excel and ".csv" with pd.read_csv. The extension
    may be followed by a further suffix (e.g. "table.csv.gz" is still handed to
    pd.read_csv).

    filename - name of file in data/{filename}: string

    Returns the loaded DataFrame.
    Raises ValueError when the filename carries neither an xlsx nor a csv extension.
    """
    from pathlib import PurePath  # local import so this cell does not rely on another one

    # Look at real extensions instead of substrings, so a name such as
    # "xlsx_export.csv" is not mistaken for an Excel workbook.
    suffixes = [suffix.lower() for suffix in PurePath(filename).suffixes]
    path = f"data/{filename}"

    if ".xlsx" in suffixes:
        return pd.read_excel(path)
    if ".csv" in suffixes:
        return pd.read_csv(path)

    # Fail loudly here rather than handing None to the rest of the pipeline.
    raise ValueError(
        f"Unsupported file type for {filename!r}: expected a .xlsx or .csv file"
    )
    
def crop_yield_specialized_preprocessing(df):
    """
    Performs crop yield specialized preprocessing
    Our class column dataset requires special preprocessing to convert the attainable yields and yield gaps
    into crop yield (crop yield = attainable yield - yield gap)

    For every column whose name contains "attainable", the crop name is taken as the text
    before the first underscore, and the gap column is the first column containing "gap"
    that has that crop as one of its underscore-separated tokens. The result keeps only
    "Entity", "Year" and the new "<crop>_crop_yield" columns, with "Entity" renamed
    to "Country".

    df - crop yield input dataframe (left unmodified): DataFrame

    Returns a new DataFrame.
    Raises ValueError when a crop has an attainable column but no matching gap column.
    """
    attainable_cols = [col for col in df.columns if "attainable" in col]
    gap_cols = [col for col in df.columns if "gap" in col]

    # Work on a copy so the caller's frame does not silently gain columns.
    result = df.copy()
    new_col_names = []

    for attainable_col in attainable_cols:
        crop = attainable_col.split("_")[0]
        new_col_name = f"{crop}_crop_yield"
        if new_col_name in new_col_names:
            # This crop was already handled through an earlier attainable column.
            continue

        # Match on whole tokens: a plain substring test would pair "rye" with
        # a column such as "ryegrass_yield_gap".
        matching_gap_cols = [col for col in gap_cols if crop in col.split("_")]
        if not matching_gap_cols:
            raise ValueError(
                f"No yield gap column found for crop {crop!r} "
                f"(attainable column {attainable_col!r})"
            )

        result[new_col_name] = result[attainable_col] - result[matching_gap_cols[0]]
        new_col_names.append(new_col_name)

    result = strip_df(result, ["Entity", "Year"] + new_col_names)
    return result.rename(columns={"Entity": "Country"})
    
    
def strip_df(df, keep_features_list):
    """
    Strips a dataframe of anything except what's included in the keep_features_list

    Columns are selected in a single pass, keep their original order, and the input
    frame is left untouched. Names in keep_features_list that are not columns of df
    are simply ignored.

    df - input df: DataFrame
    keep_features_list - list of features you want to keep as column name : list[str]

    Returns a new DataFrame holding only the kept columns.
    """
    # One boolean mask instead of dropping column by column: avoids copying the
    # frame once per dropped column and copes with duplicated column labels,
    # where a second drop of the same label would raise KeyError.
    keep_mask = df.columns.isin(keep_features_list)
    return df.loc[:, keep_mask].copy()
    
def combine_feature_dfs(df1, df2, feature_merge_list):
    """
    Joins two dataframes on the columns named in feature_merge_list

    Uses pandas' default merge behaviour, so only rows whose key values appear
    in both frames end up in the result.

    df1 - left input df: DataFrame
    df2 - right input df: DataFrame
    feature_merge_list - column names to join on (Year and Country usually) : list[str]

    Returns the merged DataFrame.
    """
    merged_df = df1.merge(df2, on=feature_merge_list)
    return merged_df


def pipeline(aquastat_filename, crop_yield_filename, aquastat_feature):
    """
    Runs the aquastat and crop yield pipeline

    Loads both files from data/, converts the crop file into per-crop yield columns,
    reduces the aquastat file to Country, Year and the requested feature column,
    and merges the two on Country and Year.

    aquastat_filename - name of the aquastat file in data/: string
    crop_yield_filename - name of the crop yield file in data/: string
    aquastat_feature - name of the aquastat column to keep next to Country and Year: string

    Returns the merged DataFrame.
    """
    shared_attributes = ["Country", "Year"]

    aqua_df = convert_to_pandas(aquastat_filename)
    crop_df = convert_to_pandas(crop_yield_filename)

    crop_df = crop_yield_specialized_preprocessing(crop_df)
    # Keep the column the caller asked for; this used to be hard-coded to "value",
    # which made the aquastat_feature argument a no-op.
    aqua_df = strip_df(aqua_df, shared_attributes + [aquastat_feature])

    return combine_feature_dfs(aqua_df, crop_df, shared_attributes)


In [82]:
# Join total exploitable water resources (10^9 m3/year)
# with the per-crop yields, matched by country and by year
water_resource_df = pipeline(
    "total_exploitable_water_resource_by_year_by_country.xlsx",
    "Attainable_yields.csv",
    "value",
).rename(columns={"value": "total_exploitable_water_resources"})

water_resource_df

Unnamed: 0,Country,Year,total_exploitable_water_resources,barley_crop_yield,cassava_crop_yield,cotton_crop_yield,groundnut_crop_yield,maize_crop_yield,millet_crop_yield,oilpalm_crop_yield,potato_crop_yield,rapeseed_crop_yield,rice_crop_yield,rye_crop_yield,sorghum_crop_yield,soybean_crop_yield,sugarbeet_crop_yield,sugarcane_crop_yield,sunflower_crop_yield,wheat_crop_yield
0,Albania,2018,13.0,2.84,,1.11,,7.23,,,26.17,,,2.31,,2.55,39.15,,2.09,3.69
1,Albania,2017,13.0,2.95,,1.11,,6.56,,,25.11,,,2.27,,2.20,23.48,,2.09,4.04
2,Albania,2016,13.0,2.90,,1.11,,6.49,,,24.59,,,2.25,,2.52,22.61,,2.09,3.90
3,Albania,2015,13.0,2.80,,1.11,,6.96,,,24.26,,,2.31,,2.50,25.06,,2.09,3.95
4,Albania,2014,13.0,2.70,,1.11,,6.91,,,25.00,,,2.23,,1.69,22.77,,2.09,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260,Zimbabwe,1968,1.5,2.79,3.06,1.16,0.46,1.32,0.53,,12.50,,1.52,,0.41,1.73,,104.85,0.61,3.58
2261,Zimbabwe,1967,1.5,2.79,3.03,1.94,0.70,1.77,0.47,,13.06,,1.40,,0.79,1.67,,105.33,0.65,2.69
2262,Zimbabwe,1966,1.5,2.79,3.03,1.47,0.65,1.14,0.54,,10.95,,1.30,,0.80,1.77,,84.82,0.55,2.01
2263,Zimbabwe,1965,1.5,2.79,3.01,1.18,0.35,1.10,0.65,,13.38,,1.30,,0.69,0.46,,109.85,0.51,2.35


In [83]:
# Join total population (per 1000 inhabitants) with the per-crop yields, matched by country and by year
population_df = pipeline(
    "total_population_by_year_by_country.xlsx",
    "Attainable_yields.csv",
    "value",
).rename(columns={"value": "total_population"})
population_df

Unnamed: 0,Country,Year,total_population,barley_crop_yield,cassava_crop_yield,cotton_crop_yield,groundnut_crop_yield,maize_crop_yield,millet_crop_yield,oilpalm_crop_yield,potato_crop_yield,rapeseed_crop_yield,rice_crop_yield,rye_crop_yield,sorghum_crop_yield,soybean_crop_yield,sugarbeet_crop_yield,sugarcane_crop_yield,sunflower_crop_yield,wheat_crop_yield
0,Afghanistan,2018,37171.921,0.67,,1.00,,1.47,1.98,,19.00,,3.00,,,,27.36,0.00,1.56,2.21
1,Afghanistan,2017,36296.113,1.39,,1.15,,1.30,1.98,,15.98,,3.09,,,,10.46,0.00,1.59,2.03
2,Afghanistan,2016,35383.032,1.38,,1.15,,2.05,1.98,,11.99,,3.00,,,,7.83,0.00,1.54,1.98
3,Afghanistan,2015,34413.603,1.43,,1.20,,2.15,1.98,,13.09,,2.50,,,,8.16,0.00,1.63,2.20
4,Afghanistan,2014,33370.794,1.52,,1.22,,2.49,1.98,,13.61,,2.44,,,,11.56,0.00,1.85,2.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6708,Zimbabwe,1968,4941.906,2.79,3.06,1.16,0.46,1.32,0.53,,12.50,,1.52,,0.41,1.73,,104.85,0.61,3.58
6709,Zimbabwe,1967,4779.827,2.79,3.03,1.94,0.70,1.77,0.47,,13.06,,1.40,,0.79,1.67,,105.33,0.65,2.69
6710,Zimbabwe,1966,4623.351,2.79,3.03,1.47,0.65,1.14,0.54,,10.95,,1.30,,0.80,1.77,,84.82,0.55,2.01
6711,Zimbabwe,1965,4471.177,2.79,3.01,1.18,0.35,1.10,0.65,,13.38,,1.30,,0.69,0.46,,109.85,0.51,2.35
