In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import numpy as np
import nltk
import math
from collections import Counter 
from math import log2
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%run data_intake.ipynb
import os

In [14]:
aquastat_file_names = [fn for fn in os.listdir("data/") if ".xlsx" in fn]

aquastat_crop_yeild = pipeline(aquastat_file_names, "Attainable_yields.csv", "value")

aquastat_crop_yeild

Unnamed: 0,Country,Year,total_renewable_water_resources,agricultural_value_added_percent_of_gdp,flood_occurence,total_exploitable_water_resources,area_salinized_by_irrigation,precipitation_index,gdp_per_capita,dam_capacity,total_population_with_access_to_safe_drinking_water,total_internal_renewable_water_resource_per_capita,arable_land_area,total_population,total_yield
0,Albania,2018,30.200,18.429476,2.7,13.000,0.000000,1136.0,5257.650696,4.030000,95.1,9331.399988,611.346,2882.740,87.14
1,Albania,2017,30.200,19.022127,2.7,13.000,0.000000,1136.0,4514.204908,4.030000,95.1,9326.776621,612.000,2884.169,69.81
2,Albania,2016,30.200,19.849993,2.7,13.000,0.480000,1136.0,4109.340457,4.030000,95.1,9319.444935,620.300,2886.438,68.46
3,Albania,2015,30.200,19.780225,2.7,13.000,0.960000,1136.0,3939.413126,4.030000,95.1,9306.306528,615.100,2890.513,71.04
4,Albania,2014,30.200,19.990153,2.7,13.000,1.440000,1136.0,4567.281443,4.030000,95.1,9287.695875,615.600,2896.305,68.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Tunisia,2017,4.615,9.685052,2.6,3.625,193.294118,326.1,3481.203612,2.691378,97.7,366.906102,2607.000,11433.443,90.76
108,Tunisia,2016,4.615,9.391980,2.6,3.625,186.588235,326.1,3697.880363,2.691378,97.7,371.109378,2564.000,11303.945,85.10
109,Tunisia,2015,4.615,10.282704,2.6,3.625,179.882353,326.1,3861.643875,2.691378,97.7,375.225325,2570.000,11179.949,83.86
110,Tunisia,2014,4.615,9.153284,2.6,3.625,173.176471,326.1,4305.543424,2.691378,97.7,379.185012,2588.000,11063.201,74.09


In [10]:
def total(cnt):
    return sum(cnt.values())

def gini(cnt):
    tot = total(cnt)
    return 1 - sum([(v/tot)**2 for v in cnt.values()])

def entropy(cnt):
    tot = total(cnt)
    return sum([(-v/tot) * log2(v/tot) for v in cnt.values()])
    
def wavg(cnt1, cnt2, measure):
    tot1 = total(cnt1)
    tot2 = total(cnt2)
    tot = tot1 + tot2
    return (measure(cnt1) * tot1 + measure(cnt2) * tot2) / tot

def evaluate_split(df, class_col, split_col, feature_val, measure):
    df1, df2 = df[df[split_col] == feature_val], df[df[split_col] != feature_val]
    cnt1, cnt2 = Counter(df1[class_col]), Counter(df2[class_col])
    return wavg(cnt1, cnt2, measure)


def split(df, split_col, v):
    """
    Subdivides the dataframe based on the choosen split_column and value partition
    """
    left_split = df[df[split_col] <= v]
    right_split = df[df[split_col] > v]
    return left_split, right_split
    


def get_best_split(df, class_col, method, target_impurity):
    """
    Returns information for a split of the dataset based on the choosen method (gini or other),
    including the right and left partitions of the dataframe
    """
    # Holder variables for the information regarding our best split of the df
    best_col = 0
    best_v = ''
    best_meas = float("inf")
    best_split_left = None
    best_split_right = None
    
    # Loop through all the feature columns
    for split_col in df.columns:
        # Make sure it's not the class column
        if split_col != class_col:
            # Find all the unique values in the feature column
            for v in set(df[split_col]):
                # Create a split w/ the values greater than or equal to or less than for left and right splits
                split_left, split_right = split(df, split_col, v)
                # important step, stops infinite recursion on the same split
                if len(split_left)>0 and len(split_right)>0:
                    # Evaluate this split w/ the given metric
                    meas = evaluate_split(df, class_col, split_col, v, method)
                    # If we found a better split (and the impurity is okay), we set it as our best split
                    # and update the holder variables accordingly
                    if meas < best_meas and meas > target_impurity:
                        best_meas = meas
                        best_col = split_col
                        best_v = v
                        best_split_left = split_left
                        best_split_right = split_right

    return best_col, best_v, best_meas, best_split_left, best_split_right
                    


def compute_majority_class(df, class_col):
    """
    Compute the majority class for this node
    """
    # Count the number of each unique class in the class column
    counts = Counter(df[class_col])
    majority_class = None
    # Holder variable to keep track of the number of occurences of the majority class
    max_count = 0
    # Iterate through the counts tuple, find the majority class
    for k,v in counts.items():
        if v > max_count:
            max_count = v
            majority_class = k
    return majority_class
        

In [21]:
def dtree(train, criterion, class_col, max_depth=None, current_depth=0, min_instances=2, target_impurity=0.0):
    """
    Constructs binary decision tree based on the input parameters
    """
    # If the input dataset is empty at this point in recursion, we have hit a return leaf
    if train is None or len(train) == 0:
        return None
    # If the input dataset is smaller than our min number of instances or the max depth has been exceeded, return leaf
    elif len(train) < min_instances or current_depth == max_depth:
        return None
    
    # Generate the best split for this input dataset
    best_col, best_v, best_meas, left_subtree, right_subtree = get_best_split(
        train, class_col, criterion, target_impurity)
    
    # The split with the best measusure must have had an empty side, so we have to terminate to a leaf  
    # Or no best measure was found (stops infinite recursion)
    if best_meas == float("inf"):
        return None

    # Recursive formulation of building the decision tree
    return (best_col, best_v, len(train), compute_majority_class(train, "total_yield"), best_meas, current_depth,
            dtree(left_subtree, criterion, max_depth, current_depth + 1, min_instances, target_impurity), 
            dtree(right_subtree, criterion, max_depth, current_depth + 1, min_instances, target_impurity))


In [22]:
test_tree = dtree(aquastat_crop_yeild[0:200], gini, "total_yield", max_depth=None, target_impurity=0.0)
test_tree

('Country',
 'Libya',
 112,
 93.25,
 0.9789982030548068,
 0,
 ('total_renewable_water_resources',
  0.7,
  54,
  26.01,
  0.95679012345679,
  1,
  ('Year',
   2016,
   6,
   26.01,
   0.6,
   2,
   ('Year',
    2013,
    4,
    26.05,
    0.5,
    3,
    None,
    ('Year', 2014, 3, 26.05, 0.3333333333333333, 4, None, None)),
   None),
  ('Country',
   'Kenya',
   48,
   87.13999999999999,
   0.9583333333333334,
   2,
   ('Country',
    'Australia',
    42,
    87.13999999999999,
    0.9523809523809523,
    3,
    ('Country',
     'Albania',
     12,
     87.13999999999999,
     0.8333333333333334,
     4,
     ('Year',
      2016,
      6,
      87.13999999999999,
      0.6666666666666666,
      5,
      ('Year',
       2013,
       4,
       68.46000000000001,
       0.5,
       6,
       None,
       ('Year',
        2014,
        3,
        68.46000000000001,
        0.3333333333333333,
        7,
        None,
        None)),
      None),
     ('Year',
      2016,
      6,
      14