# Setup

In [28]:
import json
import os
import sys

sys.path.append('..')

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import LabelEncoder

from utils import read_csv_non_utf, preprocess_data

In [2]:
# Loading in general configuration
# (the path is relative to the current working directory, one level up)
with open('../config.json', 'r') as f:
    config = json.load(f)

# Getting filepaths: each config entry is one component of the dataset path
gdrive_fp = config['gdrive_path']
LIFE_fp = config['LIFE_folder']
dataset_fp = config['datasets_path']
benitez_lopez2019 = config['indiv_data_paths']['benitez_lopez2019']

# Joining the components, in order, into the full path of the benitez_lopez2019 dataset
data_path = os.path.join(gdrive_fp, LIFE_fp, dataset_fp, benitez_lopez2019)

In [3]:
# Reading in data as a pandas dataframe
# NOTE(review): read_csv_non_utf is a project helper imported from utils; judging by its
# name it presumably copes with a non-UTF-8 file encoding - confirm in utils.
ben_lop2019 = read_csv_non_utf(data_path)

# Designing the two-stage model

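In this model, the first stage classifies each observation into one of four categories based on its abundance ratio: local extirpation (ratio=0), abundance decline (0<ratio<1), no abundance effect (ratio=1), and abundance increase (ratio>1). The second stage then uses two separate regressors, one for the decline category and one for the increase category, to model the ratio itself.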

Bits that still need to be implemented:
1. A function that gets the datasets for each of the three models,
2. Independently fit each model on their respective datasets,
3. Implement prediction behavior for the joint model,
4. Probability predictions just come from the classifier component (probability of extirpation, decrease, no change, or increase).

In [7]:
def four_DI_cats(ratios, neighborhood = 0):
    """Label each abundance ratio with one of four categories.

    Categories, with n = neighborhood:
        'extirpated' : ratio <= n
        'decrease'   : n < ratio < 1 - n
        'no change'  : 1 - n <= ratio <= 1 + n
        'increase'   : ratio > 1 + n

    Parameters
    ----------
    ratios : array-like of numbers
        Ratios to categorize (list, numpy array, or pandas Series).
    neighborhood : float, default 0
        Half-width of the tolerance band around 0 and around 1.

    Returns
    -------
    numpy.ndarray of str
        Array with the same shape as `ratios` holding the category labels.

    Raises
    ------
    ValueError
        If `ratios` contains NaN, which belongs to none of the categories.
    """
    # Coercing up front so that plain lists support the elementwise comparisons below
    values = np.asarray(ratios, dtype = float)

    # NaN compares False against every threshold, so it would otherwise be
    # left with whatever placeholder the output array was created with
    if np.isnan(values).any():
        raise ValueError('ratios contains NaN values, which cannot be assigned a category')

    labels = np.empty(values.shape, dtype = '<U32')

    # Assignment order matters when the bands overlap (neighborhood >= 0.5):
    # later assignments overwrite earlier ones
    labels[values <= 0 + neighborhood] = 'extirpated'
    labels[(values > 0 + neighborhood) & (values < 1 - neighborhood)] = 'decrease'
    labels[(values >= 1 - neighborhood) & (values <= 1 + neighborhood)] = 'no change'
    labels[values > 1 + neighborhood] = 'increase'

    return labels

In [62]:
class TwoStageNovelModel(RegressorMixin, BaseEstimator):
    """Two-stage model: one classifier over the DI categories plus one regressor
    each for the 'decrease' and 'increase' categories.

    Parameters
    ----------
    classifier : estimator exposing fit(X, y, **kwargs)
        Fitted on the 'classifier' dataset returned by get_three_datasets.
    regressor_decrease : estimator exposing fit(X, y, **kwargs)
        Fitted on the 'regressor_decrease' dataset.
    regressor_increase : estimator exposing fit(X, y, **kwargs)
        Fitted on the 'regressor_increase' dataset.
    DI_cats_mapping : dict or None
        Category-name -> code mapping, passed through to get_three_datasets.
    """

    # Component attribute names; these are also the keys of the dictionaries
    # returned by get_three_datasets
    _COMPONENT_NAMES = ('classifier', 'regressor_decrease', 'regressor_increase')

    def __init__(self, classifier, regressor_decrease, regressor_increase, DI_cats_mapping):
        self.classifier = classifier
        self.regressor_decrease = regressor_decrease
        self.regressor_increase = regressor_increase

        self.DI_cats_mapping = DI_cats_mapping

    def _split_fit_args(self, fit_args):
        """Resolve `fit_args` into one keyword dictionary per component."""
        if fit_args is None:
            return {name : {} for name in self._COMPONENT_NAMES}

        if not any(name in fit_args for name in self._COMPONENT_NAMES):
            # Flat dictionary of keywords: forwarded unchanged to every component
            return {name : fit_args for name in self._COMPONENT_NAMES}

        unknown = [key for key in fit_args if key not in self._COMPONENT_NAMES]
        if unknown:
            raise ValueError(f'Unknown fit_args keys {unknown}; expected a subset of '
                             f'{list(self._COMPONENT_NAMES)}')

        # Components without an entry (or with None) get no extra keywords
        return {name : fit_args.get(name) or {} for name in self._COMPONENT_NAMES}

    def fit(self, pp_data, fit_args = None):
        """Fit the classifier and both regressors on their respective datasets.

        Parameters
        ----------
        pp_data : pandas.DataFrame
            Data handed to get_three_datasets with pred = False.
        fit_args : dict or None, default None
            Extra keyword arguments for the components' fit methods. Either
            keyed per component ('classifier', 'regressor_decrease',
            'regressor_increase'), each value being a keyword dictionary, or a
            flat keyword dictionary that is forwarded to all three components.
            None passes no extra keywords.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `fit_args` mixes component keys with other keys.
        """
        kwargs_by_component = self._split_fit_args(fit_args)

        X_datasets, y_datasets = get_three_datasets(pp_data, pred = False, DI_cats_mapping = self.DI_cats_mapping)

        # Same order as the component tuple: classifier first, then the two regressors
        for name in self._COMPONENT_NAMES:
            getattr(self, name).fit(X_datasets[name], y_datasets[name], **kwargs_by_component[name])

        return self

    def predict(self):
        """Joint prediction; not implemented yet."""
        raise NotImplementedError('TwoStageNovelModel.predict is not implemented yet')

    def predict_proba(self):
        """Category probability prediction; not implemented yet."""
        raise NotImplementedError('TwoStageNovelModel.predict_proba is not implemented yet')

In [57]:
def get_three_datasets(pp_data, pred = True, columns = None, DI_cats_mapping = None):
    """Extract the feature matrix, and optionally the per-model training sets.

    Parameters
    ----------
    pp_data : pandas.DataFrame
        Must contain `columns`; when `pred` is False it must also contain the
        'ratio' and 'DI_category' columns.
    pred : bool, default True
        If True, only the feature frame is returned.
    columns : list of str or None
        Feature columns to use; None selects the seven default predictors.
    DI_cats_mapping : dict or None
        Category-name -> code mapping used to pick the 'decrease' and
        'increase' rows; None selects the default 0-3 coding.

    Returns
    -------
    pandas.DataFrame
        A deep copy of the feature columns, when `pred` is True.
    (dict, dict)
        When `pred` is False: feature sets and target sets, both keyed by
        'classifier', 'regressor_decrease' and 'regressor_increase'.
    """
    feature_cols = columns
    if feature_cols is None:
        feature_cols = ['DistKm', 'Reserve', 'TravTime', 'LivestockBio', 'Stunting', 'PopDens', 'Literacy']

    cat_codes = DI_cats_mapping
    if cat_codes is None:
        cat_codes = {'extirpated' : 0, 'decrease' : 1, 'no change' : 2, 'increase' : 3}

    features = pp_data[feature_cols].copy(deep = True)

    # Prediction mode needs no targets
    if pred:
        return features

    targets = pp_data['ratio'].values
    categories = pp_data['DI_category'].values

    # The classifier sees every row; its target is the category code
    X_datasets = {'classifier' : features.copy(deep = True)}
    y_datasets = {'classifier' : categories}

    # Each regressor only sees the rows that belong to its own category
    for direction in ('decrease', 'increase'):
        in_category = categories == cat_codes[direction]
        X_datasets['regressor_' + direction] = features[in_category].copy(deep = True)
        y_datasets['regressor_' + direction] = targets[in_category]

    return X_datasets, y_datasets

In [58]:
# Adding the intuitive 4-class target to the dataset (w/ neighborhood to account for noise):
# with neighborhood = 0.05, ratios at or below 0.05 are labelled 'extirpated' and ratios
# in [0.95, 1.05] are labelled 'no change'
pp_data = preprocess_data(ben_lop2019, standardize = True)
pp_data['DI_category'] = four_DI_cats(pp_data['ratio'], neighborhood = 0.05)

# Replacing the string labels with their integer codes (overwrites the column in place)
DI_cats_mapping = {'extirpated' : 0, 'decrease' : 1, 'no change' : 2, 'increase' : 3}
pp_data['DI_category'] = pp_data['DI_category'].apply(lambda c: DI_cats_mapping[c])

pp_data.head()

Unnamed: 0,BM,DistKm,PopDens,Stunting,TravTime,LivestockBio,Literacy,Reserve,ratio,DI_category
0,-0.259162,-0.901851,-0.342637,-0.374973,-0.112018,-0.354683,-0.066277,0,0.377193,1
1,4.405221,-0.901851,-0.342637,-0.374973,-0.112018,-0.354683,-0.066277,0,0.86569,1
2,-0.259162,-0.877769,-0.342637,-0.374973,-0.112018,-0.354683,-0.066277,0,0.833333,1
3,4.405221,-0.877769,-0.342637,-0.374973,-0.112018,-0.354683,-0.066277,0,0.900862,1
4,-0.259162,-0.848869,-0.342637,-0.374973,-0.112018,-0.354683,-0.066277,0,0.95614,2


In [59]:
# Building the training sets: feature/target dictionaries keyed by 'classifier',
# 'regressor_decrease' and 'regressor_increase'
X_datasets, y_datasets = get_three_datasets(pp_data, pred = False, DI_cats_mapping = DI_cats_mapping)