In [16]:
import json
import os
import sys
from itertools import chain, combinations

sys.path.append('..')

from utils import read_csv_non_utf

In [52]:
# Powerset function, ignoring the case of empty subset 
#  - https://docs.python.org/3/library/itertools.html#itertools-recipes  
def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, len(s) + 1))

In [5]:
# Loading in general configuration
with open('../config.json', 'r') as f:
    config = json.load(f)

# Getting filepaths
gdrive_fp = config['gdrive_path']
LIFE_fp = config['LIFE_folder']
dataset_fp = config['datasets_path']
benitez_lopez2019 = config['indiv_data_paths']['benitez_lopez2019_recreated']

#  reading the dataset
ben_lop_path = os.path.join(gdrive_fp, LIFE_fp, dataset_fp, benitez_lopez2019)
data = read_csv_non_utf(ben_lop_path)

data.head()

Unnamed: 0,Reference,Study,Order,Family,Species,Species_List,Longitude,Latitude,Response_Ratio,Region,...,Road_Density,Percent_Settlement_50km,BM,DistKm,Reserve,TravTime,LivestockBio,Stunting,PopDens,Literacy
0,"Laurance et al., 2006",1,Cetartiodactyla,Bovidae,"Cephalophus callipygus, C. dorsalis, C. leucog...",Cephalophus callipygus|Cephalophus dorsalis|Ce...,9.839,-1.916,0.377193,Africa,...,129,0.002549,17.07,0.05,No,755.8,39.25948,22.0,0.86,81.8
1,"Laurance et al., 2006",1,Proboscidea,Elephantidae,Loxodonta africana,Loxodonta africana,9.839,-1.916,0.86569,Africa,...,129,0.002549,3940.03,0.05,No,755.8,39.25948,22.0,0.86,81.8
2,"Laurance et al., 2006",1,Cetartiodactyla,Bovidae,"Cephalophus callipygus, C. dorsalis, C. leucog...",Cephalophus callipygus|Cephalophus dorsalis|Ce...,9.839,-1.916,0.833333,Africa,...,129,0.002549,17.07,0.3,No,755.8,39.25948,22.0,0.86,81.8
3,"Laurance et al., 2006",1,Proboscidea,Elephantidae,Loxodonta africana,Loxodonta africana,9.839,-1.916,0.900862,Africa,...,129,0.002549,3940.03,0.3,No,755.8,39.25948,22.0,0.86,81.8
4,"Laurance et al., 2006",1,Cetartiodactyla,Bovidae,"Cephalophus callipygus, C. dorsalis, C. leucog...",Cephalophus callipygus|Cephalophus dorsalis|Ce...,9.839,-1.916,0.95614,Africa,...,129,0.002549,17.07,0.6,No,755.8,39.25948,22.0,0.86,81.8


In [21]:
# Defining the number of models to compare for model selection
predictors = ['Body_Mass', 'Stunting_Pct', 'Literacy_Rate', 'Dist_Settlement_KM', 'Travel_Time_Large',
              'Protected_Area', 'Livestock_Biomass', 'Population_Density', 'Percent_Settlement_50km']
interactions = ['Body_Mass*Dist_Settlement_KM', 'Body_Mass*Percent_Settlement_50km']
quadratics = ['I(Dist_Settlement_KM^2)', 'I(Population_Density^2)', 'I(Body_Mass^2)', 
              'I(Stunting_Pct^2)']

all_predictors = predictors + interactions + quadratics
all_combos = list(powerset(all_predictors))

In [36]:
# Thinning out possible combinations w/some reasonable rules:
#  1. Only include a quadratic if the original var is included
#  2. Only include interaction terms if both vars are included
#  3. Models must include body mass, population density, and at least one of 
#     the distance to settlement measures
final_combos = []

for c in all_combos:
    include = True
    
    #  rule 1
    for q in quadratics:
        if q in c:
            original_var = q.removeprefix('I(').removesuffix('^2)')
            
            if original_var not in c:
                include = False

    #  rule 2
    for i in interactions:
        if i in c:
            var1 = i.split('*')[0]
            var2 = i.split('*')[1]

            if (var1 not in c) or (var2 not in c):
                include = False

    #  rule 3
    if 'Body_Mass' not in c:
        include = False
    elif 'Population_Density' not in c:
        include = False
    elif ('Dist_Settlement_KM' not in c) or ('Percent_Settlement_50km' not in c):
        include = False

    #  including only if it satisfies all three rules
    if include:
        final_combos.append(c)

In [42]:
# Checking out the resulting set of models to test
print(f'After thinning out based on the ruleset, there are {len(final_combos)} models to try')
print(f'All models are unique: {len(set(final_combos)) == len(final_combos)}')

After thinning out based on the ruleset, there are 1536 models to try
All models are unique: True


In [48]:
# Constructing the formula based on the var subsets to try
continuous_model_formula = []
binary_model_formula = []

for c in final_combos:
    formula = ' + '.join(c)
    continuous_model_formula.append('RR ~ ' + formula)
    binary_model_formula.append('local_extirpation ~ ' + formula)