In [1]:
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import *
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

sys.path.append('..')
import model_utils as util

%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 8

In [2]:
# Input directories, given relative to the notebook's working directory.
DATA_PATH = '../../../data/cluster/year/'
HISTORIC_DATA_PATH = '../../../data/cluster/historic/'
# Probability cut-off used further down to turn predicted probabilities into
# 0/1 predictions.
# NOTE(review): the value is hard-coded and its derivation is not shown in
# this notebook -- presumably tuned elsewhere; confirm and record the source.
OPTIMAL_THRESHOLD = 0.60521042084168331

In [3]:
# util.load_data returns three (predictors, response) pairs, in the order
# train / validation / test; unpack them into individual names.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = util.load_data(
    DATA_PATH)

X_train: (529623, 24)
X_valid: (176541, 24)
X_test: (176541, 24)
y_train: (529623, 1)
y_valid: (176541, 1)
y_test: (176541, 1)


In [4]:
# Pool the train/validation/test splits back into a single data set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.  The original row labels are kept (ignore_index is
# left at its default of False), exactly as .append did, because later cells
# align X, y and `full` on the index.
X = pd.concat([X_train, X_valid, X_test])
y = pd.concat([y_train, y_valid, y_test])

# The per-split frames are no longer needed; drop the references so their
# memory can be reclaimed.
del X_train, X_valid, X_test
del y_train, y_valid, y_test

In [5]:
def make_squared(dataframe, fields):
    """Return a copy of ``dataframe`` with a squared column per field.

    For every name in ``fields`` a column called ``'<name>_sq'`` is added
    (or overwritten if it already exists) holding the element-wise square
    of column ``<name>``.  The input frame is left untouched.
    """
    result = dataframe.copy()
    for name in fields:
        squared_name = '%s_sq' % name
        result[squared_name] = result[name].pow(2)
    return result

In [6]:
def make_interactions(dataframe, interactions):
    """Return a copy of ``dataframe`` with one product column per interaction.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data; it is copied, never modified.
    interactions : iterable of str
        Interaction terms written as column names joined by ``':'``, e.g.
        ``'a:b'`` or ``'a:b:c'``.  Each term becomes a new column, named
        exactly like the term, holding the element-wise product of all the
        listed columns.  Repeating a term simply recomputes the same column.

    Returns
    -------
    pandas.DataFrame
        The copy with the interaction columns appended.

    Raises
    ------
    ValueError
        If a term names fewer than two main effects (no ``':'`` in it).
    KeyError
        If a main effect is not a column of ``dataframe``.
    """
    df = dataframe.copy()
    for interaction in interactions:
        main_effects = interaction.split(':')
        if len(main_effects) < 2:
            raise ValueError(
                "interaction %r must name at least two columns separated "
                "by ':'" % (interaction,))
        # Multiply every main effect, not just the first two, so that
        # higher-order terms are not silently truncated.
        product = df[main_effects[0]]
        for effect in main_effects[1:]:
            product = product * df[effect]
        df[interaction] = product
    return df

In [7]:
# Predictors for which a squared term is added.
sq_fields = [
    'meanTemp_Annual', 'meanTemp_AprAug', 'meanTemp_Aug',
    'meanMinTemp_DecFeb', 'meanMinTemp_Oct', 'meanMinTemp_Jan',
    'meanMinTemp_Mar', 'meanMaxTemp_Aug', 'precip_meanAnnual',
    'precip_JunAug', 'precipPrevious_JunAug', 'precip_OctSep',
    'precipPrevious_OctSep', 'precip_growingSeason',
    'elev_etopo1', 'lat', 'lon']

# Pairwise interaction terms, written as 'columnA:columnB'.
# Entries that were listed twice verbatim ('precip_OctSep:elev_etopo1',
# 'precip_OctSep:precip_growingSeason', 'precip_OctSep:precipPrevious_OctSep')
# now appear once; a repeated term only recomputed the same column, so the
# resulting columns and their order are unchanged.
#
# NOTE(review): several remaining terms are the same product written in the
# opposite order and therefore produce identical columns:
#   'meanMinTemp_Oct:precip_OctSep'   / 'precip_OctSep:meanMinTemp_Oct'
#   'precip_meanAnnual:precip_OctSep' / 'precip_OctSep:precip_meanAnnual'
#   'precip_OctSep:meanMaxTemp_Aug'   / 'meanMaxTemp_Aug:precip_OctSep'
#   'precip_OctSep:meanTemp_Aug'      / 'meanTemp_Aug:precip_OctSep'
# and 'precip_OctSep:precip_OctSep' equals the 'precip_OctSep_sq' column.
# They are kept because removing them changes the fitted coefficients and
# predictions; decide deliberately whether they should stay.
interactions = [
    'meanMinTemp_Oct:precip_OctSep', 'precip_meanAnnual:precip_OctSep',
    'precip_OctSep:precipPrevious_OctSep', 'meanTemp_Aug:meanMinTemp_Oct',
    'precip_OctSep:lon', 'precip_OctSep:precip_growingSeason',
    'precip_OctSep:meanMaxTemp_Aug', 'meanMinTemp_Oct:precip_meanAnnual',
    'precip_OctSep:meanTemp_Aug', 'precip_OctSep:meanMinTemp_Oct',
    'precip_OctSep:elev_etopo1', 'precip_OctSep:lat',
    'precip_OctSep:precip_meanAnnual', 'precip_OctSep:precip_OctSep',
    'meanMaxTemp_Aug:precip_OctSep', 'meanTemp_AprAug:precip_OctSep',
    'precip_OctSep:varPrecip_growingSeason', 'meanTemp_Aug:precip_OctSep']

In [8]:
# Add the squared terms first, then the interaction products, to the
# predictor frame.
X = make_interactions(make_squared(X, sq_fields), interactions)

In [9]:
# Keep an untrimmed copy of the predictors together with the response; the
# response column is aligned to X on the row index.
full = X.assign(beetle=y['beetle'])

In [10]:
# Remove columns that should not enter the model:
#   - 'studyArea': all 1 in the reduced data
#   - 'x', 'y': perfectly correlated with 'lon' and 'lat'
#   - 'elev_srtm30', 'year', 'varPrecip_growingSeason' and the
#     'precip_OctSep:varPrecip_growingSeason' interaction
#     NOTE(review): the reason for dropping these four is not recorded in
#     this notebook -- confirm and document it.
X = X.drop(
    labels=[
        'studyArea',
        'x',
        'y',
        'elev_srtm30',
        'year',
        'varPrecip_growingSeason',
        'precip_OctSep:varPrecip_growingSeason',
    ],
    axis='columns')

In [11]:
# Remember the column names now: the scaling step below replaces X with a
# bare array that no longer carries them.
predictors = X.columns.tolist()
print(predictors)

['meanTemp_Annual', 'meanTemp_AprAug', 'meanTemp_Aug', 'meanMinTemp_DecFeb', 'meanMinTemp_Oct', 'meanMinTemp_Jan', 'meanMinTemp_Mar', 'meanMaxTemp_Aug', 'precip_meanAnnual', 'precip_JunAug', 'vegetation', 'precipPrevious_JunAug', 'precip_OctSep', 'precipPrevious_OctSep', 'precip_growingSeason', 'elev_etopo1', 'lat', 'lon', 'meanTemp_Annual_sq', 'meanTemp_AprAug_sq', 'meanTemp_Aug_sq', 'meanMinTemp_DecFeb_sq', 'meanMinTemp_Oct_sq', 'meanMinTemp_Jan_sq', 'meanMinTemp_Mar_sq', 'meanMaxTemp_Aug_sq', 'precip_meanAnnual_sq', 'precip_JunAug_sq', 'precipPrevious_JunAug_sq', 'precip_OctSep_sq', 'precipPrevious_OctSep_sq', 'precip_growingSeason_sq', 'elev_etopo1_sq', 'lat_sq', 'lon_sq', 'meanMinTemp_Oct:precip_OctSep', 'precip_meanAnnual:precip_OctSep', 'precip_OctSep:precipPrevious_OctSep', 'meanTemp_Aug:meanMinTemp_Oct', 'precip_OctSep:lon', 'precip_OctSep:precip_growingSeason', 'precip_OctSep:meanMaxTemp_Aug', 'meanMinTemp_Oct:precip_meanAnnual', 'precip_OctSep:meanTemp_Aug', 'precip_OctSep:m

In [12]:
# Standardize every predictor so the solver works on comparably scaled
# inputs.  The scaler is fitted on the full pooled data set, and from here
# on X is the array returned by the scaler rather than a DataFrame.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [13]:
# Flatten the single response column into a 1-D array, the shape expected
# for the target when fitting the classifier.
y = y['beetle'].values.ravel()

# Baseline Logistic Regression 
#### With L1 (_Lasso Regression_) or L2 (_Ridge Regression_) Regularization
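This model will be considered the baseline for the logistic regression models. Note that, as built above, the design matrix already contains squared terms and pairwise interaction terms in addition to the raw predictors, and the model fitted below uses L2 regularization. After some EDA, various further transformations and interaction terms will also be considered in order to improve the model.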

In [14]:
# Strongly regularized (C=0.001) L2-penalized logistic regression fitted on
# the full pooled data set.
# The solver is pinned to 'liblinear', the solver shown in the recorded
# output of this cell.  Newer scikit-learn releases default to 'lbfgs', so
# relying on the default would give different coefficients (and a different
# meaning for OPTIMAL_THRESHOLD) depending on the installed version.
logistic_clf = LogisticRegression(C=0.001, penalty='l2', solver='liblinear')
logistic_clf.fit(X, y)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Table of fitted coefficients, largest magnitude first.  A temporary 'abs'
# column carries the sort key and is removed again afterwards.
coefs = (
    pd.DataFrame(
        list(zip(predictors, logistic_clf.coef_[0])),
        columns=['predictor', 'coef'])
    .assign(abs=lambda frame: frame['coef'].abs())
    .sort_values('abs', ascending=False)
    .drop(['abs'], axis=1)
)
coefs

Unnamed: 0,predictor,coef
10,vegetation,1.252718
18,meanTemp_Annual_sq,-1.000793
16,lat,0.868577
19,meanTemp_AprAug_sq,-0.837075
7,meanMaxTemp_Aug,0.792203
20,meanTemp_Aug_sq,-0.756262
21,meanMinTemp_DecFeb_sq,-0.655956
23,meanMinTemp_Jan_sq,-0.597047
39,precip_OctSep:lon,0.564877
9,precip_JunAug,-0.552219


In [37]:
# Predicted probability for every row: the second column of predict_proba's
# output (the positive class, assuming the labels are 0/1 -- confirm).
# Slicing the array replaces a Python-level loop over every row.
probs = logistic_clf.predict_proba(X)[:, 1]

# Rebuild a labelled frame from the scaled array, then re-attach the
# year and x/y columns that were dropped before fitting.
X_df = pd.DataFrame(data=X, index=full.index, columns=predictors)
X_df['year'] = full['year']
X_df['x'] = full['x']
X_df['y'] = full['y']
X_df['probs'] = probs
# 1 where the probability reaches the threshold, else 0.  The vectorized
# comparison gives the same values as a row-by-row lambda.
X_df['preds'] = (X_df['probs'] >= OPTIMAL_THRESHOLD).astype(int)
X_df.head()

Unnamed: 0,meanTemp_Annual,meanTemp_AprAug,meanTemp_Aug,meanMinTemp_DecFeb,meanMinTemp_Oct,meanMinTemp_Jan,meanMinTemp_Mar,meanMaxTemp_Aug,precip_meanAnnual,precip_JunAug,...,precip_OctSep:precip_meanAnnual,precip_OctSep:precip_OctSep,meanMaxTemp_Aug:precip_OctSep,meanTemp_AprAug:precip_OctSep,meanTemp_Aug:precip_OctSep,year,x,y,probs,preds
0,1.787931,1.132197,0.763807,1.930559,2.119847,1.79232,1.837742,0.299068,-0.984647,-1.534291,...,-0.359426,-0.345187,-0.836969,-0.58362,-0.710947,2006,-1550000.0,-2000000.0,0.000356,0
1,1.818684,1.173135,0.801307,1.942941,2.161332,1.804967,1.862798,0.329201,-1.002165,-1.546388,...,-0.360752,-0.3471,-0.857355,-0.606672,-0.731718,2006,-1540000.0,-2000000.0,0.000315,0
2,1.884919,1.294162,0.953691,1.923028,2.163098,1.784628,1.857889,0.530684,-1.012936,-1.538972,...,-0.361543,-0.348415,-0.852438,-0.608494,-0.73111,2006,-1530000.0,-2000000.0,0.000258,0
3,1.708434,1.125133,0.856013,1.773393,1.857413,1.631792,1.64954,0.552282,-0.925817,-1.41111,...,-0.353619,-0.334897,-0.687389,-0.442504,-0.57388,2006,-1520000.0,-2000000.0,0.000576,0
4,1.742202,1.211342,0.972486,1.730878,1.837109,1.588368,1.626167,0.723415,-0.927514,-1.384405,...,-0.3539,-0.335684,-0.671432,-0.434089,-0.563635,2006,-1510000.0,-2000000.0,0.000512,0


In [50]:
# Build a wide table with one row per (x, y) grid cell and one probs_/preds_
# column pair per year.  The cells present in year 2000 define the rows;
# later years are aligned to them on the (x, y) pair.
# The alignment key is a MultiIndex built directly from the x and y columns.
# This avoids a row-by-row apply() that concatenated str(x) + str(y) with no
# separator, and yields the same table.
# NOTE(review): cells that are missing from year 2000 are dropped, and a cell
# missing in a later year gets NaN (which turns that preds_ column into
# floats) -- confirm this is intended.
out_data = X_df.loc[X_df.year == 2000, ['x', 'y', 'probs', 'preds']]
out_data = out_data.rename(columns={'probs': 'probs_2000', 'preds': 'preds_2000'})
out_data.index = pd.MultiIndex.from_arrays(
    [out_data['x'].values, out_data['y'].values])

for year in range(2001, 2015):
    year_data = X_df.loc[X_df.year == year, ['x', 'y', 'probs', 'preds']]
    year_data.index = pd.MultiIndex.from_arrays(
        [year_data['x'].values, year_data['y'].values])
    out_data['probs_%s' % year] = year_data['probs']
    out_data['preds_%s' % year] = year_data['preds']

# Replace the alignment key with a plain 0..n-1 row index.
out_data.index = range(out_data.shape[0])
out_data.head()

Unnamed: 0,x,y,probs_2000,preds_2000,probs_2001,preds_2001,probs_2002,preds_2002,probs_2003,preds_2003,...,probs_2010,preds_2010,probs_2011,preds_2011,probs_2012,preds_2012,probs_2013,preds_2013,probs_2014,preds_2014
0,-1550000.0,-2000000.0,0.000312,0,0.000569,0,0.000428,0,0.000375,0,...,0.000611,0,0.000521,0,0.000435,0,0.000238,0,0.000193,0
1,-1540000.0,-2000000.0,0.000271,0,0.000504,0,0.000371,0,0.000326,0,...,0.000535,0,0.000462,0,0.000378,0,0.000207,0,0.000164,0
2,-1530000.0,-2000000.0,0.000216,0,0.000414,0,0.0003,0,0.000261,0,...,0.000446,0,0.000388,0,0.000304,0,0.000167,0,0.000127,0
3,-1520000.0,-2000000.0,0.000468,0,0.00087,0,0.000672,0,0.000583,0,...,0.000996,0,0.000902,0,0.000707,0,0.00039,0,0.000292,0
4,-1510000.0,-2000000.0,0.000405,0,0.000768,0,0.000601,0,0.000514,0,...,0.000898,0,0.000814,0,0.000622,0,0.000341,0,0.000252,0
