In [1]:
import sys
import pandas as pd

sys.path.insert(1, '../../TESTING/')

import county_library as CL
import county_stratification as CS

# Define Regressors

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Setup Counties

In [4]:
counties = CS.setup_data()

# Auxilary Functions

In [13]:
def normalize_all(X_train, X_test):
    from sklearn.preprocessing import normalize

    cols = X_train.columns

    X_train_norm = normalize(X_train)
    X_test_norm = normalize(X_test)

    X_train = pd.DataFrame(data=X_train_norm, columns=cols.tolist())
    X_test = pd.DataFrame(data=X_test_norm, columns=cols.tolist())
    
    return X_train, X_test

def get_regression_results(reg, x_train, y_train, x_test, y_test):
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    return df

# Build/Split Data
def split_counties(counties, sample=False, left_index=0, right_index= -1, test_index = -1):
    train = CS.build_county(counties[left_index:right_index])#.drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
    if 'Unnamed: 0' in train.columns:
        train = train.drop(['Unnamed: 0'],axis=1)
    if 'Unnamed: 0.1' in train.columns:
        train = train.drop(['Unnamed: 0.1'],axis=1)
            
    test = counties[test_index]#.drop(['Unnamed: 0', 'Unnamed: 0.1'],axis=1)
    if 'Unnamed: 0' in test.columns:
        test = test.drop(['Unnamed: 0'],axis=1)
    if 'Unnamed: 0.1' in test.columns:
        test = test.drop(['Unnamed: 0.1'],axis=1)
    
    if sample:
        # Training Data
        X_train = train.iloc[:, 1:-1][sample].reset_index().drop('index', axis=1)
        y_train = train.loc[:, 'Death Rate']

        # Testing Data
        X_test = test.iloc[:, 1:-1][sample].reset_index().drop('index', axis=1)
        y_test = test.loc[:, 'Death Rate']
    
    else:
        # Training Data
        X_train = train.iloc[:, 1:-1].reset_index().drop('index', axis=1)
        y_train = train.loc[:, 'Death Rate']

        # Testing Data
        X_test = test.iloc[:, 1:-1].reset_index().drop('index', axis=1)
        y_test = test.loc[:, 'Death Rate']

    return X_train, y_train, X_test, y_test

def get_mse(column=False, double=False, test_index = -1):
    counties = CS.setup_data()
    if double: 
        if column:
            mse = {}
            beginning = 2009
            for i in range(9):
                for j in range(i+1, 9):
                    X_train, y_train, X_test, y_test = split_counties(counties, sample=column, left_index=i, right_index=j)
                    X_train, X_test = normalize_all(X_train, X_test)

                    year = str(beginning)
                    end  = str(2009 + j) 

                    res = get_regression_results(rf, X_train, y_train, X_test, y_test)
                    mse[year+'-'+end] = mean_squared_error(res.iloc[:, 0].tolist(), res.iloc[:, 1].tolist())
                beginning += 1
        else:
            mse = {}
            beginning = 2009
            for i in range(9):
                for j in range(i+1, 9):
                    X_train, y_train, X_test, y_test = split_counties(counties, sample=False, left_index=i, right_index=j)
                    X_train, X_test = normalize_all(X_train, X_test)

                    year = str(beginning)
                    end  = str(2009 + j) 

                    res = get_regression_results(rf, X_train, y_train, X_test, y_test)
                    mse[year+'-'+end] = mean_squared_error(res.iloc[:, 0].tolist(), res.iloc[:, 1].tolist())
                beginning += 1
    else: 
        if column:
            mse = {}
            beginning = 2009
            for i in range(8):
                X_train, y_train, X_test, y_test = split_counties(counties, sample=column, left_index=i)
                X_train, X_test = normalize_all(X_train, X_test)

                year = str(beginning)
                beginning += 1
                res = get_regression_results(rf, X_train, y_train, X_test, y_test)
                mse[year+'-2017'] = mean_squared_error(res.iloc[:, 0].tolist(), res.iloc[:, 1].tolist())

        else:
            mse = {}
            beginning = 2009
            for i in range(8):
                X_train, y_train, X_test, y_test = split_counties(counties, sample=False, left_index=i)
                X_train, X_test = normalize_all(X_train, X_test)

                year = str(beginning)
                beginning += 1
                res = get_regression_results(rf, X_train, y_train, X_test, y_test)
                mse[year+'-2017'] = mean_squared_error(res.iloc[:, 0].tolist(), res.iloc[:, 1].tolist())
           
    
    return mse

def mse_simple(train_begin, train_end, k = -1, column = False):
    

    counties = CS.setup_data()
    if column: 
        X_train, y_train, X_test, y_test = split_counties(counties, sample=column, left_index=train_begin, right_index=train_end, test_index=k)
    else:
        X_train, y_train, X_test, y_test = split_counties(counties, sample=False, left_index=train_begin, right_index=train_end, test_index=k)
    X_train, X_test = normalize_all(X_train, X_test)

    year = str(train_begin+2009)
    end  = str(train_end+2009-1)
    cur = str(2009+k)

    mse = {}
    res = get_regression_results(rf, X_train, y_train, X_test, y_test)
    mse[year+'-'+end+'; '+cur] = mean_squared_error(res.iloc[:, 0].tolist(), res.iloc[:, 1].tolist())
    
    return year+'-'+end, cur, mse[year+'-'+end+'; '+cur]

def convert_mse(mse):
    keys = list(mse.keys())
    vals = list(mse.values())
    
    res = []
    
    for i in range(len(keys)):
        res.append([keys[i], vals[i]])
        
    return pd.DataFrame(res, columns=['Years', 'MSE'])

In [6]:
from itertools import combinations as C

In [8]:
combs = C([i for i in range(2009, 2018)], r=2)
test = [i for i in range(2010, 2019)]

c = [i for i in combs]

test_years = []

for year in test:
    for com in c:
        if year > com[1]:
            test_years.append([com[0], com[1],year])
            
for element in test_years:
    for i in range(len(element)):
        if i == 1:
            element[i] -= (2009 - 1)
        else:
            element[i] -= 2009



In [9]:
mse_results = []
for trial in test_years: 
    mse_temp = mse_simple(trial[0], trial[1], trial[2])
    mse_results.append([mse_temp[0], mse_temp[1], mse_temp[2]])

# Combinations of Train/Test Splits

In [19]:
mse_frame = pd.DataFrame(data=mse_results, columns=['Train', 'Test', 'MSE'])
mse_frame.sort_values(by='MSE').head(15)

Unnamed: 0,Train,Test,MSE
2,2009-2011,2012,56.876148
9,2011-2012,2013,59.548055
8,2010-2012,2013,60.155628
6,2009-2012,2013,60.732333
3,2010-2011,2012,61.842465
5,2009-2011,2013,62.12381
7,2010-2011,2013,63.22293
1,2009-2010,2012,63.903783
4,2009-2010,2013,64.149303
19,2012-2013,2014,83.751105


# All Features

In [14]:
convert_mse(get_mse())

Unnamed: 0,Years,MSE
0,2009-2017,194.884649
1,2010-2017,195.948488
2,2011-2017,194.141128
3,2012-2017,192.982773
4,2013-2017,189.41455
5,2014-2017,185.675466
6,2015-2017,177.893088
7,2016-2017,173.756345


# Unemployment

In [15]:
convert_mse(get_mse(column='Unemployment Rate'))

Unnamed: 0,Years,MSE
0,2009-2017,204.182474
1,2010-2017,199.854987
2,2011-2017,196.370218
3,2012-2017,193.767293
4,2013-2017,188.815862
5,2014-2017,183.828069
6,2015-2017,179.671831
7,2016-2017,177.664291


# Income

In [16]:
convert_mse(get_mse(column='Income'))

Unnamed: 0,Years,MSE
0,2009-2017,204.32485
1,2010-2017,199.600007
2,2011-2017,196.799013
3,2012-2017,193.565756
4,2013-2017,188.822442
5,2014-2017,183.558718
6,2015-2017,179.63172
7,2016-2017,177.665751


# Poverty Rate

In [17]:
convert_mse(get_mse(column='Poverty Rate'))

Unnamed: 0,Years,MSE
0,2009-2017,203.83719
1,2010-2017,199.526455
2,2011-2017,196.419119
3,2012-2017,193.906412
4,2013-2017,189.064369
5,2014-2017,183.608551
6,2015-2017,179.530963
7,2016-2017,177.672037


# Prescription Rate

In [18]:
convert_mse(get_mse(column='Prescription Rate'))

Unnamed: 0,Years,MSE
0,2009-2017,203.700558
1,2010-2017,199.902041
2,2011-2017,196.542057
3,2012-2017,193.472853
4,2013-2017,189.022408
5,2014-2017,183.276662
6,2015-2017,179.424777
7,2016-2017,177.664775
