In [4]:
import pandas as pd
import sys
sys.path.insert(1, '../../TESTING/')

import county_library as CL
import county_stratification as CS

# Functions

In [5]:
def county_cleanup(county):
    """Return a copy of ``county`` without leftover CSV index columns.

    Only the columns named ``'Unnamed: 0'`` and ``'Unnamed: 0.1'`` are
    removed, and only when present. The input frame is left untouched.

    Parameters
    ----------
    county : pandas.DataFrame
        Frame that may carry the stray index columns.

    Returns
    -------
    pandas.DataFrame
        New frame with every other column kept in its original order.
    """
    stray_index_cols = ('Unnamed: 0', 'Unnamed: 0.1')
    present = [name for name in stray_index_cols if name in county.columns]
    # drop() hands back a new frame even when `present` is empty.
    return county.drop(columns=present)

def county_dict():
    """Load the per-year county frames and key them by year string.

    The frames come from ``CS.setup_data()`` and are paired positionally
    with the labels ``'2009'`` ... ``'2018'``.
    NOTE(review): this presumes ``setup_data`` returns the frames in
    ascending year order starting at 2009 - confirm against that module.

    Returns
    -------
    dict
        Maps a year string to the frame at the same position.

    Raises
    ------
    IndexError
        If ``setup_data`` yields more than ten frames.
    """
    year_labels = [str(year) for year in range(2009, 2019)]
    frames = CS.setup_data()
    return {year_labels[pos]: frame for pos, frame in enumerate(frames)}

def normalize_all(X_train, X_test):
    """Normalize both feature frames and return them as DataFrames.

    Each frame is passed through ``sklearn.preprocessing.normalize`` with
    its default arguments. Per the scikit-learn docs, those defaults rescale
    every row (sample) to unit L2 norm; this is not per-column scaling.

    Both returned frames are labelled with the column names of ``X_train``.
    They get a fresh default index, so the original row index is not kept.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Numeric feature frames with the same number of columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_normalized, X_test_normalized)``.
    """
    from sklearn.preprocessing import normalize

    feature_names = X_train.columns.tolist()

    def _as_normalized_frame(features):
        # normalize() returns a bare array, so the column labels are re-attached.
        return pd.DataFrame(data=normalize(features), columns=feature_names)

    return _as_normalized_frame(X_train), _as_normalized_frame(X_test)

def get_regression_results(reg, x_train, x_test, y_train, y_test):
    """Fit ``reg`` on the training data and tabulate its test predictions.

    Parameters
    ----------
    reg : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    x_train, y_train
        Training features and targets.
    x_test, y_test
        Hold-out features and the matching true targets.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Actual'`` (``y_test``) and ``'Predicted'``, in that order.
    """
    reg.fit(x_train, y_train)
    predictions = reg.predict(x_test)
    comparison = {'Actual': y_test, 'Predicted': predictions}
    return pd.DataFrame(comparison)

def mse_frame(data_in, columns_in):
    """Wrap ``data_in`` in a DataFrame whose columns are ``columns_in``."""
    frame = pd.DataFrame(data_in, columns=columns_in)
    return frame

# Load Data

# Train: 2010-2012; Test: 2013

In [6]:
# Training data: the 2010-2012 county frames combined by CS.build_county.
# Hold-out data: the 2013 frame.
# NOTE: the following load cell rebinds counties/train/test/fips, so on a
# top-to-bottom run these values are replaced before the split cell runs.
counties = county_dict()
train_counties = [counties[year] for year in ('2010', '2011', '2012')]
test = county_cleanup(counties['2013'])

train = county_cleanup(CS.build_county(train_counties))
# County identifiers of the hold-out rows, attached to the predictions later.
fips = test['FIPS']

# Train: 2016-2017; Test: 2018

In [17]:
# Training data: the 2016-2017 county frames combined by CS.build_county.
# Hold-out data: the 2018 frame.
counties = county_dict()
train_counties = [counties[year] for year in ('2016', '2017')]
test = county_cleanup(counties['2018'])

train = county_cleanup(CS.build_county(train_counties))
# County identifiers of the hold-out rows, attached to the predictions later.
fips = test['FIPS']

# Train/Test Split

In [18]:
# Targets: the 'Death Rate' column of each frame.
y_train, y_test = train['Death Rate'], test['Death Rate']

# Features: every column except the first and the last, then normalized.
# NOTE(review): presumably the first column is an identifier and the last is
# the target - confirm against the frame layout.
X_train, X_test = normalize_all(train.iloc[:, 1:-1], test.iloc[:, 1:-1])

In [19]:
# Sanity check: number of rows in the hold-out target vector.
len(y_test)

671

# Prepare Random Forest Hyper Params

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [61]:
# Random-forest search grid: one forest is trained per combination.
estimators = [5, 10, 25, 50, 75, 100]   # n_estimators candidates
# 'auto' is intentionally absent. For RandomForestRegressor it meant "use all
# features", the same as None, so it only duplicated that grid point, and
# scikit-learn >= 1.3 rejects the value outright.
max_feats  = [None, 'sqrt', 'log2', 0.9, 0.2]
min_leaf = [8, 9, 10, 11, 12]           # min_samples_leaf candidates

In [20]:
# Shared scoreboard: maps a hyper-parameter key string to the test-set MSE.
# The search cells below write into it.
mse_results = {}

# Random Forest

In [21]:
# Exhaustive random-forest grid search, repeated 5 times because the forests
# are unseeded and the winner can change from pass to pass.
for iteration in range(5):
    # Scores from this pass only. Taking the minimum over the shared
    # `mse_results` dict could report a key left behind by another search cell.
    rf_mse = {}
    for est in estimators:
        for feat in max_feats:
            for leaf in min_leaf:
                key = str(est) + ',' + str(feat) + ',' + str(leaf)
                rf = RandomForestRegressor(n_estimators=est, max_features=feat, min_samples_leaf=leaf)
                res = get_regression_results(rf, X_train, X_test, y_train, y_test)
                rf_mse[key] = mean_squared_error(res['Actual'], res['Predicted'])
    # Keep the shared scoreboard populated for any later consumer.
    mse_results.update(rf_mse)
    # First key (in grid order) that reaches the lowest MSE of this pass.
    best_key = min(rf_mse, key=rf_mse.get)
    minimum = rf_mse[best_key]
    print(best_key, minimum)

NameError: name 'estimators' is not defined

# KNN

In [22]:
from sklearn.neighbors import KNeighborsRegressor
# KNN search grid: neighbourhood sizes 1..30 and both weighting schemes.
n = list(range(1, 31))
w = ['uniform', 'distance']

In [23]:
# Exhaustive KNN grid search over neighbourhood size and weighting scheme.
for iteration in range(5):
    # Scores from this pass only. `mse_results` is shared with the
    # random-forest search, so taking the minimum over it would print the
    # forest's best key and score here whenever that search ran first.
    knn_mse = {}
    for neighbor in n:
        for weight in w:
            key = str(neighbor) + ', ' + str(weight)
            knn = KNeighborsRegressor(n_neighbors=neighbor, weights=weight)
            res = get_regression_results(knn, X_train, X_test, y_train, y_test)
            knn_mse[key] = mean_squared_error(res['Actual'], res['Predicted'])
    # Keep the shared scoreboard populated for any later consumer.
    mse_results.update(knn_mse)
    # First key (in grid order) that reaches the lowest MSE of this pass.
    best_key = min(knn_mse, key=knn_mse.get)
    minimum = knn_mse[best_key]
    print(best_key, minimum)

18, uniform 166.35654838411435
18, uniform 166.35654838411435
18, uniform 166.35654838411435
18, uniform 166.35654838411435
18, uniform 166.35654838411435


# RF 2018

In [42]:
# Random-forest grid search on whatever split is currently held in
# X_train/X_test. Repeated 5 times because the forests are unseeded.
for iteration in range(5):
    # Scores from this pass only. By this point `mse_results` also holds the
    # KNN keys, so taking the minimum over it could report a KNN configuration.
    rf_mse = {}
    for est in estimators:
        for feat in max_feats:
            for leaf in min_leaf:
                key = str(est) + ',' + str(feat) + ',' + str(leaf)
                rf = RandomForestRegressor(n_estimators=est, max_features=feat, min_samples_leaf=leaf)
                res = get_regression_results(rf, X_train, X_test, y_train, y_test)
                rf_mse[key] = mean_squared_error(res['Actual'], res['Predicted'])
    # Keep the shared scoreboard populated for any later consumer.
    mse_results.update(rf_mse)
    # First key (in grid order) that reaches the lowest MSE of this pass.
    best_key = min(rf_mse, key=rf_mse.get)
    minimum = rf_mse[best_key]
    print(best_key, minimum)

75,0.2,8 157.9722244345081
75,0.2,12 158.63857712370606
25,0.2,11 157.6888858177801
25,0.2,8 157.53779242974792
25,0.2,11 158.38215640494371


# Optimized 2013

In [72]:
# Rebuild the 2010-2012 -> 2013 split here instead of reading the globals.
# On a top-to-bottom run, X_train/X_test/y_*/fips hold the 2016-2017 -> 2018
# split by the time this cell executes, so the original cell produced 2018
# predictions under the name `results2013`.
counties_2013 = county_dict()
train_2013 = county_cleanup(
    CS.build_county([counties_2013[year] for year in ('2010', '2011', '2012')])
)
test_2013 = county_cleanup(counties_2013['2013'])

# Same feature/target selection as the shared split cell: every column except
# the first and the last as features, 'Death Rate' as the target.
X_train_2013, X_test_2013 = normalize_all(train_2013.iloc[:, 1:-1], test_2013.iloc[:, 1:-1])
y_train_2013 = train_2013['Death Rate']
y_test_2013 = test_2013['Death Rate']

rf_optimal = RandomForestRegressor(n_estimators=10, max_features=0.9, min_samples_leaf=8)
results2013 = get_regression_results(rf_optimal, X_train_2013, X_test_2013, y_train_2013, y_test_2013)
# Attach the county identifier; assignment aligns on the hold-out row index.
results2013['FIPS'] = test_2013['FIPS']

In [73]:
# Preview the first 10 actual-vs-predicted rows with their FIPS codes.
results2013.head(10)

Unnamed: 0,Actual,Predicted,FIPS
0,8.49,11.848832,1073
1,4.83,14.762081,1097
2,11.3,6.805669,2020
3,7.72,10.059127,4003
4,5.11,9.30959,4013
5,22.66,16.036895,4015
6,15.15,10.891089,4019
7,6.93,6.146038,4021
8,4.65,11.999331,4025
9,6.13,11.617353,5119


# Optimized 2018

In [43]:
# Fit a forest with the 25 / 0.2 / 8 configuration, which appears among the
# best keys printed by the search above. Then tag each prediction with its
# county FIPS code (aligned on the hold-out row index).
rf_optimal = RandomForestRegressor(
    n_estimators=25,
    max_features=0.2,
    min_samples_leaf=8,
)
results2018 = get_regression_results(rf_optimal, X_train, X_test, y_train, y_test).assign(FIPS=fips)

In [80]:
# Preview the first 15 actual-vs-predicted rows with their FIPS codes.
# NOTE(review): the saved output below lists only 10 rows, so it appears to
# predate this head(15) call - re-run the cell to refresh it.
results2018.head(15)

Unnamed: 0,Actual,Predicted,FIPS
0,6.42,20.485038,1003
1,13.18,19.863722,1043
2,14.63,22.763879,1055
3,20.63,24.826969,1073
4,3.0,20.720558,1089
5,7.98,21.485524,1097
6,11.76,21.689238,1103
7,18.04,20.145326,1115
8,11.13,16.324532,1117
9,5.74,20.545171,1125
