In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
import seaborn as sns
# select seaborn's built-in 'poster' plotting context (affects figures drawn after this call)
sns.set_context('poster')

## Models With MSA, Years

### Import, Split, and Standardize Data

In [2]:
# time of day at which the notebook run begins (read later by print_runtime)
start = datetime.datetime.now().time()

In [3]:
# location of the merged dataset, relative to this notebook
merged_data_path = '../data/merged/all_data_2006_to_2016.pkl'

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source
df = pd.read_pickle(merged_data_path)

In [4]:
# summarize the loaded frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 928 entries, 0 to 927
Data columns (total 13 columns):
year                                                                           928 non-null int64
MSA_orig                                                                       928 non-null object
MSA                                                                            928 non-null object
MSA_abbr                                                                       928 non-null object
now_married_except_separated                                                   928 non-null float64
less_than_high_school_diploma                                                  928 non-null float64
unmarried_portion_of_women_15_to_50_years_who_had_a_birth_in_past_12_months    928 non-null float64
households_with_food_stamp_snap_benefits                                       928 non-null float64
percentage_married_couple_family                                               928 non-null float64


In [5]:
# remove outliers with atypically high murder rates:
#   - every NEW_ORLEANS_LA row
#   - MEMPHIS_TN in 2016 only
#   - BATON_ROUGE_LA in 2007 only
is_new_orleans = df.MSA_abbr == 'NEW_ORLEANS_LA'
is_memphis_2016 = (df.MSA_abbr == 'MEMPHIS_TN') & (df.year == 2016)
is_baton_rouge_2007 = (df.MSA_abbr == 'BATON_ROUGE_LA') & (df.year == 2007)

df = df[~(is_new_orleans | is_memphis_2016 | is_baton_rouge_2007)]

In [6]:
# drop extra MSA names (MSA_abbr stays as the single MSA identifier)
extra_msa_name_cols = ['MSA_orig', 'MSA']
df = df.drop(extra_msa_name_cols, axis='columns')

In [7]:
# drop all features except year
unused_feature_cols = [
    'now_married_except_separated',
    'less_than_high_school_diploma',
    'unmarried_portion_of_women_15_to_50_years_who_had_a_birth_in_past_12_months',
    'households_with_food_stamp_snap_benefits',
    'percentage_married_couple_family',
    'percentage_female_householder_no_husband_present_family',
    'poverty_all_people',
    'house_median_value_(dollars)',
]
df = df.drop(unused_feature_cols, axis='columns')

In [8]:
# one-hot encode MSA_abbr into indicator columns; drop_first=True omits the
# indicator for the first category level, so that level has no column of its own
df = pd.get_dummies(df, columns=['MSA_abbr'], drop_first=True)
df.head()

Unnamed: 0,year,murder_per_100_k,MSA_abbr_ALBANY_NY,MSA_abbr_ALBUQUERQUE_NM,MSA_abbr_ALLENTOWN_PA,MSA_abbr_ATLANTA_GA,MSA_abbr_AUGUSTA_GA,MSA_abbr_AUSTIN_TX,MSA_abbr_BAKERSFIELD_CA,MSA_abbr_BALTIMORE_MD,...,MSA_abbr_TAMPA_FL,MSA_abbr_TOLEDO_OH,MSA_abbr_TUCSON_AZ,MSA_abbr_TULSA_OK,MSA_abbr_VIRGINIA_BEACH_NC,MSA_abbr_WASHINGTON_DC,MSA_abbr_WICHITA_KS,MSA_abbr_WINSTON_NC,MSA_abbr_WORCESTER_MA,MSA_abbr_YOUNGSTOWN_OH
0,2007,3.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2008,3.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2009,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2010,3.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# separate labels and features: y is the target column, x is every other column

label_col = 'murder_per_100_k'

y = df[label_col]
x = df.drop(label_col, axis='columns')

same_length = len(x) == len(y)
print('Sizes match: {}'.format(same_length))

Sizes match: True


In [10]:
# train test split

from sklearn.model_selection import train_test_split

# fixed seed so the 70/30 split is the same on every Restart & Run All
SPLIT_SEED = 42

x_train_s, x_test_s, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=SPLIT_SEED)

print('Len x_train: {}'.format(len(x_train_s)))
print('Len x_test: {}'.format(len(x_test_s)))
print('Len y_train: {}'.format(len(y_train)))
# label corrected: this line reports y_test (it used to print 'Len x_test' a second time)
print('Len y_test: {}'.format(len(y_test)))

Len x_train: 641
Len x_test: 275
Len y_train: 641
Len x_test: 275


In [11]:
# standardize data

from sklearn.preprocessing import StandardScaler

# the scaler is fitted on the training split only and then applied to both splits
standardizer = StandardScaler()
standardizer.fit(x_train_s)

x_train, x_test = (standardizer.transform(split) for split in (x_train_s, x_test_s))

### Simple Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, BayesianRidge, HuberRegressor
from sklearn.model_selection import GridSearchCV

In [13]:
# instantiate and fit models

def make_models(x_train, y_train):
    """Fit each regression model on the given data and return them keyed by name.

    Parameters
    ----------
    x_train : feature matrix passed to each estimator's ``fit``
    y_train : target values passed to each estimator's ``fit``

    Returns
    -------
    dict
        Keys 'linear', 'ridge', 'lasso', 'bayes' and 'huber', each mapped to a
        fitted estimator. 'huber' is the best estimator found by a grid search
        over ``epsilon``.
    """
    # candidate epsilon values tried for the Huber regressor
    huber_grid = {'epsilon': [1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7]}
    huber_search = GridSearchCV(HuberRegressor(), huber_grid)

    # entries are evaluated (and therefore fitted) top to bottom
    return {
        'linear': LinearRegression().fit(x_train, y_train),
        'ridge': RidgeCV(cv=15).fit(x_train, y_train),
        'lasso': LassoCV(cv=15).fit(x_train, y_train),
        'bayes': BayesianRidge(tol=0.0001).fit(x_train, y_train),
        'huber': huber_search.fit(x_train, y_train).best_estimator_,
    }

In [14]:
# score models

def score_model(model):
    """Score a fitted model on the notebook-level train and test splits.

    Reads the globals ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Returns
    -------
    numpy.ndarray
        ``[train_score, test_score]`` as returned by ``model.score``.
    """
    splits = ((x_train, y_train), (x_test, y_test))
    return np.array([model.score(features, labels) for features, labels in splits])

In [15]:
def results_df(model):
    """Build a one-row frame holding a model's scores followed by its coefficients.

    Uses ``score_model`` for the two scores and the notebook-level ``x`` for the
    coefficient column names.

    Returns
    -------
    pandas.DataFrame
        Columns 'Train R2', 'Test R2', then one column per column of ``x``.
    """
    # train / test scores as a single row
    scores_row = pd.DataFrame([score_model(model)], columns=['Train R2','Test R2'])

    # fitted coefficients as a single row, labelled with the feature names
    coeffs_row = pd.DataFrame([model.coef_], columns=x.columns)

    # place the two rows side by side
    return pd.concat([scores_row, coeffs_row], axis=1)

In [16]:
from sklearn.utils import resample

In [17]:
def run_experiment(n_iters):
    """Refit every model on ``n_iters`` resamples of the training data.

    Each iteration resamples the notebook-level ``x_train`` / ``y_train`` with
    ``resample``, fits all models via ``make_models`` and records one
    ``results_df`` row per model.

    Returns
    -------
    dict
        Model name -> DataFrame with one row per iteration (the per-iteration
        rows are concatenated as-is, so their index values repeat).
    """
    rows_by_model = dict()

    for _ in range(n_iters):
        # draw a fresh resample and fit every model on it
        boot_x, boot_y = resample(x_train, y_train)
        fitted = make_models(boot_x, boot_y)

        # start an empty list the first time a model name is seen, then append its row
        for name, model in fitted.items():
            rows_by_model.setdefault(name, []).append(results_df(model))

    # stack each model's rows into a single frame
    return {name: pd.concat(rows) for name, rows in rows_by_model.items()}

In [18]:
# run 200 resampling iterations; exp maps each model name to its per-iteration results
exp = run_experiment(200)

In [19]:
# regroup the results: from {model: frame of all result columns}
# to {result column: {model: that column's values}}
coef_dict = dict()

for model_name, model_results in exp.items():
    for result_col in model_results.columns:
        # create the inner dict the first time this result column is seen
        coef_dict.setdefault(result_col, dict())[model_name] = model_results[result_col]

# convert dict of dicts into dict of dataframes (one column per model)
coef_dfs = {result_col: pd.DataFrame(per_model)
            for result_col, per_model in coef_dict.items()}

In [20]:
# for each model, print its name and the mean of every result column,
# with a blank line after each
for key, model_results in exp.items():
    print(key, '', model_results.mean(), '', sep='\n')

linear

Train R2                        -1.685948e+26
Test R2                         -2.676673e+26
year                            -2.474309e-01
MSA_abbr_ALBANY_NY               2.297532e+09
MSA_abbr_ALBUQUERQUE_NM          1.853909e-01
MSA_abbr_ALLENTOWN_PA           -1.233394e-01
MSA_abbr_ATLANTA_GA              2.480143e-01
MSA_abbr_AUGUSTA_GA              3.639266e-01
MSA_abbr_AUSTIN_TX              -2.186196e-01
MSA_abbr_BAKERSFIELD_CA          2.566492e-01
MSA_abbr_BALTIMORE_MD            6.847021e-01
MSA_abbr_BATON_ROUGE_LA          6.656917e-01
MSA_abbr_BIRMINGHAM_AL           4.532214e-01
MSA_abbr_BOISE_ID               -3.168309e-01
MSA_abbr_BOSTON_MA              -7.331181e+10
MSA_abbr_BRADENTON_FL            8.613645e+11
MSA_abbr_BRIDGEPORT_CT          -1.876810e-01
MSA_abbr_BUFFALO_NY             -2.982015e+10
MSA_abbr_CAPE_CORAL_FL           5.650360e+10
MSA_abbr_CHARLESTON_SC           7.075024e+09
MSA_abbr_CHARLOTTE_NC           -1.469361e+11
MSA_abbr_CHATANOOGA_TN    

In [21]:
def print_runtime(start_time=None, end_time=None):
    """Print the elapsed wall-clock time between two times of day.

    Parameters
    ----------
    start_time, end_time : optional
        Objects exposing ``hour``, ``minute`` and ``second`` attributes
        (e.g. ``datetime.time``). When omitted, the notebook-level ``start``
        and ``end`` globals are used.

    Prints ``<h> hrs <m> mins <s> secs``. Fractions of a second are ignored.
    An end time earlier than the start time is treated as having crossed
    midnight once; runs of 24 hours or more cannot be represented.
    """
    if start_time is None:
        start_time = start
    if end_time is None:
        end_time = end

    def seconds_since_midnight(t):
        return t.hour * 3600 + t.minute * 60 + t.second

    # Work in total seconds so borrows between units cannot go wrong: the
    # field-by-field version borrowed in the wrong order and could print
    # negative minutes or hours (e.g. 10:05:50 -> 11:05:10).
    elapsed = (seconds_since_midnight(end_time)
               - seconds_since_midnight(start_time)) % (24 * 3600)

    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(hours, "hrs", minutes, "mins", seconds, "secs")

In [22]:
# time of day at which the notebook run finishes (read by print_runtime)
end = datetime.datetime.now().time()

In [23]:
# report the elapsed time between the `start` and `end` timestamps recorded above
print_runtime()

0 hrs 6 mins 16 secs
