In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly as py
import plotly.figure_factory as ff
import scipy.io as sc
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima_model import ARIMA

In [None]:
# Load the merged social-factor table; the first CSV column is used as the row index.
soc_df = pd.read_csv(
    filepath_or_buffer="datasets/after/mergedSOC.csv",
    index_col=0,
)
print(soc_df.shape)
soc_df.head()

In [None]:
# Load the merged hospital-discharge table; the first CSV column is used as the row index.
hd_df = pd.read_csv(
    filepath_or_buffer="datasets/after/mergedIP.csv",
    index_col=0,
)
print(hd_df.shape)
hd_df.head()

# Autoregression to Predict 2019 Social Factors

### Insert Details

For the first part of our project, we used an autoregressive model to predict each social factor's 2019 value from its values in the previous years. We then compared these predictions to the actual social data from 2019.

In [None]:
# Separate copy of the social-factor table for the autoregression section
# (same file as the one loaded into soc_df).
mergedSOC_df = pd.read_csv(
    filepath_or_buffer="datasets/after/mergedSOC.csv",
    index_col=0,
)

In [None]:
# factors is a list of dataframes where factors[k] describes the k-th factor.
# Per the original note, each dataframe has 51 rows (counties) and 7 columns (years).
#
# Columns are dealt out round-robin: column number j of mergedSOC_df goes to
# factors[j % 48], and is renamed to the last two characters of its original
# name (presumably a two-digit year suffix; confirm against the CSV header).
factors = [mergedSOC_df[[]] for _ in range(48)]

for position, column_name in enumerate(mergedSOC_df.columns):
    slot = position % 48
    single_column = mergedSOC_df.loc[:, column_name]
    widened = pd.concat([factors[slot], single_column], axis=1, join='inner')
    factors[slot] = widened.rename(columns={column_name: column_name[-2:]})




In [None]:
# Train/test split for the first factor only: the "19" column is the target,
# every other column is the history the model is fitted on.
train = np.array(factors[0].drop(columns="19"))
test = np.array(factors[0]["19"])

In [None]:
# Fit one autoregressive model (2 lags) per row of `train` and forecast the
# single step that immediately follows the training window.
# `train` comes from the first factor only, so each row is one series of that
# factor (one per county according to the note on `factors`).
#
# Iterating over the rows directly removes the hard-coded row count of 51, so
# the cell keeps working if the number of rows in the data changes.
forecast_step = train.shape[1]  # index of the first step after the training window

predicts = []
for series in train:
    fit_model = AutoReg(series, lags=2).fit()
    predict = fit_model.predict(start=forecast_step, end=forecast_step, dynamic=False)
    predicts.append(predict)
# One single-step forecast per row, stacked in row order.
predicts = np.array(predicts)

In [None]:
#visualize, calc error
# Flatten both arrays before subtracting: `predicts` stacks one single-step
# forecast per row, so it may carry a trailing length-1 axis, and subtracting it
# from the 1-D `test` would then broadcast to an n x n matrix instead of n errors.
actual = np.ravel(test)
forecast = np.ravel(predicts)

# Mean squared error is the mean of the squared errors (not the square of the
# summed errors, where positive and negative errors cancel out).
msqr = np.mean((actual - forecast) ** 2)
rmsqr = msqr ** (1/2)

plt.plot(forecast, label='predicted')
plt.plot(actual, label='actual')
plt.legend()
rmsqr

# Nonlinear vs Linear Neural Net

### Using Social Factors to predict Health Outcomes

For the next part of our project, we wanted to use our social factors to predict hospital discharges.  We first classified each county as "below average health", "average health", and "above average health" depending on the percentage of discharges they had relative to their population.

We then ran both a nonlinear neural network with sigmoid activation functions, and a linear network to compare how they functioned relative to one another.

Each model takes the social data from one year as input and predicts the health outcome (below average, average, or above average) for that same year, where below average health corresponds to an output of $100$, average to $110$, and above average to $111$.

In [None]:
# Average every run of four consecutive discharge columns into a single column,
# then label the seven resulting columns with the years 2013-2019.
column_group = np.arange(len(hd_df.columns)) // 4
df_train_features = hd_df.groupby(column_group, axis=1).mean()
df_train_features.columns = [str(year) for year in range(2013, 2020)]
print(df_train_features.shape)
df_train_features.head()

In [None]:
# One Series per year, taken from the yearly table by column position (0 -> 2013, ..., 6 -> 2019).
(
    df_train_labels_2013,
    df_train_labels_2014,
    df_train_labels_2015,
    df_train_labels_2016,
    df_train_labels_2017,
    df_train_labels_2018,
    df_train_labels_2019,
) = (df_train_features.iloc[:, year_position] for year_position in range(7))

In [None]:
# 0.33 quantile of each year's values (upper bound of the "Above" class used later).
(
    above_2013,
    above_2014,
    above_2015,
    above_2016,
    above_2017,
    above_2018,
    above_2019,
) = (
    yearly_values.quantile(0.33)
    for yearly_values in (
        df_train_labels_2013,
        df_train_labels_2014,
        df_train_labels_2015,
        df_train_labels_2016,
        df_train_labels_2017,
        df_train_labels_2018,
        df_train_labels_2019,
    )
)

In [None]:
# 0.66 quantile of each year's values (upper bound of the "Average" class used later).
(
    average_2013,
    average_2014,
    average_2015,
    average_2016,
    average_2017,
    average_2018,
    average_2019,
) = (
    yearly_values.quantile(0.66)
    for yearly_values in (
        df_train_labels_2013,
        df_train_labels_2014,
        df_train_labels_2015,
        df_train_labels_2016,
        df_train_labels_2017,
        df_train_labels_2018,
        df_train_labels_2019,
    )
)

In [None]:
# Quantile 1 (i.e. the largest value) of each year's values.
(
    below_2013,
    below_2014,
    below_2015,
    below_2016,
    below_2017,
    below_2018,
    below_2019,
) = (
    yearly_values.quantile(1)
    for yearly_values in (
        df_train_labels_2013,
        df_train_labels_2014,
        df_train_labels_2015,
        df_train_labels_2016,
        df_train_labels_2017,
        df_train_labels_2018,
        df_train_labels_2019,
    )
)

In [None]:
# Cut the social-factor table into seven consecutive blocks of 48 columns,
# one block per year from 2013 through 2019.
(
    df_test_features_2013,
    df_test_features_2014,
    df_test_features_2015,
    df_test_features_2016,
    df_test_features_2017,
    df_test_features_2018,
    df_test_features_2019,
) = (soc_df.iloc[:, first:first + 48] for first in range(0, 336, 48))
df_test_features_2013.head()

In [None]:
# Combine all data into two data frames

# First rename all social factor columns to blank so they combine correctly:
# the yearly frames are then stacked purely by column position.
for yearly_features in (
    df_test_features_2013,
    df_test_features_2014,
    df_test_features_2015,
    df_test_features_2016,
    df_test_features_2017,
    df_test_features_2018,
    df_test_features_2019,
):
    yearly_features.columns = [''] * len(df_test_features_2013.columns)

# df_factors has the yearly frames stacked on top of each other (rows), not
# side by side in the same row. 2016 is left out on purpose: later cells score
# the fitted models on the 2016 frame.
# pd.concat is used because DataFrame.append was deprecated and then removed
# from pandas; the resulting rows and their order are the same.
df_factors = pd.concat([
    df_test_features_2013,
    df_test_features_2014,
    df_test_features_2015,
    #df_test_features_2016,
    df_test_features_2017,
    df_test_features_2018,
    df_test_features_2019,
])

# df_labels has the health data for the same years, stacked in the same order
# so that row k of df_factors pairs with entry k of df_labels.
labels = [df_train_labels_2013,df_train_labels_2014,df_train_labels_2015,#df_train_labels_2016,
          df_train_labels_2017,df_train_labels_2018,df_train_labels_2019]
df_labels = pd.concat(labels)

In [None]:
# Hold out 10% of the stacked rows; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    df_factors,
    df_labels,
    test_size=0.1,
    random_state=10,
)

In [None]:
# Fit a StandardScaler on the training features only.
# NOTE(review): no scaler.transform(...) call appears in the visible cells, so
# the networks below are trained and scored on unscaled features — confirm
# whether the scaled data was meant to be used.
scaler = StandardScaler()
scaler.fit(train_x)

In [None]:
# Nonlinear network: logistic (sigmoid) activations, fitted on all stacked years.
NN = MLPRegressor(solver='adam', activation='logistic', max_iter=10000)
NN.fit(X=df_factors, y=df_labels)

In [None]:
# Linear counterpart: identity activations, same data, scored on the 2016 frame
# that was left out of df_factors / df_labels.
NN_linear = MLPRegressor(solver='adam', activation='identity', max_iter=10000)
NN_linear.fit(X=df_factors, y=df_labels)
NN_linear.score(X=df_test_features_2016, y=df_train_labels_2016)

In [None]:
# Score the nonlinear network on 2016, then keep its 2016 predictions in a
# frame that shares the row index of the 2016 labels.
print(NN.score(X=df_test_features_2016, y=df_train_labels_2016))
predicted = NN.predict(df_test_features_2016)
p = pd.DataFrame(predicted, index=df_train_labels_2016.index)

In [None]:
# Put the 2016 predictions next to the observed 2016 values.
df_health = pd.concat([p, df_train_labels_2016.to_frame()], axis=1)
print(df_health)
df_health.columns = ['predicted', 'actual']


def _health_label(rate):
    """Return 'Above', 'Average' or 'Below' for one value.

    Values under the 0.33 quantile are 'Above', values under the 0.66 quantile
    are 'Average', everything else (including NaN) is 'Below'.
    """
    # NOTE(review): the cut-offs are the 2013 quantiles although the values
    # being classified are from 2016 — confirm this is intended.
    if rate < above_2013:
        return 'Above'
    if rate < average_2013:
        return 'Average'
    return 'Below'


# Built column-by-column with dedicated names so the global `p` (the prediction
# frame) is not overwritten by a scalar, which would break any later cell that
# still needs it. Columns are looked up by label rather than by position.
df_health_classified = pd.DataFrame(
    {
        column: [_health_label(rate) for rate in df_health[column]]
        for column in ['predicted', 'actual']
    },
    index=list(df_health.index),
    columns=['predicted', 'actual'],
    dtype=object,
)
df_health_classified

In [None]:
# Put the 2016 predictions next to the observed 2016 values.
# The prediction frame is rebuilt from `predicted`, so this cell does not
# depend on `p` still holding a DataFrame when it runs.
predicted_2016 = pd.DataFrame(predicted, index=df_train_labels_2016.index)
df_health = pd.concat([predicted_2016, df_train_labels_2016.to_frame()], axis=1)
df_health.columns = ['predicted', 'actual']


def _health_code(rate):
    """Return 1, 2 or 3 for one value.

    Values under the 0.33 quantile map to 1, values under the 0.66 quantile
    map to 2, everything else (including NaN) maps to 3.
    """
    # NOTE(review): the cut-offs are the 2013 quantiles although the values
    # being classified are from 2016 — confirm this is intended.
    if rate < above_2013:
        return 1
    if rate < average_2013:
        return 2
    return 3


# Dedicated names are used throughout so no global (in particular `p`) is
# overwritten by a scalar; columns are looked up by label, not by position.
df_health_classified = pd.DataFrame(
    {
        column: [_health_code(rate) for rate in df_health[column]]
        for column in ['predicted', 'actual']
    },
    index=list(df_health.index),
    columns=['predicted', 'actual'],
    dtype=object,
)

In [None]:
# Name the row index so it can be matched against the `county` column later.
df_health_classified.index.name = 'county'

df_health_classified

In [None]:
# Sample table from the plotly datasets repository, filtered to Texas rows.
# (Neither frame is read again in the cells visible here.)
df_sample = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv')
df_sample_r = df_sample.loc[df_sample['STNAME'] == 'Texas']

# Attach a FIPS code to every classified county by matching on the county name.
counties = pd.read_csv('datasets/after/county.csv')
counties.columns = ['i', 'FIPS', 'county']
vis = df_health_classified.merge(counties, on="county")

values1, values2 = vis['predicted'], vis['actual']

# Four evenly spaced bin edges from the smallest to the largest class value.
endpts1 = list(np.mgrid[min(values1):max(values1):4j])
endpts2 = list(np.mgrid[min(values2):max(values2):4j])

#colorscale = ["#030512","#1d1d3b","#323268","#3d4b94","#3e6ab0","#4989bc","#60a7c7","#85c5d3","#b7e0e4","#eafcfd"]
#colorscale = ["#f7fbff", "#ebf3fb", "#d2e3f3", "#c6dbef", "#b3d2e9", "#9ecae1", "#85bcdb", "#6baed6", "#57a0ce", "#3082be", "#2171b5", "#1361a9","#08519c", "#0b4083", "#08306b"]
colorscale = ["lightyellow","orange","orange","red","darkred"]

fips = vis['FIPS']

# County map of the predicted classes.
fig = ff.create_choropleth(
    fips=fips,
    values=values1,
    scope=['Texas'],
    colorscale=colorscale,
    binning_endpoints=endpts1,
    plot_bgcolor='rgb(100,100,100)',
    paper_bgcolor='rgb(229,229,229)',
    county_outline={'color': 'rgb(255,255,255)', 'width': 0.75},
)
fig.show()

In [None]:
# Copy the feature table and replace column number x with a random
# permutation of its own values.
x = 0
df_temp = df_factors.copy()
df_shuffled = np.random.permutation(df_factors.iloc[:, x].values)
df_temp.iloc[:, x] = df_shuffled

In [None]:
# Re-split with a 20% hold-out and fit a StandardScaler on the training part.
# StandardScaler standardises each feature to zero mean and unit variance;
# fitting only learns those statistics. No transform call appears in the
# visible cells, so the data itself is left unscaled here.
train_x, test_x, train_y, test_y = train_test_split(
    df_factors,
    df_labels,
    test_size=0.2,
    random_state=10,
)
scaler = StandardScaler()
scaler.fit(train_x)

In [None]:
# Column-ablation experiment.
# For each of the 48 feature columns in turn, the column is overwritten with
# its own mean (removing the information it carries), and N_RUNS networks are
# trained and scored on the held-out 10%. A final batch of N_RUNS networks is
# trained on the untouched features as the baseline.
# `total` ends up with 49 * N_RUNS scores: 48 ablated configurations followed
# by the baseline, N_RUNS consecutive scores per configuration.
N_RUNS = 500  # repetitions per configuration; the later reshape of `total` uses the same width

total = []
for x in range(0, 48):
    average = df_factors.iloc[:,x].mean()
    df_temp = df_factors.copy()
    df_temp.iloc[:,x] = average
    # The split uses a fixed random_state, so it is identical on every
    # repetition; compute it once per configuration instead of N_RUNS times.
    train_x, test_x, train_y, test_y = train_test_split(df_temp,df_labels,test_size=0.1, random_state=10)
    for i in range(0, N_RUNS):
        # Only the network's random initialisation differs between repetitions.
        NN = MLPRegressor(activation = 'logistic', solver = 'adam', max_iter = 10000)
        NN.fit(train_x,train_y)
        total.append(NN.score(test_x, test_y))

# Baseline: all columns left as they are.
train_x, test_x, train_y, test_y = train_test_split(df_factors,df_labels,test_size=0.1, random_state=10)
for i in range(0, N_RUNS):
    NN = MLPRegressor(activation = 'logistic', solver = 'adam', max_iter = 10000)
    NN.fit(train_x,train_y)
    total.append(NN.score(test_x, test_y))

In [None]:
# One row per configuration, one column per repetition (500 scores per row).
run_scores = pd.DataFrame(np.array(total).reshape(-1,500))

# Both summary columns are computed from the raw scores only. Computing the
# standard deviation after the 'average' column has been added would count the
# mean as an extra observation and shrink the result.
df_test_Results = run_scores.copy()
df_test_Results['average'] = run_scores.mean(axis=1)
df_test_Results['STDev'] = run_scores.std(axis=1)

In [None]:
# Bar labels: "Col1" .. "Col48" followed by "Complete", plus one integer
# position per label.
x_axis = ["Col{0}".format(column_number) for column_number in range(1, 49)] + ["Complete"]
x_pos = np.arange(len(x_axis))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One bar position per label in x_axis. A two-element list such as [0, 2]
# cannot be paired with the full set of labels and result rows in the bar plot.
x_pos = np.arange(len(x_axis))

In [None]:
# Bar chart of the mean score per configuration, with the standard deviation
# across repetitions as error bars.
fig, ax = plt.subplots()

# Positions are derived from the results table itself so that positions,
# heights and error bars always have the same length, whatever x_pos holds.
bar_positions = np.arange(len(df_test_Results))
ax.bar(bar_positions, df_test_Results.loc[:,"average"], yerr=df_test_Results.loc[:,"STDev"])#, align='center',alpha=0.5, ecolor='black', capsize=2)
ax.set_ylabel('Mean Score')
ax.set_xticks(bar_positions)
# Rotate the labels so the many column names do not overlap.
ax.set_xticklabels(x_axis, rotation=90)
ax.set_title('Computing Average')
ax.yaxis.grid(True)

# Save the figure and show
plt.tight_layout()
plt.show()

No change
.73
0
0.7108650254858128
1
0.7960634235499481
2
0.8294445453849859
3
0.7131161077022256
4
0.7649890226507019
5
0.7182807190899667
6
0.770573975108926
7
0.7678379400275417
8
0.7394518440306926
9
0.7370813452666014
10
0.7469239702406627
11
0.6987987125400841
12
0.8031039579147157
13
0.7597713988573509
14
0.7407542084947817
15
0.7551527617051675
16
0.7032623511716416
17
0.7468654275975656
18
0.7269607583883346
19
0.7239341835447677
20
0.7063222378120773
21
0.7515546917564759
22
0.7811960516713465
23
0.7875212552563348
24
0.7701103826676152
25
0.8098780049975205
26
0.7555997194214767
27
0.6816149802794327
28
0.7656032470712896
29
0.7524800447927893
30
0.7565911712933635
31
0.6411801245922919
32
0.6958630774881265
33
0.707291988941758
34
0.7003487106236796
35
0.7880963245934186
36
0.6818116911936285
37
0.6882282528159364
38
0.755357675227565
39
0.7848156225992295
40
0.6928275829955649
41
0.7067307441389035
42
0.7815071724434892
43
0.673067085455601
44
0.7333788145720775
45
0.6352091856558159
46
0.6732250725388015
47
0.7716650884613394

In [None]:
# Persist the per-configuration scores and their summary columns.
df_test_Results.to_csv(path_or_buf="totals")

In [None]:
# Reduced feature set: every column position 0..47 except the excluded ones.
excluded_positions = {13, 20, 22, 24, 25, 28, 32}
df_best_factors = df_factors.iloc[
    :, [position for position in range(48) if position not in excluded_positions]
]

In [None]:
# Mean held-out score of 500 networks trained on the reduced feature set.
# Scores are collected in their own list: `total` holds a list of scores from
# the ablation experiment, so adding a float to it would raise a TypeError
# (and resetting it would destroy those results).
# The split uses a fixed random_state, so it is computed once outside the loop.
train_x, test_x, train_y, test_y = train_test_split(df_best_factors,df_labels,test_size=0.1, random_state=10)

best_factor_scores = []
for i in range(0,500):
    # Only the network's random initialisation differs between repetitions.
    NN = MLPRegressor(activation = 'logistic', solver = 'adam', max_iter = 10000)
    NN.fit(train_x,train_y)
    best_factor_scores.append(NN.score(test_x, test_y))
print(np.mean(best_factor_scores))

In [None]:
# Single run with ReLU activations on a 20% hold-out of the full feature set.
train_x, test_x, train_y, test_y = train_test_split(
    df_factors,
    df_labels,
    test_size=0.2,
    random_state=10,
)
NN = MLPRegressor(solver='adam', activation='relu', max_iter=10000)
NN.fit(X=train_x, y=train_y)
NN.score(X=test_x, y=test_y)

0
0.7477366611943289
1
0.7450918872678572
2
0.7365371215672839
3
0.7353415290661098
4
0.7157993397891446
5
0.7414025627768244
6
0.7432139691857386
7
0.7299629155090428
8
0.7430509940365544
9
0.7431590397923593
10
0.7379941614264255
11
0.7183108497078422
12
0.7359672553310749
13
0.7515365648653871
14
0.7327615548058445
15
0.7367755194065221
16
0.7122104338543712
17
0.7350669415210602
18
0.7351774821246562
19
0.7211025993383763
20
0.7580289523924054
21
0.7279603370575597
22
0.7554106271592013
23
0.7333225437211993
24
0.7618337793902102
25
0.7522192641169432
26
0.7212447362368076
27
0.7132314167375634
28
0.7522934951601369
29
0.7494191251460175
30
0.7270102273600064
31
0.7442020408419321
32
0.750297162703244
33
0.7004031627725568
34
0.7248352585915442
35
0.740584523382293
36
0.7282683443766068
37
0.73456833177976
38
0.7349586841533234
39
0.7307063595104687
40
0.7469908871969103
41
0.7368946729741306
42
0.7465210991017219
43
0.7188909451460147
44
0.7382468534620368
45
0.7348314689406404
46
0.7237537620249058
47
0.7447635388268783
0.7486788908834031