In [146]:
#library and data imports
import geopandas as gp
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import plotly
import plotly.figure_factory as ff
import seaborn as sns
import shapefile
import shapely
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Read the two CSV inputs used by this notebook.
demographics_data = pd.read_csv('demographics_test.csv')
merged_train = pd.read_csv('merged_train.csv')

# Predictor columns. Their order matters: the regression cells below select
# predictors by position in the scaled array (e.g. 0 = 'Total Population',
# 8 = 'Median Household Income').
predictor_columns = [
    'Total Population',
    'Percent White, not Hispanic or Latino',
    'Percent Black, not Hispanic or Latino',
    'Percent Hispanic or Latino',
    'Percent Foreign Born',
    'Percent Female',
    'Percent Age 29 and Under',
    'Percent Age 65 and Older',
    'Median Household Income',
    'Percent Unemployed',
    'Percent Less than High School Degree',
    'Percent Less than Bachelor\'s Degree',
    'Percent Rural',
]

# Regression targets: vote counts for each party.
target_columns = ['Democratic', 'Republican']

X = merged_train[predictor_columns]
Y = merged_train[target_columns]

In [147]:
# Split predictors and targets into a 75% training part and a 25% held-out part.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=.75,
    test_size=0.25,
    random_state=0,
)

In [148]:
# Standardize the predictors. The scaler is fitted on the training rows only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit(x_train).transform(x_train)
x_test_scaled = scaler.transform(x_test)

**3. Build a linear regression model to predict the number of votes cast for the Democratic party in each county. Consider multiple combinations of predictor variables. Compute evaluation metrics for the validation set and report your results.**

In [149]:
# Simple linear regression: predict Democratic votes from 'Total Population'
# (column 0 of the scaled predictor array).

# Number of observations in the training set; reused by the following cells.
# NOTE(review): R_squared below is computed from the held-out predictions, while
# n is the training-set size -- confirm this is the intended n for the adjustment.
n = len(x_train)

# Single predictor, reshaped to the 2-D (rows, 1) layout the estimator expects.
train_features = x_train_scaled[:, 0].reshape(-1, 1)
test_features = x_test_scaled[:, 0].reshape(-1, 1)

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Democratic'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Democratic'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2 with one predictor.
num_predictors = 1
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[74711.50206856] 27569.373883928572
0.9435784812901373


In [150]:
# Simple linear regression: predict Democratic votes from
# 'Percent Less than High School Degree' (column 10 of the scaled predictor array).

predictor_idx = 10
num_predictors = 1

# Single predictor, reshaped to the 2-D (rows, 1) layout the estimator expects.
train_features = x_train_scaled[:, predictor_idx].reshape(-1, 1)
test_features = x_test_scaled[:, predictor_idx].reshape(-1, 1)

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Democratic'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Democratic'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[-8137.73810376] 27569.373883928572
0.02164134183638411


In [151]:
# Multiple linear regression: predict Democratic votes from 'Total Population'
# and 'Median Household Income' (columns 0 and 8 of the scaled predictor array).

predictor_cols = [0, 8]
num_predictors = len(predictor_cols)

train_features = x_train_scaled[:, predictor_cols]
test_features = x_test_scaled[:, predictor_cols]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Democratic'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Democratic'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[73067.37334453  6279.76422366] 27569.37388392857
0.939201701208693


In [152]:
# Multiple linear regression: predict Democratic votes from 'Total Population',
# 'Percent White, not Hispanic or Latino', 'Median Household Income' and
# 'Percent Less than Bachelor's Degree' (columns 0, 1, 8 and 11 of the scaled array).

predictor_cols = [0, 1, 8, 11]
num_predictors = len(predictor_cols)

train_features = x_train_scaled[:, predictor_cols]
test_features = x_test_scaled[:, predictor_cols]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Democratic'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Democratic'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[71012.84796525  -345.05366382  1157.04687807 -8608.17042826] 27569.37388392857
0.9474994738338725


In [153]:
# Multiple linear regression: predict Democratic votes from every predictor column.

# One predictor per column of the (unscaled) training frame.
num_predictors = x_train.shape[1]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=x_train_scaled, y=y_train['Democratic'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(x_test_scaled)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Democratic'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[ 69224.38708039  -3209.1591268   -1023.23488454  -6931.14708179
   3973.74580741    194.19056985  -5299.5676761   -1853.22320472
   1471.25963216   1467.0213699    4037.7699931  -10519.02638282
   -158.13004477] 27569.37388392856
0.932860992564198


**What is the best performing linear regression model? What is the performance of the model? How did you select the variables
of the model?**

**Answer:** The best-performing linear regression model is the multiple linear regression model using "Population", "Median Household Income", "Percent White, not Hispanic or Latino", and "Percent Less than Bachelor's Degree" as predictors. The model performs well with these four predictors, with an adjusted R-squared value of 0.947. The selection of variables is consistent with the Project 1 conclusions and with the present analysis: as shown above, the adjusted R-squared value decreases (to 0.933) when all variables are used as predictors.

**Build a linear regression model to predict the number of votes cast for the Republican party in each county. Consider multiple combinations of predictor variables. Compute evaluation metrics for the validation set and report your results.**

In [154]:
# Simple linear regression: predict Republican votes from 'Total Population'
# (column 0 of the scaled predictor array).

# Number of observations in the training set; reused by the following cells.
# NOTE(review): R_squared below is computed from the held-out predictions, while
# n is the training-set size -- confirm this is the intended n for the adjustment.
n = len(x_train)

# Single predictor, reshaped to the 2-D (rows, 1) layout the estimator expects.
train_features = x_train_scaled[:, 0].reshape(-1, 1)
test_features = x_test_scaled[:, 0].reshape(-1, 1)

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Republican'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Republican'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2 with one predictor.
num_predictors = 1
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)


[45306.87897032] 21546.910714285714
0.6714797544800217


In [155]:
# Simple linear regression: predict Republican votes from
# 'Percent Less than High School Degree' (column 10 of the scaled predictor array).

predictor_idx = 10
num_predictors = 1

# Single predictor, reshaped to the 2-D (rows, 1) layout the estimator expects.
train_features = x_train_scaled[:, predictor_idx].reshape(-1, 1)
test_features = x_test_scaled[:, predictor_idx].reshape(-1, 1)

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Republican'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Republican'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)


[-6381.7748349] 21546.910714285714
0.03485762203356779


In [156]:
# Multiple linear regression: predict Republican votes from 'Total Population'
# and 'Median Household Income' (columns 0 and 8 of the scaled predictor array).

predictor_cols = [0, 8]
num_predictors = len(predictor_cols)

train_features = x_train_scaled[:, predictor_cols]
test_features = x_test_scaled[:, predictor_cols]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Republican'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Republican'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[44042.16950014  4830.56902305] 21546.91071428571
0.6834161715428404


In [157]:
# Multiple linear regression: predict Republican votes from 'Total Population',
# 'Percent White, not Hispanic or Latino', 'Median Household Income' and
# 'Percent Less than Bachelor's Degree' (columns 0, 1, 8 and 11 of the scaled array).

predictor_cols = [0, 1, 8, 11]
num_predictors = len(predictor_cols)

train_features = x_train_scaled[:, predictor_cols]
test_features = x_test_scaled[:, predictor_cols]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=train_features, y=y_train['Republican'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(test_features)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Republican'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[44609.62027579  3068.87458444  3337.02252553 -2140.80688346] 21546.910714285714
0.682364141897545


In [158]:
# Multiple linear regression: predict Republican votes from every predictor column.

# One predictor per column of the (unscaled) training frame.
num_predictors = x_train.shape[1]

model = linear_model.LinearRegression()
fitted_model = model.fit(X=x_train_scaled, y=y_train['Republican'])
print(model.coef_ , model.intercept_)

predicted = fitted_model.predict(x_test_scaled)

# R^2 is taken as the squared correlation between predicted and actual votes.
corr_coef = numpy.corrcoef(predicted, y_test['Republican'])[1, 0]
R_squared = corr_coef ** 2

# Adjusted R^2; n is defined in an earlier cell.
adj_R_squared = 1 - ((1 - R_squared) * (n - 1) / (n - num_predictors - 1))
print(adj_R_squared)

[45467.5097118   1769.95034533 -3141.4206375   1167.17323402
 -6463.65917143 -1121.73432851  -955.67013341  2580.74056065
  5910.97457236  2037.10575397  3530.42010898 -3156.11275644
 -5992.05181735] 21546.910714285706
0.7198319563310677


**What is the best performing linear regression model? What is the performance of the model? How did you select the variables
of the model?**

**Answer:** The best-performing linear regression model for predicting Republican votes is the multiple linear regression model using all variables as predictors. The model does not perform especially well: its adjusted R-squared value of 0.720 is the highest among the models tried.
All the variables are selected for the model because this combination gives the best adjusted R-squared value.