<a href="https://colab.research.google.com/github/ellenwterry/PoliticalAnalysis/blob/main/Reg_NE_and_LogReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import patsy
from sklearn.linear_model import LogisticRegression

import random
!pip install nest-asyncio
import nest_asyncio
nest_asyncio.apply()


!pip install pystan==3.7.0
#!pip install pystan
!pip install corner
import stan

import plotly.express as px
import plotly.graph_objects as go

!pip install geopy
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
!pip install pygris
# import matplotlib.pyplot as plt
from pygris import core_based_statistical_areas
from pygris import tracts

from google.colab import files


import geopandas as gpd
import folium
# from google.colab import files


Get Data

In [2]:
# ---------- Get Data from Github ---------- #

url = 'https://raw.githubusercontent.com/ellenwterry/PoliticalAnalysis/main/VoteBase.csv'
VoteBase = pd.read_csv(url)

Tidy Data

In [3]:
# ---------- Clean up data ---------- #

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(VoteBase['Sex'])

codes = {'NR':0, 'M': 1, 'F': 2}
VoteBase['Sex'] = VoteBase['Sex'].map(codes)

VoteBase['Age']=VoteBase.Age.astype('int32')

#VoteBase['LastPrimary'] = le.transform(VoteBase['LastPrimary'])
codes = {'NR':0, 'R': 1, 'D':2}
VoteBase['LastPrimary'] = VoteBase['LastPrimary'].map(codes)

#VoteBase['Education'] = le.transform(VoteBase['Education'])
codes = {'NR':0, 'HS': 1, 'Some College':2, 'Bachelor':3, 'Masters':4, 'Doctorate':5}
VoteBase['Education'] = VoteBase['Education'].map(codes)

#VoteBase['HHIncome'] = le.transform(VoteBase['HHIncome'])
codes = {'NR':0, 'Under 50k': 1, '50k-100k':2, '100k-200k':3, '200k-300k':4, '300k-500k':5, 'Over 500k':6}
VoteBase['HHIncome'] = VoteBase['HHIncome'].map(codes)

#VoteBase['ReligiousAffil'] = le.transform(VoteBase['ReligiousAffil'])
codes = {'NR':0,'Protestant': 1, 'Catholic':2, 'Other':3, 'None':4}
VoteBase['ReligiousAffil'] = VoteBase['ReligiousAffil'].map(codes)

#VoteBase['Support24'] = le.transform(VoteBase['Support24'])
codes = {'R':0, 'D': 1}
VoteBase['Support24'] = VoteBase['Support24'].map(codes)
# NOTE: NAs were excluded from sample so that algorithms could score using logistic scale - 2nd pass will use imputed values

#VoteBase['TopIssue'] = le.transform(VoteBase['TopIssue'])
codes = {'NR':0, 'RFree':1, 'Parents':2, 'Crime':3, 'Economy':4, 'Womens':5, 'Education':6, 'Environment':7, 'Democracy':8}
VoteBase['TopIssue'] = VoteBase['TopIssue'].map(codes)

# This is for the second data source (later)
codes = {'NS':0, 'NR':1,'Signed':2}
VoteBase['RRPetition'] = VoteBase['RRPetition'].map(codes)

In [4]:
random.seed(316)

In [5]:
VoteMatrix = patsy.dmatrix('Age + Sex + Education + HHIncome+ ReligiousAffil + LastPrimary + TopIssue -1', VoteBase)
yArray = np.array(VoteBase['Support24'])
rows = VoteMatrix.shape[0]
tstInd = np.random.randint(0, rows, size=100)
tstMatrix = VoteMatrix[tstInd]
yTst = yArray[tstInd]
trnMatrix = np.delete(VoteMatrix, tstInd, axis=0)
yTrn = np.delete(yArray, tstInd, axis=0)

Split into Training and Test Sets

Train LogReg Model

In [6]:
model = LogisticRegression(solver='sag')
model.fit(trnMatrix, yTrn)
Pred = model.predict(tstMatrix)
Probs = pd.DataFrame(model.predict_proba(tstMatrix))
theta = np.matrix(model.coef_)
intercept = model.intercept_
Probs[1]



0     0.516644
1     0.960909
2     0.776707
3     0.379402
4     0.744231
        ...   
95    0.896303
96    0.057724
97    0.742926
98    0.445935
99    0.893910
Name: 1, Length: 100, dtype: float64

In [7]:
model.coef_

array([[-0.0254904 , -0.52589203,  0.26414565,  0.41742473,  0.98870898,
         0.33794318,  0.0902947 ]])

In [8]:
model.intercept_

array([-1.35214667])

Select Test Samples and Confirm using Equation.

In [9]:
def stable_sigmoid(x):
  # Using np.where to avoid numerical overflow or underflow.
  return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))

# This logreg algorithm breaks out the bias separately for analysis with multinomials.

Prob2 = stable_sigmoid((np.dot(model.coef_,tstMatrix[0].transpose())+ model.intercept_).transpose())
Prob2

array([0.51664357])

In [10]:
pd.DataFrame(model.predict_proba(tstMatrix))[1]

0     0.516644
1     0.960909
2     0.776707
3     0.379402
4     0.744231
        ...   
95    0.896303
96    0.057724
97    0.742926
98    0.445935
99    0.893910
Name: 1, Length: 100, dtype: float64

In [19]:
tstDF = VoteBase.iloc[tstInd,:]
tstDF['tst'] = Probs[1].tolist()
tstDF

Unnamed: 0,ID,Name,Sex,Age,LastPrimary,Latitude,Longitude,Education,HHIncome,ReligiousAffil,Support24,TopIssue,RRPetition,tst
2272,2273,Voter 2273,1,32,0,41.04367,-73.59345,4,3,0,1,5,0,0.516644
1959,1960,Voter 1960,1,42,0,41.06202,-73.61314,3,4,3,1,8,0,0.960909
1755,1756,Voter 1756,1,101,0,41.06842,-73.60737,3,4,3,0,3,1,0.776707
1004,1005,Voter 1005,2,44,0,41.04334,-73.57968,3,3,1,0,0,0,0.379402
1817,1818,Voter 1818,1,81,0,41.06875,-73.63166,3,3,3,1,0,0,0.744231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1589,1590,Voter 1590,1,56,0,41.04332,-73.59781,3,3,3,1,5,0,0.896303
744,745,Voter 745,2,48,0,41.00856,-73.64902,0,2,0,0,0,2,0.057724
1527,1528,Voter 1528,2,71,0,41.01571,-73.65709,4,3,3,1,0,0,0.742926
1121,1122,Voter 1122,2,72,0,41.00559,-73.63517,3,3,2,0,0,0,0.445935


In [23]:
tstDF.loc[tstDF['ID'] == 1005]

Unnamed: 0,ID,Name,Sex,Age,LastPrimary,Latitude,Longitude,Education,HHIncome,ReligiousAffil,Support24,TopIssue,RRPetition,tst
1004,1005,Voter 1005,2,44,0,41.04334,-73.57968,3,3,1,0,0,0,0.379402


In [41]:
# What happens if we can move Voter 1 to a TopIssue of 5

#tstDF2 = tstDF.loc[tstDF['ID'] == 1005]
#tstDF2
#ProbNew = stable_sigmoid((np.dot(model.coef_,tstMatrix[38].transpose())+ model.intercept_).transpose())

#ProbNew = stable_sigmoid((np.dot(model.coef_,tstMatrix[38].transpose())+ model.intercept_).transpose())
#ProbNew


Unnamed: 0,ID,Name,Sex,Age,LastPrimary,Latitude,Longitude,Education,HHIncome,ReligiousAffil,Support24,TopIssue,RRPetition,tst
1004,1005,Voter 1005,2,44,0,41.04334,-73.57968,3,3,1,0,0,0,0.379402


In [49]:
#tstDF2[0, 12] = 5
#tstDF2
tstMatrix[3,6] = 5
tstMatrix[3]

array([44.,  2.,  3.,  3.,  1.,  0.,  5.])

In [51]:
Prob3 = stable_sigmoid((np.dot(model.coef_,tstMatrix[3].transpose())+ model.intercept_).transpose())
Prob3

array([0.48984774])

In [62]:
ProbAll = stable_sigmoid((np.dot(model.coef_,VoteMatrix.transpose())+ model.intercept_).transpose())
VoteBase['Prob'] = ProbAll.astype(float)
VoteBase


Unnamed: 0,ID,Name,Sex,Age,LastPrimary,Latitude,Longitude,Education,HHIncome,ReligiousAffil,Support24,TopIssue,RRPetition,Probs,Prob
0,1,Voter 1,1,51,0,41.00544,-73.62954,2,3,1,0,0,0,[0.39920600939590173],0.399206
1,2,Voter 2,1,52,0,41.04027,-73.63011,2,2,1,0,0,0,[0.29907853911319315],0.299079
2,3,Voter 3,2,54,0,41.05734,-73.65044,3,6,1,1,4,0,[0.7040087547478763],0.704009
3,4,Voter 4,1,59,0,41.03255,-73.60403,2,3,1,0,0,0,[0.3514440662815238],0.351444
4,5,Voter 5,1,36,0,41.06684,-73.57837,2,3,4,0,0,0,[0.9497748188376445],0.949775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2493,2494,Voter 2494,2,62,0,41.07108,-73.63842,4,3,4,1,5,1,[0.9388215737028786],0.938822
2494,2495,Voter 2495,2,62,2,41.03758,-73.57932,3,3,0,1,5,1,[0.3074081763213841],0.307408
2495,2496,Voter 2496,2,65,2,41.01605,-73.58949,3,6,1,1,0,1,[0.7111085548797187],0.711109
2496,2497,Voter 2497,1,67,2,40.99031,-73.65605,0,6,0,1,0,1,[0.40000108195626544],0.400001


In [63]:
VoteBase['Prob']

0       0.399206
1       0.299079
2       0.704009
3       0.351444
4       0.949775
          ...   
2493    0.938822
2494    0.307408
2495    0.711109
2496    0.400001
2497    0.878400
Name: Prob, Length: 2498, dtype: float64

In [64]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=VoteBase['Prob'], marker_color='#378796', opacity=0.05, name = "Vote Prob"))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.9, nbinsx=100)
fig.update_layout(
    autosize=False,
    width=800,
    height=400,
    plot_bgcolor = "white",
    xaxis=dict(title='Probability of Vote',),
)

fig.show()

In [None]:
theta

matrix([[-1.07528012, -0.01813151, -0.43072415,  0.27272075,  0.48073512,
          0.97630673,  0.34951638,  0.09339288]])

In [None]:
color_map = {
    1: "#5218fa"
}

fig2 = go.Figure()
fig2.add_trace(go.Histogram(x=VoteBase['Prob'], marker_color=VoteBase["TopIssue"].map(color_map)))
fig2.update_traces(opacity=0.9, nbinsx=100)
fig2.update_layout(
    autosize=False,
    width=800,
    height=400,
    plot_bgcolor = "white",
    xaxis=dict(title='Probability of Vote',),
)

fig2.show()