In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
import pymongo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

# Import data from MONGODB

In [2]:
# Initialize PyMongo and establish MongoDB connection
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Import census data from MongoDB
db = client.census_DB
results = client.census_DB.census.find()
cluster_data = []
for result in results:
    del result['_id']
    cluster_data.append(result)

In [4]:
cleaned_census_data = pd.DataFrame(cluster_data)
cleaned_census_data

Unnamed: 0,year,state,state_po,county_name,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Winner
0,2012,MISSOURI,MO,LINN,12668.0,43.0,39028.0,20968.0,13.245974,Republican
1,2012,MISSOURI,MO,HOWELL,40330.0,39.6,34148.0,17763.0,20.300025,Republican
2,2012,MISSOURI,MO,JOHNSON,52964.0,29.7,47960.0,21375.0,15.246205,Republican
3,2012,MISSOURI,MO,LACLEDE,35507.0,39.1,39101.0,19788.0,18.244290,Republican
4,2012,MISSOURI,MO,MARIES,9140.0,43.1,44885.0,21883.0,14.070022,Republican
...,...,...,...,...,...,...,...,...,...,...
9002,2020,MINNESOTA,MN,RENVILLE,14572.0,44.0,58542.0,31243.0,9.422180,Republican
9003,2020,MINNESOTA,MN,ROSEAU,15259.0,41.6,62304.0,31452.0,7.425126,Republican
9004,2020,MINNESOTA,MN,SHERBURNE,96015.0,36.1,88671.0,36022.0,5.158569,Republican
9005,2020,MINNESOTA,MN,STEELE,36710.0,39.2,68172.0,34648.0,7.864342,Republican


# Pre-process data for use in SVM Model

In [5]:
# Create numeric value for year and add as 'Years Since' column
cleaned_census_data['years since'] = cleaned_census_data['year'].map({2012:11, 2016: 7, 2020: 3})
cleaned_census_data.head()

Unnamed: 0,year,state,state_po,county_name,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Winner,years since
0,2012,MISSOURI,MO,LINN,12668.0,43.0,39028.0,20968.0,13.245974,Republican,11
1,2012,MISSOURI,MO,HOWELL,40330.0,39.6,34148.0,17763.0,20.300025,Republican,11
2,2012,MISSOURI,MO,JOHNSON,52964.0,29.7,47960.0,21375.0,15.246205,Republican,11
3,2012,MISSOURI,MO,LACLEDE,35507.0,39.1,39101.0,19788.0,18.24429,Republican,11
4,2012,MISSOURI,MO,MARIES,9140.0,43.1,44885.0,21883.0,14.070022,Republican,11


In [6]:
# Split data set to use for model training / testing the remianig data will be used to predict 
shaped_data = cleaned_census_data[(cleaned_census_data['years since'] == 11) | (cleaned_census_data['years since'] == 7)]
shaped_data = shaped_data.drop(columns=['year','state_po'],axis=1)
shaped_data.head()

Unnamed: 0,state,county_name,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Winner,years since
0,MISSOURI,LINN,12668.0,43.0,39028.0,20968.0,13.245974,Republican,11
1,MISSOURI,HOWELL,40330.0,39.6,34148.0,17763.0,20.300025,Republican,11
2,MISSOURI,JOHNSON,52964.0,29.7,47960.0,21375.0,15.246205,Republican,11
3,MISSOURI,LACLEDE,35507.0,39.1,39101.0,19788.0,18.24429,Republican,11
4,MISSOURI,MARIES,9140.0,43.1,44885.0,21883.0,14.070022,Republican,11


In [7]:
# Identify the columns with object/string type variables to one hot encode
svm_data_cat = shaped_data.dtypes[shaped_data.dtypes == "object"].index.tolist()
shaped_data[svm_data_cat].nunique()

state            48
county_name    1739
Winner            2
dtype: int64

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(shaped_data[svm_data_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(svm_data_cat)
encode_df.head()



Unnamed: 0,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,state_COLORADO,state_CONNECTICUT,state_DELAWARE,state_FLORIDA,state_GEORGIA,state_HAWAII,...,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH,Winner_Democrat,Winner_Republican
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Create SVM Model

In [9]:
# Merge one-hot encoded features and drop the originals
shaped_data = shaped_data.merge(encode_df,left_index=True, right_index=True)
svm_data = shaped_data.drop(svm_data_cat,1)
svm_data.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,years since,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,...,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH,Winner_Democrat,Winner_Republican
0,12668.0,43.0,39028.0,20968.0,13.245974,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40330.0,39.6,34148.0,17763.0,20.300025,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,52964.0,29.7,47960.0,21375.0,15.246205,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,35507.0,39.1,39101.0,19788.0,18.24429,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,9140.0,43.1,44885.0,21883.0,14.070022,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Set target and target names for the SVM model
target = svm_data['Winner_Republican']
target_names = ['Democrat','Republican']

In [11]:
# Set features for the SVM model
data = svm_data.drop(columns=['Winner_Democrat','Winner_Republican'])
feature_names = data.columns
data.head()

Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,years since,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,...,county_name_YELLOWSTONE,county_name_YOAKUM,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH
0,12668.0,43.0,39028.0,20968.0,13.245974,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40330.0,39.6,34148.0,17763.0,20.300025,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52964.0,29.7,47960.0,21375.0,15.246205,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35507.0,39.1,39101.0,19788.0,18.24429,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9140.0,43.1,44885.0,21883.0,14.070022,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Split the data into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=10)

In [13]:
# Support vector machine with rbf classifier
from sklearn.svm import SVC 
model = SVC(kernel='rbf')
model.fit(X_train, y_train)

SVC()

In [14]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.817


In [15]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    Democrat       0.70      0.20      0.31       311
  Republican       0.82      0.98      0.89      1191

    accuracy                           0.82      1502
   macro avg       0.76      0.59      0.60      1502
weighted avg       0.80      0.82      0.77      1502



# Use the Model to predict an outcome using new dataset

In [16]:
# Get new data 2020 data from the data set
test = cleaned_census_data[cleaned_census_data['years since'] == 3]
test.head()

Unnamed: 0,year,state,state_po,county_name,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Winner,years since
6006,2020,ALABAMA,AL,AUTAUGA,55639.0,38.6,57982.0,29804.0,15.08654,Republican,3
6007,2020,ALABAMA,AL,BALDWIN,218289.0,43.2,61756.0,33751.0,9.042599,Republican,3
6008,2020,ALABAMA,AL,BARBOUR,25026.0,40.1,34990.0,20074.0,25.221769,Republican,3
6009,2020,ALABAMA,AL,BIBB,22374.0,39.9,51721.0,22626.0,16.782873,Republican,3
6010,2020,ALABAMA,AL,BLOUNT,57755.0,41.0,48922.0,25457.0,13.586702,Republican,3


In [17]:
# Pre-process new data for prediction  
test = test.drop(columns=['year','state_po'],axis=1)
svm_predict_cat = test.dtypes[test.dtypes == "object"].index.tolist()
test[svm_predict_cat].nunique()

state            48
county_name    1738
Winner            2
dtype: int64

In [18]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(test[svm_predict_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(svm_predict_cat)
encode_df.head()



Unnamed: 0,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,state_COLORADO,state_CONNECTICUT,state_DELAWARE,state_FLORIDA,state_GEORGIA,state_HAWAII,...,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH,Winner_Democrat,Winner_Republican
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# Merge one-hot encoded features and drop the originals
svm_predict = test.merge(encode_df,left_index=True, right_index=True)
svm_predict = shaped_data.drop(svm_predict_cat,1)
svm_predict.tail()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,years since,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,...,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH,Winner_Democrat,Winner_Republican
6001,32829.0,41.9,40417.0,21200.0,18.520211,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6002,77245.0,40.4,62646.0,31170.0,11.217555,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6003,25889.0,46.5,43373.0,25136.0,14.369037,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6004,32021.0,48.3,43082.0,24455.0,13.73786,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6005,284559.0,31.6,46842.0,26083.0,20.784793,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [38]:
# Select all columns and n rows to be predicted by the model 
test_data = svm_predict.iloc[99:200,:]
test_data

Unnamed: 0,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,years since,state_ALABAMA,state_ARIZONA,state_ARKANSAS,state_CALIFORNIA,...,county_name_YOLO,county_name_YORK,county_name_YOUNG,county_name_YUBA,county_name_YUMA,county_name_ZAPATA,county_name_ZAVALA,county_name_ZIEBACH,Winner_Democrat,Winner_Republican
99,74591.0,42.5,46999.0,25779.0,9.919427,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
100,48960.0,38.2,64982.0,28682.0,5.555556,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
101,4486.0,48.5,44652.0,22128.0,13.196612,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
102,166373.0,36.5,55117.0,27012.0,11.880534,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
103,17960.0,37.5,62542.0,26459.0,7.076837,11,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,15674.0,40.4,33481.0,17718.0,23.548552,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
196,12301.0,46.9,35059.0,18243.0,19.120397,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
197,13790.0,37.1,35236.0,18531.0,23.328499,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
198,46388.0,34.6,35651.0,18912.0,23.667759,11,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [39]:
# Capture target info for comparisson with predictions 
target = test_data['Winner_Republican']
target_names = ['Democrat', 'Republican']

In [40]:
# Establish data features to be predicted
test_data = test_data.drop(columns=['Winner_Republican','Winner_Democrat'], axis=1)

In [41]:
# Run the model with ne dataset
predictions = model.predict(test_data)
print(f"predictions {predictions}")
print(f"target {list(target)}")

predictions [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1.]
target [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [42]:
# Create DataFrame to Display the target and predictions 
svm_df = pd.DataFrame(columns=['Target', 'Predictions'])
svm_df['Target'] = list(target) 
svm_df['Predictions'] = predictions
svm_df

Unnamed: 0,Target,Predictions
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,0.0,1.0
4,1.0,1.0
...,...,...
96,1.0,1.0
97,1.0,1.0
98,1.0,1.0
99,1.0,1.0


# SVM END