In [1]:
# Import dependencies 
import pandas as pd
import pymongo
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf


In [2]:
# Import Census CSV data 
census_data = pd.read_csv('./Resources/2016_2020_census_data.csv')
census_data

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year
0,5762,513,39.2,58558,24235.0,50,9.746589,2016
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016
2,5764,3370,41.3,44468,22651.0,323,9.584570,2016
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016
4,5766,598,45.9,52250,33227.0,76,12.709030,2016
...,...,...,...,...,...,...,...,...
66235,16623,552,54.9,51667,26369.0,80,14.492754,2020
66236,16627,2118,44.6,45000,24699.0,324,15.297450,2020
66237,16634,315,46.1,51500,25084.0,24,7.619048,2020
66238,16640,707,48.6,55982,28335.0,167,23.620934,2020


In [3]:
# Perform some data cleansing options Drop NA
census_data.dropna(inplace=True)
census_data

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year
0,5762,513,39.2,58558,24235.0,50,9.746589,2016
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016
2,5764,3370,41.3,44468,22651.0,323,9.584570,2016
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016
4,5766,598,45.9,52250,33227.0,76,12.709030,2016
...,...,...,...,...,...,...,...,...
66235,16623,552,54.9,51667,26369.0,80,14.492754,2020
66236,16627,2118,44.6,45000,24699.0,324,15.297450,2020
66237,16634,315,46.1,51500,25084.0,24,7.619048,2020
66238,16640,707,48.6,55982,28335.0,167,23.620934,2020


In [8]:
cleaned_census_data = census_data[census_data['Household Income']>0]
cleaned_census_data

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year
0,5762,513,39.2,58558,24235.0,50,9.746589,2016
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016
2,5764,3370,41.3,44468,22651.0,323,9.584570,2016
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016
4,5766,598,45.9,52250,33227.0,76,12.709030,2016
...,...,...,...,...,...,...,...,...
66235,16623,552,54.9,51667,26369.0,80,14.492754,2020
66236,16627,2118,44.6,45000,24699.0,324,15.297450,2020
66237,16634,315,46.1,51500,25084.0,24,7.619048,2020
66238,16640,707,48.6,55982,28335.0,167,23.620934,2020


In [11]:
# Convert years to time since current 
cleaned_census_data['Years Since'] = cleaned_census_data['Year'].map({2016: 7, 2020: 3})
cleaned_census_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year,Years Since
0,5762,513,39.2,58558,24235.0,50,9.746589,2016,7
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016,7
2,5764,3370,41.3,44468,22651.0,323,9.584570,2016,7
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016,7
4,5766,598,45.9,52250,33227.0,76,12.709030,2016,7
...,...,...,...,...,...,...,...,...,...
66235,16623,552,54.9,51667,26369.0,80,14.492754,2020,3
66236,16627,2118,44.6,45000,24699.0,324,15.297450,2020,3
66237,16634,315,46.1,51500,25084.0,24,7.619048,2020,3
66238,16640,707,48.6,55982,28335.0,167,23.620934,2020,3


In [13]:
zip_codes = pd.read_csv('./Resources/usazipcode.csv')
zip_codes.columns

Index(['zip', 'Zipcode name', 'City', 'State', 'County Name'], dtype='object')

In [14]:
zip_codes.dropna(inplace=True)
zip_codes['zip'] = zip_codes['zip'].astype(int)
zip_codes.dtypes

zip              int32
Zipcode name    object
City            object
State           object
County Name     object
dtype: object

In [15]:
# Merge the census data and zipcode DF
new_df = pd.merge(census_data, zip_codes, how='left', left_on='Zipcode', right_on = 'zip')
new_df.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year,Years Since,zip,Zipcode name,City,State,County Name
0,5762,513,39.2,58558,24235.0,50,9.746589,2016,7,5762.0,"PITTSFIELD, VT",PITTSFIELD,VT,RUTLAND
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016,7,5763.0,"PITTSFORD, VT",PITTSFORD,VT,RUTLAND
2,5764,3370,41.3,44468,22651.0,323,9.58457,2016,7,5764.0,"POULTNEY, VT",POULTNEY,VT,RUTLAND
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016,7,5765.0,"PROCTOR, VT",PROCTOR,VT,RUTLAND
4,5766,598,45.9,52250,33227.0,76,12.70903,2016,7,5766.0,"RIPTON, VT",RIPTON,VT,ADDISON


In [16]:
# Merge the census and zip data 
census_zip_df = new_df.drop(['zip', 'Zipcode name', 'City', 'County Name'],axis=1)
census_zip_df

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year,Years Since,State
0,5762,513,39.2,58558,24235.0,50,9.746589,2016,7,VT
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016,7,VT
2,5764,3370,41.3,44468,22651.0,323,9.584570,2016,7,VT
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016,7,VT
4,5766,598,45.9,52250,33227.0,76,12.709030,2016,7,VT
...,...,...,...,...,...,...,...,...,...,...
64929,16623,552,54.9,51667,26369.0,80,14.492754,2020,3,PA
64930,16627,2118,44.6,45000,24699.0,324,15.297450,2020,3,PA
64931,16634,315,46.1,51500,25084.0,24,7.619048,2020,3,PA
64932,16640,707,48.6,55982,28335.0,167,23.620934,2020,3,PA


In [17]:
# Import the winner values by state
election_results = pd.read_csv('./Resources/2016_2020_winners.csv')
election_results

Unnamed: 0,STATE,Electoral Vote (D),Electoral Vote (R),Popular Vote (D),Popular Vote (R),Popular Vote All Others,Total Vote,Winner,Year
0,AL,,9.0,849624,1441170,32488,2323282,Republican,2020
1,AK,,3.0,153778,189951,15801,359530,Republican,2020
2,AZ,11.0,,1672143,1661686,53497,3387326,Democrat,2020
3,AR,,6.0,423932,760647,34490,1219069,Republican,2020
4,CA,55.0,,11110639,6006518,384223,17501380,Democrat,2020
...,...,...,...,...,...,...,...,...,...
97,VA,,13.0,1769443,1981473,233715,3984631,Republican,2016
98,WA,,8.0,1221747,1742718,352554,3317019,Republican,2016
99,WV,5.0,,489371,188794,36258,714423,Democrat,2016
100,WI,10.0,,1405284,1382536,188330,2976150,Democrat,2016


In [18]:
election_results.columns

Index(['STATE', 'Electoral Vote (D)', 'Electoral Vote (R)', 'Popular Vote (D)',
       'Popular Vote (R)', 'Popular Vote All Others', 'Total Vote', 'Winner',
       'Year'],
      dtype='object')

In [19]:
# Merge the winner info with the previosuly established DF
next_df = pd.merge(census_zip_df, election_results, how='left', left_on=['State','Year'], right_on = ['STATE','Year'])
next_df.head()

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Year,Years Since,State,STATE,Electoral Vote (D),Electoral Vote (R),Popular Vote (D),Popular Vote (R),Popular Vote All Others,Total Vote,Winner
0,5762,513,39.2,58558,24235.0,50,9.746589,2016,7,VT,VT,,3.0,95369,178573,41125,315067,Republican
1,5763,2715,43.9,54968,29674.0,488,17.974217,2016,7,VT,VT,,3.0,95369,178573,41125,315067,Republican
2,5764,3370,41.3,44468,22651.0,323,9.58457,2016,7,VT,VT,,3.0,95369,178573,41125,315067,Republican
3,5765,1632,46.7,56771,30786.0,85,5.208333,2016,7,VT,VT,,3.0,95369,178573,41125,315067,Republican
4,5766,598,45.9,52250,33227.0,76,12.70903,2016,7,VT,VT,,3.0,95369,178573,41125,315067,Republican


In [20]:
# Create the final dataset for ML application
db_data = next_df.drop(['Year','STATE', 'Electoral Vote (D)', 'Electoral Vote (R)', 'Popular Vote (D)',
       'Popular Vote (R)', 'Popular Vote All Others', 'Total Vote'], axis=1)
db_data

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Years Since,State,Winner
0,5762,513,39.2,58558,24235.0,50,9.746589,7,VT,Republican
1,5763,2715,43.9,54968,29674.0,488,17.974217,7,VT,Republican
2,5764,3370,41.3,44468,22651.0,323,9.584570,7,VT,Republican
3,5765,1632,46.7,56771,30786.0,85,5.208333,7,VT,Republican
4,5766,598,45.9,52250,33227.0,76,12.709030,7,VT,Republican
...,...,...,...,...,...,...,...,...,...,...
64929,16623,552,54.9,51667,26369.0,80,14.492754,3,PA,Democrat
64930,16627,2118,44.6,45000,24699.0,324,15.297450,3,PA,Democrat
64931,16634,315,46.1,51500,25084.0,24,7.619048,3,PA,Democrat
64932,16640,707,48.6,55982,28335.0,167,23.620934,3,PA,Democrat


In [21]:
# last drop of NA values 
db_data.dropna(inplace=True)
db_data

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Years Since,State,Winner
0,5762,513,39.2,58558,24235.0,50,9.746589,7,VT,Republican
1,5763,2715,43.9,54968,29674.0,488,17.974217,7,VT,Republican
2,5764,3370,41.3,44468,22651.0,323,9.584570,7,VT,Republican
3,5765,1632,46.7,56771,30786.0,85,5.208333,7,VT,Republican
4,5766,598,45.9,52250,33227.0,76,12.709030,7,VT,Republican
...,...,...,...,...,...,...,...,...,...,...
64929,16623,552,54.9,51667,26369.0,80,14.492754,3,PA,Democrat
64930,16627,2118,44.6,45000,24699.0,324,15.297450,3,PA,Democrat
64931,16634,315,46.1,51500,25084.0,24,7.619048,3,PA,Democrat
64932,16640,707,48.6,55982,28335.0,167,23.620934,3,PA,Democrat


In [22]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [26]:
# Export data to MongoDB
electiondb = client.election_db
collection = electiondb.election_data
data_dict = db_data.to_dict("records")
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1ae904a2b08>

In [25]:
#electiondb.election_data.drop()

In [None]:
db_electoralVote = next_df.drop(['Year','STATE', 'Popular Vote (D)',
       'Popular Vote (R)', 'Popular Vote All Others', 'Total Vote'], axis=1)


In [36]:
# Replacing NaN with Zeros
db_electoralVote[['Electoral Vote (D)', 'Electoral Vote (R)']] = db_electoralVote[['Electoral Vote (D)', 'Electoral Vote (R)']].fillna(0)
db_electoralVote

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Years Since,State,Electoral Vote (D),Electoral Vote (R),Winner
0,5762,513,39.2,58558,24235.0,50,9.746589,7,VT,0.0,3.0,Republican
1,5763,2715,43.9,54968,29674.0,488,17.974217,7,VT,0.0,3.0,Republican
2,5764,3370,41.3,44468,22651.0,323,9.584570,7,VT,0.0,3.0,Republican
3,5765,1632,46.7,56771,30786.0,85,5.208333,7,VT,0.0,3.0,Republican
4,5766,598,45.9,52250,33227.0,76,12.709030,7,VT,0.0,3.0,Republican
...,...,...,...,...,...,...,...,...,...,...,...,...
64929,16623,552,54.9,51667,26369.0,80,14.492754,3,PA,20.0,0.0,Democrat
64930,16627,2118,44.6,45000,24699.0,324,15.297450,3,PA,20.0,0.0,Democrat
64931,16634,315,46.1,51500,25084.0,24,7.619048,3,PA,20.0,0.0,Democrat
64932,16640,707,48.6,55982,28335.0,167,23.620934,3,PA,20.0,0.0,Democrat


In [40]:
db_electoralVote.dropna(inplace=True)
db_electoralVote

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Count,Poverty Rate,Years Since,State,Electoral Vote (D),Electoral Vote (R),Winner
0,5762,513,39.2,58558,24235.0,50,9.746589,7,VT,0.0,3.0,Republican
1,5763,2715,43.9,54968,29674.0,488,17.974217,7,VT,0.0,3.0,Republican
2,5764,3370,41.3,44468,22651.0,323,9.584570,7,VT,0.0,3.0,Republican
3,5765,1632,46.7,56771,30786.0,85,5.208333,7,VT,0.0,3.0,Republican
4,5766,598,45.9,52250,33227.0,76,12.709030,7,VT,0.0,3.0,Republican
...,...,...,...,...,...,...,...,...,...,...,...,...
64929,16623,552,54.9,51667,26369.0,80,14.492754,3,PA,20.0,0.0,Democrat
64930,16627,2118,44.6,45000,24699.0,324,15.297450,3,PA,20.0,0.0,Democrat
64931,16634,315,46.1,51500,25084.0,24,7.619048,3,PA,20.0,0.0,Democrat
64932,16640,707,48.6,55982,28335.0,167,23.620934,3,PA,20.0,0.0,Democrat


In [42]:
# Export data with electoral vote to MongoDB
electiondb = client.election_db
collection = electiondb.electoral_data
data_dict = db_electoralVote.to_dict("records")
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1ae8bf4d588>

In [41]:
#electiondb.electoral_data.drop()