In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [23]:
df = pd.read_csv('senate_dataset.csv')
df.head()

Unnamed: 0,_year,_state,state_po,state_fips,state_cen,state_ic,office,district,stage,special,candidate,party_detailed,writein,_mode,candidatevotes,totalvotes,unofficial,_version,party_simplified
0,1976,ARIZONA,AZ,4,86,61,US SENATE,statewide,gen,False,SAM STEIGER,REPUBLICAN,False,total,321236,741210,False,20210114,REPUBLICAN
1,1976,ARIZONA,AZ,4,86,61,US SENATE,statewide,gen,False,WM. MATHEWS FEIGHAN,INDEPENDENT,False,total,1565,741210,False,20210114,OTHER
2,1976,ARIZONA,AZ,4,86,61,US SENATE,statewide,gen,False,DENNIS DECONCINI,DEMOCRAT,False,total,400334,741210,False,20210114,DEMOCRAT
3,1976,ARIZONA,AZ,4,86,61,US SENATE,statewide,gen,False,ALLAN NORWITZ,LIBERTARIAN,False,total,7310,741210,False,20210114,LIBERTARIAN
4,1976,ARIZONA,AZ,4,86,61,US SENATE,statewide,gen,False,BOB FIELD,INDEPENDENT,False,total,10765,741210,False,20210114,OTHER


In [24]:
# Create category_columns and numeric_columns variables
numeric_columns = []
category_columns = []
for col in df.columns:
    if is_string_dtype(df[col]) == True:
        category_columns.append(col)
    elif is_numeric_dtype(df[col]) == True:
        numeric_columns.append(col)
print(numeric_columns)
print(category_columns)

['_year', 'state_fips', 'state_cen', 'state_ic', 'special', 'writein', 'candidatevotes', 'totalvotes', 'unofficial', '_version']
['_state', 'state_po', 'office', 'district', 'stage', 'candidate', 'party_detailed', '_mode', 'party_simplified']


In [25]:
 #Create dummy variables for the category_columns and merge on the numeric_columns to create an X dataset
category_columns = pd.get_dummies(df[category_columns])
category_columns
X = df[numeric_columns].merge(category_columns, left_index= True, right_index= True)
X

Unnamed: 0,_year,state_fips,state_cen,state_ic,special,writein,candidatevotes,totalvotes,unofficial,_version,...,party_detailed_WORKERS,party_detailed_WORKERS AGAINST CONCESSIONS,party_detailed_WORKERS LEAGUE,party_detailed_WORKERS WORLD,party_detailed_WORKING FAMILIES,_mode_total,party_simplified_DEMOCRAT,party_simplified_LIBERTARIAN,party_simplified_OTHER,party_simplified_REPUBLICAN
0,1976,4,86,61,False,False,321236,741210,False,20210114,...,0,0,0,0,0,1,0,0,0,1
1,1976,4,86,61,False,False,1565,741210,False,20210114,...,0,0,0,0,0,1,0,0,1,0
2,1976,4,86,61,False,False,400334,741210,False,20210114,...,0,0,0,0,0,1,1,0,0,0
3,1976,4,86,61,False,False,7310,741210,False,20210114,...,0,0,0,0,0,1,0,1,0,0
4,1976,4,86,61,False,False,10765,741210,False,20210114,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3624,2020,56,83,68,False,False,6401,278503,False,20210114,...,0,0,0,0,0,1,0,0,1,0
3625,2021,13,58,44,True,False,2194848,4483294,True,20210114,...,0,0,0,0,0,1,0,0,0,1
3626,2021,13,58,44,True,False,2288446,4483294,True,20210114,...,0,0,0,0,0,1,1,0,0,0
3627,2021,13,58,44,False,False,2213979,4483241,True,20210114,...,0,0,0,0,0,1,0,0,0,1


In [26]:
y = df['totalvotes'].values
y

array([ 741210,  741210,  741210, ..., 4483294, 4483241, 4483241])

In [27]:
# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [28]:
# Scale X_train and X_test
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
len(X_train_scaled[0])

2670

In [29]:
# Create a neural network model with keras
nn = tf.keras.models.Sequential()

In [30]:
# Add a hidden layer with twice as many neurons as there are inputs. Use 'relu'
n_input = len(X_train_scaled[0])
n_hidden = n_input * 2
n_hidden_layer2 = n_input * 2
nn.add(tf.keras.layers.Dense(units=n_hidden, input_dim=n_input, activation='relu'))
nn.add(tf.keras.layers.Dense(units=n_hidden_layer2, activation='relu'))

In [31]:
# add an output layer with a 'linear' activation function.
nn.add(tf.keras.layers.Dense(units=1,  activation='linear'))

In [32]:
# print a summary of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 5340)              14263140  
_________________________________________________________________
dense_3 (Dense)              (None, 5340)              28520940  
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 5341      
Total params: 42,789,421
Trainable params: 42,789,421
Non-trainable params: 0
_________________________________________________________________


In [33]:
# compile the model using the "adam" optimizer and "mean_squared_error" loss function
nn.compile(loss='mean_squared_error' , optimizer='adam' , metrics=['mse'])

In [34]:
# train the model for 100 epochs
model = nn.fit(X_train_scaled, y_train, epochs=100)

Train on 2721 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

In [35]:
# predict values for the train and test sets
y_train_pred = nn.predict(X_train_scaled)
y_test_pred = nn.predict(X_test_scaled)

In [36]:
# score the training predictions with r2_score()
r2_score(y_train, y_train_pred)

0.998724221894063

In [37]:
# score the test predictions with r2_score()
r2_score(y_test, y_test_pred)

0.9856300295206193