In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [2]:
df = pd.read_csv('senate_dataset.csv')
#Create category_columns and numeric_columns variables
numeric_columns = []
category_columns = []
for col in df.columns:
    if is_string_dtype(df[col]) == True:
        category_columns.append(col)
    elif is_numeric_dtype(df[col]) == True:
        numeric_columns.append(col)
#Create dummy variables for the category_columns and merge on the numeric_columns to create an X dataset
category_columns = pd.get_dummies(df[category_columns])
X = df[numeric_columns].merge(category_columns, left_index= True, right_index= True)
#Create an y dataset
y = df['totalvotes'].values
# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Scale X_train and X_test
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Create a neural network model with keras
nn = tf.keras.models.Sequential()
# Add a hidden layer with twice as many neurons as there are inputs. Use 'relu'
n_input = len(X_train_scaled[0])

n_hidden = n_input * 2
#n_hidden_layer2 = n_input * 2 #2nd hidden layer

nn.add(tf.keras.layers.Dense(units=n_hidden, input_dim=n_input, activation='relu'))
#nn.add(tf.keras.layers.Dense(units=n_hidden_layer2, activation='relu')) #2nd hidden layer

# add an output layer with a 'linear' activation function.
nn.add(tf.keras.layers.Dense(units=1,  activation='linear'))
# print a summary of the model
print(nn.summary())
# compile the model using the "adam" optimizer and "mean_squared_error" loss function
nn.compile(loss='mean_squared_error' , optimizer='adam' , metrics=['mse'])
# train the model for 100 epochs
model = nn.fit(X_train_scaled, y_train, epochs=100)
# predict values for the train and test sets
y_train_pred = nn.predict(X_train_scaled)
y_test_pred = nn.predict(X_test_scaled)
# score the training predictions with r2_score()
print(f"r2_score of y_train: {r2_score(y_train, y_train_pred)}")
# score the test predictions with r2_score()
print(f"r2_score of y_test: {r2_score(y_test, y_test_pred)}")

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 5340)              14263140  
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5341      
Total params: 14,268,481
Trainable params: 14,268,481
Non-trainable params: 0
_________________________________________________________________
None
Train on 2721 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 3