In [1]:
#Machine learning, array and dataframe libraries.

from numpy.random import seed
seed(1)

import numpy as np
import pandas as pd

import tensorflow
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Cleaned Data import 

In [2]:
import math

#To classify donation in ranges.
def roundNum(x):
    
    max = int(math.ceil((x+0.1)/1000)) *1000
    low = int(math.floor(x/1000)) *1000
    return f'[{low} - {max})'

In [3]:
#Import data from S3
cleanedElection_df = pd.read_csv('https://election-data-2020-red-raiders.s3.us-east-2.amazonaws.com/combinedClean.csv')

#Filter dataframe for the coluns we want.
cleanedElection_df = cleanedElection_df[['Zip','Occupation','Amount','Campaign']]

#declare data types
cleanedElection_df['Zip'] = cleanedElection_df['Zip'].astype(str)
cleanedElection_df['Occupation'] = cleanedElection_df['Occupation'].astype(str)
cleanedElection_df['Campaign'] = cleanedElection_df['Campaign'].astype(str)
cleanedElection_df['Amount'] = cleanedElection_df['Amount'].astype(int)

cleanedElection_df.head()

Unnamed: 0,Zip,Occupation,Amount,Campaign
0,35205,Retired,250,Democrat
1,36695,Employed,100,Democrat
2,35213,Retired,500,Democrat
3,35242,Employed,25,Democrat
4,35242,Employed,25,Democrat


In [4]:
#Loop to clasify values in ranges.
for index, row in cleanedElection_df.iterrows():

    if row['Amount'] < 0:
        cleanedElection_df.loc[index, 'Amount'] = 'Refund'
        continue

        # strip the rest
    cleanedElection_df.loc[index, 'Zip'] = row['Zip'][:5].strip()
    
    cleanedElection_df.loc[index, 'Amount'] = roundNum(row['Amount'])

cleanedElection_df = cleanedElection_df[cleanedElection_df['Amount'] != 'Refund']

cleanedElection_df.reset_index(inplace=True, drop = True)

cleanedElection_df.head()

Unnamed: 0,Zip,Occupation,Amount,Campaign
0,35205,Retired,[0 - 1000),Democrat
1,36695,Employed,[0 - 1000),Democrat
2,35213,Retired,[0 - 1000),Democrat
3,35242,Employed,[0 - 1000),Democrat
4,35242,Employed,[0 - 1000),Democrat


# Categorical data encoding, train,test split and numerical data scaling

# Categorical encoding

In [117]:
#Label-encode data set
encoded_df = pd.DataFrame()

for column in cleanedElection_df[['Zip','Occupation','Amount','Campaign']]:
    list_encoded[column] = {'Encoded' : None, 'Raw' : None}

    encoder = LabelEncoder()

    print(column)
    encoder.fit(cleanedElection_df[column])

    np.save(f'encoder{column}.npy', encoder.classes_)

    encoded_df[column] = encoder.transform(cleanedElection_df[column])


Zip
Occupation
Amount
Campaign


In [86]:
encoded_df.head()

Unnamed: 0,Zip,Occupation,Amount,Campaign
0,10307,1,0,0
1,10795,0,0,0
2,10315,1,0,0
3,10333,0,0,0
4,10333,0,0,0


In [79]:
cleanedElection_df.head()

Unnamed: 0,Zip,Occupation,Amount,Campaign
0,35205,Retired,[0 - 1000),Democrat
1,36695,Employed,[0 - 1000),Democrat
2,35213,Retired,[0 - 1000),Democrat
3,35242,Employed,[0 - 1000),Democrat
4,35242,Employed,[0 - 1000),Democrat


In [80]:
#Separate outcome from data 
X = encoded_df.drop('Campaign', axis = 1)
y = encoded_df['Campaign']
print(X.shape, y.shape)

(9654121, 3) (9654121,)


## Train - Test Split

In [81]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)

## Numerical Scaling

In [57]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train.to_pickle('X_scaler.pkl')

# One hot encoding

In [60]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

#  Model Set up

In [61]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=150, activation='relu', input_dim=3)) #3 inputs 
model.add(Dense(units=120, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax')) #2 outcomes.

In [62]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [63]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=5,
    shuffle=True,
    verbose=2
)

226269/226269 - 171s - loss: 0.4050 - accuracy: 0.8231


<tensorflow.python.keras.callbacks.History at 0x7fc3c6ce7490>

In [64]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               400       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 10,702
Trainable params: 10,702
Non-trainable params: 0
_________________________________________________________________


In [65]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

75423/75423 - 40s - loss: 0.4028 - accuracy: 0.8230
Normal Neural Network - Loss: 0.40276870131492615, Accuracy: 0.8229950070381165


# Predictions

In [17]:
encoded_predictions = model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [18]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: ['Democrat' 'Republican' 'Democrat' 'Democrat' 'Democrat' 'Democrat'
 'Republican' 'Democrat' 'Republican' 'Democrat']
Actual Labels: [0, 1, 0, 0, 0, 0, 1, 0, 1, 0]


# Model Save

In [106]:

encoded_predictions = model.predict_classes(X_test_scaled[500:510])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [108]:
# Make a prediction. The result should be 5 - STANDING
print(f"Predicted class: {encoded_predictions}")
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[500:510])}")

Predicted class: [0 0 1 0 0 1 0 0 0 1]
Predicted classes: ['Democrat' 'Democrat' 'Republican' 'Democrat' 'Democrat' 'Republican'
 'Democrat' 'Democrat' 'Democrat' 'Republican']
Actual Labels: [1, 0, 1, 0, 0, 0, 0, 0, 0, 1]
