## Prepare Data

In [151]:
#!pip install keras-tuner

# Import our dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [152]:
#read the csv
ML_Data= pd.read_csv('Data/Five_Year_Data.csv')

In [153]:
ML_Data.columns

Index(['GEO_ID', 'Below_9th_grade', '9_12th_grade', 'High_School_Grad',
       'Some_College', 'Associate's_degree', 'Bachelor's_degree',
       'Graduate_degree', 'Population (16+)', 'Employed',
       'Median Household Income', 'Per capita Income', 'Unemployment Rate',
       'Professional (Occupation)', 'Service (Occupation)',
       'Sales (Occupation)', 'Construction (Occupation)',
       'Production (Occupation)', 'Salary (Class)', 'Gov't (Class)',
       'Self-employed (Class)', 'Fam-unpaid (Class)', 'Poverty',
       'Child-poverty', 'Median Home value', 'Median Mortgage', 'Median Rent',
       'Total Population', 'White', 'Black', 'American Indian', 'Asian',
       'Pacific Islander', 'Hispanic/Latino', 'County', 'State', 'Year'],
      dtype='object')

In [154]:
#Unemployment model: Educational background and demographic features
Poverty_data= ML_Data[['9_12th_grade', 'High_School_Grad',
       "Bachelor's_degree",
       'Graduate_degree', 'Population (16+)',
       'Median Household Income', 'Unemployment Rate',
       'Professional (Occupation)', 'Service (Occupation)',
       'Poverty',
       'Median Home value',
       'White', 'Black', 'State']]
Poverty_data.head()

Unnamed: 0,9_12th_grade,High_School_Grad,Bachelor's_degree,Graduate_degree,Population (16+),Median Household Income,Unemployment Rate,Professional (Occupation),Service (Occupation),Poverty,Median Home value,White,Black,State
0,8.7,32.6,15.9,11.8,43368.0,58786.0,4.2,38.7,15.9,15.4,147900.0,76.9,19.1,Alabama
1,7.0,27.6,20.7,10.6,167712.0,55962.0,4.4,36.3,18.1,10.6,189800.0,86.3,9.5,Alabama
2,18.8,35.7,7.8,4.4,20948.0,34186.0,9.5,25.9,14.6,28.9,92900.0,47.4,47.6,Alabama
3,11.1,47.3,7.6,3.9,18470.0,45340.0,7.5,21.9,18.7,14.0,96500.0,76.7,22.3,Alabama
4,12.4,34.0,8.1,4.5,45915.0,48695.0,4.1,29.9,13.0,14.4,124700.0,95.5,1.5,Alabama


## One Hot Encoder
taking care of categorical data (non-numerical)

In [155]:
Poverty_data.dtypes

9_12th_grade                 float64
High_School_Grad             float64
Bachelor's_degree            float64
Graduate_degree              float64
Population (16+)             float64
Median Household Income      float64
Unemployment Rate            float64
Professional (Occupation)    float64
Service (Occupation)         float64
Poverty                      float64
Median Home value            float64
White                        float64
Black                        float64
State                         object
dtype: object

In [156]:
# Generate our categorical variable lists. #checks for all cetegories that are objects
state_cat = Poverty_data.dtypes[Poverty_data.dtypes == "object"].index.tolist()

In [157]:
# Check the number of unique values in each column
Poverty_data[state_cat].nunique()

State    51
dtype: int64

In [158]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse_output=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(Poverty_data[state_cat]))
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(state_cat)
encode_df.head()

Unnamed: 0,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,State_District of Columbia,State_Florida,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
# Merge one-hot encoded features and drop the originals
Poverty_data= Poverty_data.merge(encode_df,left_index=True, right_index=True)
Poverty_data= Poverty_data.drop(state_cat, axis=1)
Poverty_data.head()

Unnamed: 0,9_12th_grade,High_School_Grad,Bachelor's_degree,Graduate_degree,Population (16+),Median Household Income,Unemployment Rate,Professional (Occupation),Service (Occupation),Poverty,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
0,8.7,32.6,15.9,11.8,43368.0,58786.0,4.2,38.7,15.9,15.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.0,27.6,20.7,10.6,167712.0,55962.0,4.4,36.3,18.1,10.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18.8,35.7,7.8,4.4,20948.0,34186.0,9.5,25.9,14.6,28.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.1,47.3,7.6,3.9,18470.0,45340.0,7.5,21.9,18.7,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12.4,34.0,8.1,4.5,45915.0,48695.0,4.1,29.9,13.0,14.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# NN Model

In [160]:
# Split target column and Features Columns
y = Poverty_data['Poverty']
X = Poverty_data.drop(columns='Poverty')

In [161]:
# Use sklearn to split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [162]:
# Create scaler instance
scaler = StandardScaler()
# Fit the Standardscaler
scaler.fit(X_train)
# Scale the data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [163]:
input_dimensions = X_train_scaled.shape[1]
input_dimensions

63

## Auto Optimizer

In [164]:
# Create a method that creates a new Sequential model with hyperparameter options
#In this model we had to design it for for regression instead of classification. 
# we do not need the activation model Sigmoid because it is not a classification model
# Layer Activation: None (linear by default), Loss Function: Mean Squared Error

def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim = input_dimensions))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    nn_model.add(tf.keras.layers.Dense(units=1))

    # Compile the model
    nn_model.compile(loss="mean_squared_error", optimizer='adam', metrics=["mean_squared_error"])
    return nn_model

In [165]:
# Import the kerastuner library
import keras_tuner as kt
tuner = kt.Hyperband(
    create_model,
    objective='val_mean_squared_error',
    max_epochs=20,
    hyperband_iterations=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [166]:
X_train_scaled.shape

(11739, 63)

In [167]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 13s]
val_mean_squared_error: 12.740080833435059

Best val_mean_squared_error So Far: 7.222649097442627
Total elapsed time: 00h 06m 59s


In [168]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 9,
 'num_layers': 6,
 'units_0': 7,
 'units_1': 5,
 'units_2': 1,
 'units_3': 1,
 'units_4': 7,
 'units_5': 9,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 7,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0012'}

In [169]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 9, 'num_layers': 6, 'units_0': 7, 'units_1': 5, 'units_2': 1, 'units_3': 1, 'units_4': 7, 'units_5': 9, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0012'}
{'activation': 'relu', 'first_units': 9, 'num_layers': 1, 'units_0': 3, 'units_1': 3, 'units_2': 3, 'units_3': 1, 'units_4': 7, 'units_5': 7, 'tuner/epochs': 20, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'relu', 'first_units': 7, 'num_layers': 5, 'units_0': 7, 'units_1': 1, 'units_2': 5, 'units_3': 9, 'units_4': 5, 'units_5': 1, 'tuner/epochs': 20, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [170]:
import numpy as np
from sklearn.metrics import r2_score
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
# Evaluate the best model against the full test data
model_loss, model_mse = best_model.evaluate(X_test_scaled, y_test, verbose=2)
model_rmse = np.sqrt(model_mse)

# Predict the target values for the test set
y_pred = best_model.predict(X_test_scaled)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)

print(f"R^2: {r2}")
print(f"Loss (Mean Squared Error): {model_loss}")
print(f"Root Mean Squared Error: {model_rmse}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  model.build_from_config(
  saveable.load_own_variables(weights_store.get(inner_path))


123/123 - 0s - 2ms/step - loss: 7.2226 - mean_squared_error: 7.2226
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
R^2: 0.8138056184219725
Loss (Mean Squared Error): 7.222649097442627
Root Mean Squared Error: 2.687498669291322
