## Preprocessing

In [None]:
# Import our dependencies
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

#  Import and read the charity_data.csv.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

# Target variable: IS_SUCCESSFUL
# Feature variables: APPLICATION_TYPE AFFILIATION CLASSIFICATION USE_CASE ORGANIZATION STATUS INCOME_AMT SPECIAL_CONSIDERATIONS ASK_AMT

  from kerastuner.tuners import RandomSearch


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [None]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(['EIN', 'NAME'], axis=1)

# Displaying the first few rows of application_df to verify the columns were dropped
print(application_df.head())

  APPLICATION_TYPE       AFFILIATION CLASSIFICATION      USE_CASE  \
0              T10       Independent          C1000    ProductDev   
1               T3       Independent          C2000  Preservation   
2               T5  CompanySponsored          C3000    ProductDev   
3               T3  CompanySponsored          C2000  Preservation   
4               T3       Independent          C1000     Heathcare   

   ORGANIZATION  STATUS     INCOME_AMT SPECIAL_CONSIDERATIONS  ASK_AMT  \
0   Association       1              0                      N     5000   
1  Co-operative       1         1-9999                      N   108590   
2   Association       1              0                      N     5000   
3         Trust       1    10000-24999                      N     6692   
4         Trust       1  100000-499999                      N   142590   

   IS_SUCCESSFUL  
0              1  
1              1  
2              0  
3              1  
4              1  


In [None]:
# Determine the number of unique values in each column.
unique_values = application_df.nunique()

# Display the number of unique values for each column
print(unique_values)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [None]:
# Look at APPLICATION_TYPE value counts for binning

# Display the count of each unique value in the APPLICATION_TYPE column
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Print the counts
print(application_type_counts)

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


In [None]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
# Define the cutoff point
app_type_cutoff_point = 500

# Identify application types to bin as 'Other'
application_types_to_replace = application_df['APPLICATION_TYPE'].value_counts()[application_df['APPLICATION_TYPE'].value_counts() < app_type_cutoff_point].index

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [None]:
# Look at CLASSIFICATION value counts for binning
# Display the count of each unique value in the CLASSIFICATION column
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Print the counts
print(classification_counts)

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64


In [None]:
# You may find it helpful to look at CLASSIFICATION value counts >1

# Filter and display CLASSIFICATION value counts greater than 1
classification_counts_filtered = classification_counts[classification_counts > 1]

# Print the filtered counts
print(classification_counts_filtered)

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: CLASSIFICATION, dtype: int64


In [None]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
# Define the cutoff point
class_cutoff_point = 1000

# Identify classifications to bin as 'Other'
classifications_to_replace = application_df['CLASSIFICATION'].value_counts()[application_df['CLASSIFICATION'].value_counts() < class_cutoff_point].index

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure binning was successful
application_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [None]:
# Or, into custom quantiles
application_df['ASK_AMT_BINNED'] = pd.qcut(application_df['ASK_AMT'], q=[0, .2, .4, .6, .8, 1])
df.drop(columns=['ASK_AMT'], inplace=True)

print(application_df)

In [None]:
# Convert categorical data to numeric with `pd.get_dummies`

# Encoding categorical variables using pd.get_dummies()
application_df_encoded = pd.get_dummies(application_df, drop_first=True)

# Display the first few rows of the encoded dataframe to verify the encoding
application_df_encoded.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,...,ORGANIZATION_Trust,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,108590,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,5000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6692,1,0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,1,142590,1,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [None]:
# Check for potential class imbalances
print(application_df_encoded['IS_SUCCESSFUL'].value_counts())

1    18261
0    16038
Name: IS_SUCCESSFUL, dtype: int64


In [None]:
# Split our preprocessed data into our features and target arrays
feature_names = application_df_encoded.drop('IS_SUCCESSFUL', axis=1).columns.tolist() #Feature names array
X = application_df_encoded.drop('IS_SUCCESSFUL', axis=1).values  # Features array
y = application_df_encoded['IS_SUCCESSFUL'].values  # Target array

print(X.shape)  # Should show (number_of_samples, number_of_features)
print(y.shape)  # Should show (number_of_samples,)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69420)

print(X_train.shape)  # Should show (number_of_samples, number_of_features)
print(y_train.shape)  # Should show (number_of_samples,)

# Use synthetic minority oversampling to adjust for class imbalance
smote = SMOTE(random_state=42069)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Verify the split
print(f"X_train shape (with SMOTE): {X_train_smote.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape (with SMOTE): {y_train_smote.shape}")
print(f"y_test shape: {y_test.shape}")

(34299, 36)
(34299,)
(27439, 36)
(27439,)
X_train shape (with SMOTE): (29340, 36)
X_test shape: (6860, 36)
y_train shape (with SMOTE): (29340,)
y_test shape: (6860,)


In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler to oversampled data
X_scaler = scaler.fit(X_train_smote)

# Scale the data
X_train_scaled = X_scaler.transform(X_train_smote)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.


# Number of input features
n_features = X_train_smote.shape[1]

# Create the Sequential model
nn = Sequential()

# Input adjusted layer
nn.add(Dense(units=128, activation='relu', input_shape=(n_features,)))
Dropout(0.3),  # Dropout 30% of the neurons in the layer

# First adjusted hidden layer
nn.add(Dense(units=64, activation='relu'))
Dropout(0.3),  # Again, dropout 30% of the neurons

# Second adjusted hidden layer
nn.add(Dense(units=32, activation='relu'))
Dropout(0.3),  # Again, dropout 30% of the neurons

# Output layer
nn.add(Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               4736      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 15105 (59.00 KB)
Trainable params: 15105 (59.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Compile the model with a learning rate schedule
# For a binary classification problem, use 'binary_crossentropy' as the loss function
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Define the learning rate schedule
initial_learning_rate = 0.001
lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

# Initialize the Adam optimizer with the learning rate schedule
optimizer = Adam(learning_rate=lr_schedule)

# Compile the model with the custom optimizer
nn.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Calculate the save_freq in terms of total steps for 5 epochs
batch_size = 32
steps_per_epoch = len(X_train_smote) // batch_size  # This calculates how many batches (steps) per epoch

# Save every 5 epochs (if you have a different batch size or training configuration, adjust accordingly)
save_freq = steps_per_epoch * 5

# Create the callback for saving the model's weights
cp_callback = ModelCheckpoint(
    filepath="optimization_training_checkpoints/cp-{epoch:04d}.ckpt",
    verbose=1,
    save_weights_only=True,
    save_freq=save_freq)  # Use save_freq with calculated value

# Train the model
epochs = 100  # Set the total number of epochs to train for
history = nn.fit(
    X_train_scaled,
    y_train_smote,
    epochs=epochs,
    validation_split=0.2,  # Use part of the training data for validation
    callbacks=[cp_callback])  # Pass the callback to save weights

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
165/734 [=====>........................] - ETA: 1s - loss: 0.5484 - accuracy: 0.7331
Epoch 7: saving model to optimization_training_checkpoints/cp-0007.ckpt
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: saving model to optimization_training_checkpoints/cp-0013.ckpt
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: saving model to optimization_training_checkpoints/cp-0019.ckpt
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 25: saving model to optimization_training_checkpoints/cp-0025.ckpt
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
117/734 [===>..........................] - ETA: 1s - loss: 0.5411 - accuracy: 0.7284
Epoch 32: saving model to optimization_training_checkpoints/cp-0032.ckpt
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 3

In [None]:
# Report baseline accuracy
test_loss, test_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.7237609624862671


In [None]:
# Use automated hyperparamter tuning (Hyperband) to find optimal model hyperparameters
from kerastuner.tuners import Hyperband

def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(units=hp.Int('units_input', min_value=32, max_value=512, step=32),
                           activation=hp.Choice('activation_input', ['relu', 'tanh', 'elu']),
                           input_shape=(n_features,)))
    model.add(layers.Dropout(0.3))
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(layers.Dense(units=hp.Int(f'units_layer_{i}', min_value=32, max_value=512, step=32),
                               activation=hp.Choice(f'activation_layer_{i}', ['relu', 'tanh', 'elu'])))
        model.add(layers.Dropout(0.3))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=keras.optimizers.Adam(
        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['accuracy'])

    return model

# Initialize the Hyperband tuner
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=25,
    directory='hyperband_tuning',
    project_name='tune_units_layers_activation'
)

# Start the search for the best hyperparameter configuration
tuner.search(X_train_scaled, y_train_smote, epochs=10, validation_split=0.2)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"The best number of units in the first layer is {best_hps.get('units_input')}.")
for i in range(best_hps.get('num_layers')):
    print(f"The best number of units in layer {i+1} is {best_hps.get(f'units_layer_{i}')}, with activation function {best_hps.get(f'activation_layer_{i}')}.")
print(f"The best learning rate for the optimizer is {best_hps.get('learning_rate')}.")

Trial 30 Complete [00h 01m 18s]
val_accuracy: 0.7384117245674133

Best val_accuracy So Far: 0.7450579404830933
Total elapsed time: 00h 19m 08s
The best number of units in the first layer is 512.
The best number of units in layer 1 is 96, with activation function tanh.
The best number of units in layer 2 is 416, with activation function relu.
The best number of units in layer 3 is 64, with activation function elu.
The best learning rate for the optimizer is 0.001.


In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
test_loss_tuned, test_accuracy_tuned = best_model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {test_loss_tuned}, Test Accuracy: {test_accuracy_tuned}")

Test Loss: 0.5618760585784912, Test Accuracy: 0.7244898080825806


In [None]:
# Save best model
best_model.save('AlphabetSoupCharity_Optimization.h5')

  saving_api.save_model(
