## Preprocessing

In [1]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import keras_tuner as kt

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

# Imports for HDF5 files
import numpy as np
import h5py

In [3]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN', 'NAME'])
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# Determine the number of unique values in each column.
print(application_df.nunique())

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [5]:
# Look at the value counts in each column to determine if any can be easily dropped or values condensed
[display(application_df.loc[:,column].value_counts()) for column in application_df.columns];

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


Unnamed: 0_level_0,count
AFFILIATION,Unnamed: 1_level_1
Independent,18480
CompanySponsored,15705
Family/Parent,64
National,33
Regional,13
Other,4


Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


Unnamed: 0_level_0,count
USE_CASE,Unnamed: 1_level_1
Preservation,28095
ProductDev,5671
CommunityServ,384
Heathcare,146
Other,3


Unnamed: 0_level_0,count
ORGANIZATION,Unnamed: 1_level_1
Trust,23515
Association,10255
Co-operative,486
Corporation,43


Unnamed: 0_level_0,count
STATUS,Unnamed: 1_level_1
1,34294
0,5


Unnamed: 0_level_0,count
INCOME_AMT,Unnamed: 1_level_1
0,24388
25000-99999,3747
100000-499999,3374
1M-5M,955
1-9999,728
10000-24999,543
10M-50M,240
5M-10M,185
50M+,139


Unnamed: 0_level_0,count
SPECIAL_CONSIDERATIONS,Unnamed: 1_level_1
N,34272
Y,27


Unnamed: 0_level_0,count
ASK_AMT,Unnamed: 1_level_1
5000,25398
10478,3
15583,3
63981,3
6725,3
...,...
5371754,1
30060,1
43091152,1
18683,1


Unnamed: 0_level_0,count
IS_SUCCESSFUL,Unnamed: 1_level_1
1,18261
0,16038


# Value count results

Based on the value_counts output, the following columns can be dropped due to overwhelming counts of a single value:  
  + STATUS
  + SPECIAL_CONSIDERATIONS

Further, the following columns can be condensed by combining their low values into 'Other':  
  + AFFILIATION
  + USE_CASE
  + ORGANIZATION

These are in addition to the previously-determined columns to condense:
  + APPLICATION_TYPE
  + CLASSIFICATION

In [6]:
# Drop unnecessary columns
application_df = application_df.drop(['STATUS', 'SPECIAL_CONSIDERATIONS'], axis=1)
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


# Function to condense column values based on a count cutoff

In [7]:
def condense_column_values(column_name, cutoff_value):
  # column_name is a string
  # cutoff_value is an integer representing the target value_count

  cutoffs = application_df[column_name].value_counts() < cutoff_value
  values_to_condense = cutoffs[cutoffs].index

  # Replace values below the cutoff with 'Other'
  for v in values_to_condense:
    application_df[column_name] = application_df[column_name].replace(v, 'Other')

  # Verify condensed values
  display(application_df[column_name].value_counts())

In [8]:
# Condense columns

condense_column_values('APPLICATION_TYPE', 500)
condense_column_values('CLASSIFICATION', 1000)
condense_column_values('AFFILIATION', 100)
condense_column_values('USE_CASE', 500)
condense_column_values('ORGANIZATION', 500)

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


Unnamed: 0_level_0,count
AFFILIATION,Unnamed: 1_level_1
Independent,18480
CompanySponsored,15705
Other,114


Unnamed: 0_level_0,count
USE_CASE,Unnamed: 1_level_1
Preservation,28095
ProductDev,5671
Other,533


Unnamed: 0_level_0,count
ORGANIZATION,Unnamed: 1_level_1
Trust,23515
Association,10255
Other,529


In [9]:
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Other,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Other,Trust,100000-499999,142590,1


In [10]:
# Convert categorical data to numeric with `pd.get_dummies`
application_dummy_df = pd.get_dummies(application_df, dtype=int, drop_first=True)
application_dummy_df.head()

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,...,ORGANIZATION_Other,ORGANIZATION_Trust,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108590,1,0,0,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,5000,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6692,1,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,142590,1,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [11]:
# Split our preprocessed data into our features and target arrays
y = application_dummy_df['IS_SUCCESSFUL']
X = application_dummy_df.drop(columns=['IS_SUCCESSFUL'])

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [12]:
# Get the number of features
features_count = len(X.columns)
features_count

28

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

Function to Create the Model based on Parameters

In [14]:
def keras_tuning_model(hp):
  nn_model = tf.keras.models.Sequential()
  activation = hp.Choice('activation', ['relu', 'leaky_relu', 'mish'])

  nn_model.add(tf.keras.Input(shape=(features_count,)))
  nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
                                                  min_value = 70,
                                                  max_value = 100,
                                                  step = 10),
                                     activation=activation))

  for i in range(hp.Int('num_layers', 1, 6)):
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('units' + str(i),
                                                    min_value = 50,
                                                    max_value = 80,
                                                    step = 10),
                                       activation=activation))

  nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return nn_model

In [15]:
# Hyperband tuning
# ktuner = kt.Hyperband(
#     keras_tuning_model,
#     objective='val_accuracy',
#     max_epochs=20,
#     hyperband_iterations=2
# )

In [16]:
# Bayesian tuning
ktuner = kt.BayesianOptimization(
    keras_tuning_model,
    objective='val_accuracy',
    max_trials=20
)

In [17]:
ktuner.search(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))

Trial 20 Complete [00h 01m 01s]
val_accuracy: 0.7406414151191711

Best val_accuracy So Far: 0.7406414151191711
Total elapsed time: 00h 22m 27s


In [18]:
top_hp = ktuner.get_best_hyperparameters(3)
for hp in top_hp:
  display(hp.values)

{'activation': 'leaky_relu',
 'first_units': 100,
 'num_layers': 6,
 'units0': 50,
 'units1': 50,
 'units2': 80,
 'units3': 70,
 'units4': 70,
 'units5': 50}

{'activation': 'leaky_relu',
 'first_units': 80,
 'num_layers': 2,
 'units0': 70,
 'units1': 60,
 'units2': 50,
 'units3': 50,
 'units4': 60,
 'units5': 70}

{'activation': 'relu',
 'first_units': 90,
 'num_layers': 1,
 'units0': 60,
 'units1': 50,
 'units2': 50,
 'units3': 60,
 'units4': 60}

In [19]:
top_models = ktuner.get_best_models(3)
for model in top_models:
  model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
  print(f"Accuracy: {model_accuracy}, Loss: {model_loss}")

  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))


268/268 - 1s - 3ms/step - accuracy: 0.7406 - loss: 0.5465
Accuracy: 0.7406414151191711, Loss: 0.5464704632759094
268/268 - 1s - 4ms/step - accuracy: 0.7406 - loss: 0.5469
Accuracy: 0.7406414151191711, Loss: 0.5468723177909851
268/268 - 1s - 4ms/step - accuracy: 0.7403 - loss: 0.5468
Accuracy: 0.7402915358543396, Loss: 0.5468365550041199


In [25]:
# Export our model to HDF5 file
top_models[0].save("AlphabetSoupCharity_Optimization.h5")

