## Preprocessing

In [1]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m102.4/129.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [23]:
# Read charity_data.csv from its remote URL into a DataFrame.
application_df = pd.read_csv(
    "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
)

In [24]:
# Preview the first five rows of the loaded data.
application_df.head(n=5)

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [25]:
# 'EIN' and 'NAME' are identifier columns, so remove them before modelling.
application_df = application_df.drop(['EIN', 'NAME'], axis="columns")

In [26]:
# Count the distinct values held by each column.
application_df.nunique(axis=0)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [27]:
# Tally each APPLICATION_TYPE; the counts decide which types get replaced with "Other".
application_type_value_counts = application_df.loc[:, 'APPLICATION_TYPE'].value_counts()
application_type_value_counts

APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64

In [28]:
# Xpert Learning Assistant helped me with the code below
import numpy as np
def percent(value_counts):
  """Print each count next to its percentage of the overall total.

  Args:
    value_counts: counts to summarise; the notebook passes the result of
      `Series.value_counts()`.

  Returns:
    None. A table with 'Count' and 'Percent of Total' columns is printed.
  """
  # Each count as a share of the grand total, on a 0-100 scale.
  share_of_total = (value_counts / np.sum(value_counts)) * 100

  summary = pd.DataFrame({'Count': value_counts, 'Percent of Total': share_of_total})
  print(summary)

In [34]:
# Tally how often each STATUS value occurs.
status_value_counts = application_df.loc[:, 'STATUS'].value_counts()
status_value_counts

STATUS
1    34294
0        5
Name: count, dtype: int64

In [35]:
# Show the STATUS counts together with their share of all rows.
percent(value_counts=status_value_counts)

        Count  Percent of Total
STATUS                         
1       34294         99.985422
0           5          0.014578


In [29]:
# Show the APPLICATION_TYPE counts together with their share of all rows.
percent(value_counts=application_type_value_counts)

                  Count  Percent of Total
APPLICATION_TYPE                         
T3                27037         78.827371
T4                 1542          4.495758
T6                 1216          3.545293
T5                 1173          3.419925
T19                1065          3.105047
T8                  737          2.148751
T7                  725          2.113764
T10                 528          1.539403
T9                  156          0.454824
T13                  66          0.192425
T12                  27          0.078719
T2                   16          0.046649
T25                   3          0.008747
T14                   3          0.008747
T29                   2          0.005831
T15                   2          0.005831
T17                   1          0.002916


In [30]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
# Application types with fewer rows than this cutoff are collected for replacement.
APPLICATION_TYPE_CUTOFF = 528

# A boolean mask on the counts picks the rare labels directly, in value_counts order.
application_types_to_replace = application_type_value_counts[
    application_type_value_counts < APPLICATION_TYPE_CUTOFF
].index.tolist()

application_types_to_replace

['T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17']

In [31]:
# Replace in dataframe: map every rare application type to "Other" in one pass
# instead of rescanning the column once per label.
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(
    application_types_to_replace, "Other"
)

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64

In [32]:
# Tally each CLASSIFICATION; the counts decide which classes get replaced with "Other".
classification_value_counts = application_df.loc[:, 'CLASSIFICATION'].value_counts()
percent(value_counts=classification_value_counts)

                Count  Percent of Total
CLASSIFICATION                         
C1000           17326         50.514592
C2000            6074         17.708971
C1200            4837         14.102452
C3000            1918          5.592000
C2100            1883          5.489956
...               ...               ...
C4120               1          0.002916
C8210               1          0.002916
C2561               1          0.002916
C4500               1          0.002916
C2150               1          0.002916

[71 rows x 2 columns]


In [13]:
# Keep only the classifications that occur more than once, then show their shares.
# Percentages here are relative to the filtered total, not to all rows.
occurs_more_than_once = classification_value_counts.gt(1)
classification_value_counts_above_one = classification_value_counts.loc[occurs_more_than_once]
percent(value_counts=classification_value_counts_above_one)

                Count  Percent of Total
CLASSIFICATION                         
C1000           17326         50.552913
C2000            6074         17.722405
C1200            4837         14.113150
C3000            1918          5.596242
C2100            1883          5.494121
C7000             777          2.267091
C1700             287          0.837394
C4000             194          0.566043
C5000             116          0.338459
C1270             114          0.332623
C2700             104          0.303446
C2800              95          0.277186
C7100              75          0.218831
C1300              58          0.169229
C1280              50          0.145887
C1230              36          0.105039
C1400              34          0.099203
C7200              32          0.093368
C2300              32          0.093368
C1240              30          0.087532
C8000              20          0.058355
C7120              18          0.052519
C1500              16          0.046684


In [14]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
# Classifications with fewer rows than this cutoff are collected for replacement.
CLASSIFICATION_CUTOFF = 1883

# A boolean mask on the counts picks the rare labels directly, in value_counts order.
classifications_to_replace = classification_value_counts[
    classification_value_counts < CLASSIFICATION_CUTOFF
].index.tolist()

# Replace in dataframe: map every rare classification to "Other" in one pass
# instead of rescanning the column once per label.
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(
    classifications_to_replace, "Other"
)

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64

In [15]:
# One-hot encode the categorical columns with `pd.get_dummies`
categorical_columns = [
    'APPLICATION_TYPE',
    'CLASSIFICATION',
    'AFFILIATION',
    'USE_CASE',
    'ORGANIZATION',
    'INCOME_AMT',
    'SPECIAL_CONSIDERATIONS',
]
application_cat_dummies = pd.get_dummies(application_df[categorical_columns])
application_cat_dummies

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,CLASSIFICATION_C1000,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,False,False,False,True,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,False,False,False,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
34295,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
34296,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
34297,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [16]:
# Append the dummy columns to the frame, then remove the original categorical
# columns they were derived from.
application_combined_df = pd.concat(
    [application_df, application_cat_dummies], axis="columns"
).drop(
    columns=[
        'APPLICATION_TYPE',
        'CLASSIFICATION',
        'AFFILIATION',
        'USE_CASE',
        'ORGANIZATION',
        'INCOME_AMT',
        'SPECIAL_CONSIDERATIONS',
    ]
)
application_combined_df

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,108590,1,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,1,5000,0,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3,1,6692,1,False,False,False,True,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,1,142590,1,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,1,5000,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
34295,1,5000,0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
34296,1,5000,0,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,True,False
34297,1,5000,1,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False


In [17]:
# Target array is IS_SUCCESSFUL; the feature array is every remaining column.
y = application_combined_df["IS_SUCCESSFUL"].values
X = application_combined_df.drop(columns=["IS_SUCCESSFUL"]).values

# Partition into training and testing sets; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [18]:
# Create the StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [36]:
# Define the model - deep neural net: number of input features and hidden nodes for each layer.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 43
hidden_nodes_layer2 = 86
hidden_nodes_layer3 = 129

# Build the Sequential model: three ReLU hidden layers followed by one sigmoid output unit
nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input width)
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Third hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"),
    # Output layer
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 43)                1892      
                                                                 
 dense_5 (Dense)             (None, 86)                3784      
                                                                 
 dense_6 (Dense)             (None, 129)               11223     
                                                                 
 dense_7 (Dense)             (None, 1)                 130       
                                                                 
Total params: 17029 (66.52 KB)
Trainable params: 17029 (66.52 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
# Compile with binary cross-entropy loss, the Adam optimizer, and accuracy tracking
nn.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [38]:
# Fit on the scaled training data for 50 epochs and keep the returned History object
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Read charity_data.csv again so the second experiment uses all the relevant data without binning.
application_df_second = pd.read_csv(
    "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
)
application_df_second

In [None]:
# 'EIN' and 'NAME' are identifier columns, so remove them before modelling.
application_df_second = application_df_second.drop(['EIN', 'NAME'], axis="columns")

In [None]:
# One-hot encode the (unbinned) categorical columns with `pd.get_dummies`
categorical_columns_second = [
    'APPLICATION_TYPE',
    'CLASSIFICATION',
    'AFFILIATION',
    'USE_CASE',
    'ORGANIZATION',
    'INCOME_AMT',
    'SPECIAL_CONSIDERATIONS',
]
application_second_dummies = pd.get_dummies(application_df_second[categorical_columns_second])
application_second_dummies

In [None]:
# Append the dummy columns to the frame, then remove the original categorical
# columns they were derived from.
application_second_combined_df = pd.concat(
    [application_df_second, application_second_dummies], axis="columns"
).drop(
    columns=[
        'APPLICATION_TYPE',
        'CLASSIFICATION',
        'AFFILIATION',
        'USE_CASE',
        'ORGANIZATION',
        'INCOME_AMT',
        'SPECIAL_CONSIDERATIONS',
    ]
)
application_second_combined_df

In [None]:
# Target array is IS_SUCCESSFUL; the feature array is every remaining column.
y_second = application_second_combined_df["IS_SUCCESSFUL"].values
X_second = application_second_combined_df.drop(columns=["IS_SUCCESSFUL"]).values

# Partition into training and testing sets; the fixed random_state keeps the split repeatable.
X_train_second, X_test_second, y_train_second, y_test_second = train_test_split(
    X_second, y_second, random_state=13
)

In [None]:
# Create the StandardScaler and fit it on the second training features only
scaler_second = StandardScaler()
X_scaler_second = scaler_second.fit(X_train_second)

# Apply the fitted scaling to the training and testing features
X_train_scaled_second, X_test_scaled_second = (
    X_scaler_second.transform(features) for features in (X_train_second, X_test_second)
)

In [None]:
# Define the model - deep neural net: number of input features and hidden nodes for each layer.
number_input_features_second = X_train_second.shape[1]
second_hidden_nodes_layer1 = 117
second_hidden_nodes_layer2 = 234
second_hidden_nodes_layer3 = 348

# Build the Sequential model: three ReLU hidden layers followed by one sigmoid output unit
nn_second = tf.keras.models.Sequential([
    # First hidden layer (also declares the input width)
    tf.keras.layers.Dense(units=second_hidden_nodes_layer1, input_dim=number_input_features_second, activation="relu"),
    # Second hidden layer
    tf.keras.layers.Dense(units=second_hidden_nodes_layer2, activation="relu"),
    # Third hidden layer
    tf.keras.layers.Dense(units=second_hidden_nodes_layer3, activation="relu"),
    # Output layer
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn_second.summary()

In [None]:
# Compile with binary cross-entropy loss, the Adam optimizer, and accuracy tracking
nn_second.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit on the scaled second training data for 50 epochs and keep the returned History object
fit_model_second = nn_second.fit(x=X_train_scaled_second, y=y_train_second, epochs=50)

In [None]:
def create_model(hp):
  """Build and compile a binary classifier whose architecture is chosen through `hp`.

  Args:
    hp: hyperparameter object supplied by the tuner; its `Choice` and `Int`
      methods are called to pick the activation, layer count and layer widths.

  Returns:
    A compiled Sequential model ending in a single sigmoid output unit.
  """
  # Input width is read from the notebook-level X_train array.
  input_width = len(X_train[0])

  nn_opt = tf.keras.models.Sequential()

  # One activation function, picked by the tuner, is shared by every hidden layer
  hidden_activation = hp.Choice('activation', ['relu', 'sigmoid', 'tanh'])

  # First hidden layer: the tuner picks the number of neurons.
  # NOTE(review): step=5 spans most of the 1..8 range, so presumably only a
  # couple of widths can ever be sampled - confirm this range is intended.
  first_units = hp.Int('first_units', min_value=1, max_value=8, step=5)
  nn_opt.add(tf.keras.layers.Dense(units=first_units,
                                   activation=hidden_activation,
                                   input_dim=input_width))

  # The tuner also picks how many further hidden layers to add and each one's width.
  # NOTE(review): same step-vs-range concern as above (1..9 with step=5) - confirm.
  extra_layer_count = hp.Int('num_layers', 1, 5)
  for layer_idx in range(extra_layer_count):
      layer_units = hp.Int('units_' + str(layer_idx), min_value=1, max_value=9, step=5)
      nn_opt.add(tf.keras.layers.Dense(units=layer_units, activation=hidden_activation))

  # Output layer
  nn_opt.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

  # Compile the model
  nn_opt.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])

  return nn_opt

In [None]:
import keras_tuner as kt

# Hyperband tuner that builds candidates with `create_model` and ranks them by validation accuracy
tuner = kt.Hyperband(
    hypermodel=create_model,
    objective="val_accuracy",
    hyperband_iterations=2,
    max_epochs=20,
)

In [None]:
# Run the kerastuner search for best hyperparameters.
# Trials are scored on a slice held out of the training data (validation_split)
# rather than on the test set: choosing hyperparameters on the test set and then
# reporting accuracy on that same set would give an optimistically biased result.
tuner.search(X_train_scaled, y_train, epochs=20, validation_split=0.2)

In [None]:
# Fetch the three best hyperparameter sets found by the search and print each one's values
top_hyper = tuner.get_best_hyperparameters(num_trials=3)
for param in top_hyper:
    print(param.values)

In [None]:
# Rebuild the three best models from the search and score each on the test data
top_model = tuner.get_best_models(num_models=3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Score the first network (`nn`) on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export the `nn` model to an HDF5 file, then download that file from the Colab runtime
model_filename = "AlphabetSoupCharity_Optimization.h5"
nn.save(model_filename)

from google.colab import files
files.download(model_filename)