<a href="https://colab.research.google.com/github/fjdpr/deep-learning-challenge/blob/main/AlphabetSoupCharity/AlphabetSoupCharity_Optimization.ipynb" target="_parent">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import pandas as pd
import os

In [2]:
# Read the charity applications CSV from its hosted location and preview the first rows
charity_data_url = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(charity_data_url)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
# 'EIN' is an identifier column, so remove it before building features
application_df = application_df.drop('EIN', axis="columns")

In [4]:
# Show the number of distinct values in each column (displayed as the cell output)
application_df.nunique()

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [5]:
# First count how often each NAME occurs, then count how many names share
# each occurrence total (e.g. how many names appear exactly once), ordered
# by occurrence total.
name_frequencies = application_df['NAME'].value_counts()
counts_name = name_frequencies.value_counts().sort_index()
counts_name

count
1       18776
2         215
3         103
4          72
5          48
        ...  
408         1
492         1
700         1
765         1
1260        1
Name: count, Length: 92, dtype: int64

In [6]:
# Map every row to the number of times its NAME occurs in the column,
# then keep names seen at least 5 times and label all the rest 'Other'.
name_occurrences = application_df['NAME'].map(application_df['NAME'].value_counts())
application_df['NAME'] = application_df['NAME'].where(name_occurrences >= 5, 'Other')

In [7]:
# Frequency of each APPLICATION_TYPE value, most common first
application_counts = application_df['APPLICATION_TYPE'].value_counts()
application_counts


APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64

In [8]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
# Application types with fewer rows than this cutoff are grouped as "Other".
application_type_cutoff = 500
application_types_to_replace = list(
    application_counts[application_counts < application_type_cutoff].index
)

# Replace in dataframe: a single vectorized pass relabels every rare type,
# instead of re-scanning the whole column once per rare value in a Python loop.
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(
    application_df['APPLICATION_TYPE'].isin(application_types_to_replace),
    "Other",
)

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64

In [9]:
# Frequency of each CLASSIFICATION value; used to decide which ones to bin as "Other"
classification_counts = application_df['CLASSIFICATION'].value_counts()
classification_counts

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64

In [10]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
# Classifications with fewer rows than this cutoff are grouped as "Other".
classification_cutoff = 100
classifications_to_replace = list(
    classification_counts[classification_counts < classification_cutoff].index
)

# Replace in dataframe: a single vectorized pass relabels every rare
# classification, instead of one full-column replace per rare value.
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(
    application_df['CLASSIFICATION'].isin(classifications_to_replace),
    "Other",
)

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: count, dtype: int64

In [11]:
# One-hot encode the categorical columns with `pd.get_dummies`,
# then cast every column of the result to integers.
encoded_df = pd.get_dummies(application_df)
application_df = encoded_df.astype(int)
application_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_ACTS MINISTRY,NAME_ACTS MISSIONS,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,NAME_ALABAMA FEDERATION OF WOMENS CLUBS,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [12]:
# Separate the target column from the feature columns as NumPy arrays
target_column = "IS_SUCCESSFUL"
y = application_df[target_column].values
X = application_df.drop(columns=[target_column]).values

# Hold out a test set; random_state=1 fixes the shuffle used for the split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit on the training features only, so the test split does not influence the scaling
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits (training first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [14]:
def determine_layers(input_features, max_layers=5):
    """Pick a hidden-layer count from the number of input features.

    Up to 5 features suggests 1 layer, up to 50 suggests 2, anything larger
    suggests 3. The suggestion is then capped at ``max_layers``.

    Args:
        input_features: Number of input features.
        max_layers: Upper bound on the returned layer count.

    Returns:
        The smaller of the suggested layer count and ``max_layers``.
    """
    if input_features <= 5:
        suggested = 1
    elif input_features <= 50:
        suggested = 2
    else:
        suggested = 3
    return min(suggested, max_layers)

# The scaled training matrix has one column per input feature;
# derive the hidden-layer count from that width and display it.
number_input_features = X_train_scaled.shape[1]
layers = determine_layers(number_input_features)
layers

3

In [15]:
def calculate_hidden_nodes(input_features, layers, max_neurons=13):
    """Compute a tapering list of neuron counts, one per hidden layer.

    The first layer gets ``min(input_features, max_neurons)`` neurons. Each
    following layer has ``start // layers`` fewer neurons (at least 1 fewer),
    and no layer ever drops below 2 neurons.

    Args:
        input_features: Number of input features.
        layers: Number of hidden layers to size.
        max_neurons: Cap on the size of the first hidden layer.

    Returns:
        A list of ``layers`` integers. A non-positive ``layers`` yields an
        empty list.
    """
    # Guard: layers == 0 would otherwise divide by zero below. Negative
    # values already produced an empty list, so treat all non-positive
    # counts the same way.
    if layers <= 0:
        return []

    # Start with the minimum of input_features or max_neurons
    start_nodes = min(input_features, max_neurons)
    # Shrink by an even share per layer, but always by at least one neuron
    reduction_step = max(start_nodes // layers, 1)
    return [
        max(start_nodes - reduction_step * i, 2)  # floor of 2 neurons per layer
        for i in range(layers)
    ]

In [16]:
# Width of the scaled training matrix = number of input features
number_input_features = X_train_scaled.shape[1]
# Layer count derived from the feature count
layers = determine_layers(number_input_features)

# Neuron count for each hidden layer, with the first layer capped at 13 neurons
hidden_nodes = calculate_hidden_nodes(number_input_features, layers, max_neurons=13)
print(f"Hidden nodes configuration: {hidden_nodes}")


Hidden nodes configuration: [13, 9, 5]


In [17]:
# Initialize the Sequential model
nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object. Passing `input_dim`
# to the first Dense layer makes Keras emit a UserWarning recommending an
# Input object as the first layer of a Sequential model instead.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# Add the hidden layers dynamically, one Dense/ReLU layer per entry in hidden_nodes
for nodes in hidden_nodes:
    nn.add(tf.keras.layers.Dense(nodes, activation='relu'))

# Add the output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Inspect the model structure
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
nn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Fit the network on the scaled training data for 100 epochs, printing per-epoch progress
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6911 - loss: 0.5960
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7941 - loss: 0.4321
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7960 - loss: 0.4280
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7948 - loss: 0.4248
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8006 - loss: 0.4180
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7967 - loss: 0.4231
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8024 - loss: 0.4143
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8021 - loss: 0.4174
Epoch 9/100
[1m804/804[0m [32

In [20]:
# Score the trained network on the held-out test split and report loss and accuracy
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 2ms/step - accuracy: 0.7911 - loss: 0.4551
Loss: 0.4550911784172058, Accuracy: 0.7911370396614075


In [21]:
# Ensure the folder to store .h5 model files exists.
# exist_ok=True makes this a no-op when the folder is already there, so the
# cell is safe to re-run and needs no separate existence check.
os.makedirs("models", exist_ok=True)

In [22]:
# Export the optimized/evaluated model to an HDF5 (.h5) file inside the "models" folder
nn.save("models/AlphabetSoupCharity_Optimization.h5")

