In [1]:
# Import Dependencies
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load Dataset
url = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(url)

# Display first few rows
application_df.head()


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop ID Columns (not useful for prediction)
application_df = application_df.drop(columns=["EIN", "NAME"])

# Check Unique Values to Identify Categorical Variables
application_df.nunique()


Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [3]:
# Group Rare APPLICATION_TYPE Categories
application_counts = application_df['APPLICATION_TYPE'].value_counts()
application_types_to_replace = application_counts[application_counts < 500].index.tolist()
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app, "Other")

# Group Rare CLASSIFICATION Categories
classification_counts = application_df['CLASSIFICATION'].value_counts()
classifications_to_replace = classification_counts[classification_counts < 100].index.tolist()
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, "Other")

# Apply One-Hot Encoding to Categorical Columns
application_df = pd.get_dummies(application_df, columns=["APPLICATION_TYPE", "CLASSIFICATION", "AFFILIATION", "USE_CASE", "ORGANIZATION", "INCOME_AMT", "SPECIAL_CONSIDERATIONS"], drop_first=True)

# Display Data Types to Confirm Encoding
application_df.dtypes


Unnamed: 0,0
STATUS,int64
ASK_AMT,int64
IS_SUCCESSFUL,int64
APPLICATION_TYPE_T10,bool
APPLICATION_TYPE_T19,bool
APPLICATION_TYPE_T3,bool
APPLICATION_TYPE_T4,bool
APPLICATION_TYPE_T5,bool
APPLICATION_TYPE_T6,bool
APPLICATION_TYPE_T7,bool


In [4]:
# Define Features (X) and Target (y)
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(columns=["IS_SUCCESSFUL"])

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Confirm Shapes of Training and Testing Sets
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")


Training Features Shape: (25724, 42)
Testing Features Shape: (8575, 42)


In [5]:
# Apply StandardScaler to Normalize Feature Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm Scaling by Printing First Few Rows
print(X_train_scaled[:5])


[[ 1.39430557e-02 -3.34843874e-02 -1.26474869e-01 -1.80310401e-01
  -1.93323172e+00 -2.17211760e-01 -1.88868073e-01  5.27316518e+00
  -1.44900757e-01 -1.47122472e-01 -4.05336896e-01 -5.47932520e-02
  -9.28767886e-02  2.15062580e+00 -2.41986076e-01 -5.29792289e-02
  -2.43601984e-01 -7.58112863e-02 -6.05605239e-02 -1.51342016e-01
  -1.39638995e-01 -4.50061455e-02 -1.07924659e+00 -3.18080475e-02
  -6.23503924e-03 -1.97203755e-02 -6.21563724e-02 -8.81784846e-03
  -2.13840836e+00  2.25801349e+00 -1.17446358e-01 -3.35950033e-02
   6.75069188e-01 -1.48495575e-01 -1.26949951e-01 -3.29185563e-01
  -8.34734853e-02 -1.69813249e-01 -3.51865775e-01 -6.40196894e-02
  -6.98784891e-02 -2.78942470e-02]
 [ 1.39430557e-02 -3.34843874e-02 -1.26474869e-01 -1.80310401e-01
   5.17268565e-01 -2.17211760e-01 -1.88868073e-01 -1.89639423e-01
  -1.44900757e-01 -1.47122472e-01 -4.05336896e-01 -5.47932520e-02
  -9.28767886e-02 -4.64980937e-01 -2.41986076e-01 -5.29792289e-02
  -2.43601984e-01 -7.58112863e-02 -6.0560

In [23]:
# Define Neural Network Model
nn = tf.keras.models.Sequential()

# First Hidden Layer - Increased neurons & LeakyReLU activation
nn.add(tf.keras.layers.Dense(units=100, activation=tf.keras.layers.LeakyReLU(alpha=0.01), input_dim=X_train_scaled.shape[1]))

# Second Hidden Layer - More neurons & activation change
nn.add(tf.keras.layers.Dense(units=50, activation='tanh'))

# Third Hidden Layer - Added new layer
nn.add(tf.keras.layers.Dense(units=25, activation='relu'))

# Dropout Layer to prevent overfitting
nn.add(tf.keras.layers.Dropout(0.5))

# Output Layer - Sigmoid for binary classification
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Display Model Summary
nn.summary()


In [24]:
# Compile Model with Optimized Parameters
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Implement Early Stopping to Prevent Overtraining
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

# Train Model with Increased Epochs
fit_model = nn.fit(X_train_scaled, y_train, epochs=40, batch_size=64, callbacks=[early_stopping])


Epoch 1/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6840 - loss: 0.6139
Epoch 2/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7263 - loss: 0.5735
Epoch 3/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7273 - loss: 0.5690
Epoch 4/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7341 - loss: 0.5564
Epoch 5/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7261 - loss: 0.5611
Epoch 6/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7311 - loss: 0.5592
Epoch 7/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7333 - loss: 0.5573
Epoch 8/40
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7291 - loss: 0.5562
Epoch 9/40
[1m402/402[0m [32m━━━━━━━━

In [26]:
# Evaluate Model Performance
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Optimized Loss: {model_loss}, Optimized Accuracy: {model_accuracy}")


268/268 - 1s - 3ms/step - accuracy: 0.7269 - loss: 0.5518
Optimized Loss: 0.5517569184303284, Optimized Accuracy: 0.7268804907798767


In [28]:
# Save the Optimized Model to an HDF5 File
nn.save("AlphabetSoupCharity_Optimization.h5")


