## Preprocessing

In [37]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [38]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME' ÁFFILIATION, 'SPECIAL_CONSIDERATIONS'.
application_df.drop(columns=["EIN","NAME","APPLICATION_TYPE","CLASSIFICATION","SPECIAL_CONSIDERATIONS","STATUS"], inplace=True)
application_df

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,Independent,ProductDev,Association,0,5000,1
1,Independent,Preservation,Co-operative,1-9999,108590,1
2,CompanySponsored,ProductDev,Association,0,5000,0
3,CompanySponsored,Preservation,Trust,10000-24999,6692,1
4,Independent,Heathcare,Trust,100000-499999,142590,1
...,...,...,...,...,...,...
34294,Independent,ProductDev,Association,0,5000,0
34295,CompanySponsored,ProductDev,Association,0,5000,0
34296,CompanySponsored,Preservation,Association,0,5000,0
34297,Independent,ProductDev,Association,0,5000,1


In [39]:
# Determine the number of unique values in each column.
unique_col = application_df.nunique()
unique_col

AFFILIATION         6
USE_CASE            5
ORGANIZATION        4
INCOME_AMT          9
ASK_AMT          8747
IS_SUCCESSFUL       2
dtype: int64

In [40]:
#Checking the Income amount categories
print(application_df["INCOME_AMT"].value_counts())

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64


In [41]:
# Convert categorical data to numeric with `pd.get_dummies`
# List of columns that need to be one-hot encoded
columns_to_encode = [ "AFFILIATION", "USE_CASE","ORGANIZATION","INCOME_AMT","ASK_AMT"]

# Convert categorical data to numeric using one-hot encoding
application_df_encoded = pd.get_dummies(application_df, columns=columns_to_encode)

# Display the first few rows of the encoded DataFrame to inspect the changes
application_df_encoded.head()

Unnamed: 0,IS_SUCCESSFUL,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,...,ASK_AMT_1665460552,ASK_AMT_1736232349,ASK_AMT_1893400128,ASK_AMT_2264109450,ASK_AMT_2310256039,ASK_AMT_3391919220,ASK_AMT_4653011914,ASK_AMT_5591584994,ASK_AMT_8556638692,ASK_AMT_8597806340
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#Split the preprocessed data into target arrays - encoding the IS_SUCCESSFUL column for X and y
X = application_df_encoded.drop(columns=['IS_SUCCESSFUL'])
y = application_df_encoded['IS_SUCCESSFUL']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [43]:
from sklearn.model_selection import train_test_split
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [55]:
 #Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
def create_model(input_features):

    # Create a sequential model
    nn = tf.keras.models.Sequential()

# First hidden layer
    nn.add(tf.keras.layers.Dense(128, activation='sigmoid', input_dim = 8771))

# # Second hidden layer
    nn.add(tf.keras.layers.Dense(64, activation='relu'))

## Third hidden layer
    nn.add(tf.keras.layers.Dense(32, activation='sigmoid'))

# # Output layer
    nn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Check the structure of the model
    nn.summary()
    return nn

In [56]:
# Create the model
model = create_model(10)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 128)               1122816   
                                                                 
 dense_19 (Dense)            (None, 64)                8256      
                                                                 
 dense_20 (Dense)            (None, 32)                2080      
                                                                 
 dense_21 (Dense)            (None, 1)                 33        
                                                                 
Total params: 1,133,185
Trainable params: 1,133,185
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Train the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ab7946addb0>

In [59]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 1s - loss: 2.4734 - accuracy: 0.4716 - 685ms/epoch - 3ms/step
Loss: 2.4734046459198, Accuracy: 0.4715743362903595


In [54]:
# Export our model to HDF5 file
nn = tf.keras.models.Sequential()
model.save("AlphabetSoupCharity_trial2.h5")