## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf


In [2]:
!pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.3.5 kt-legacy-1.0.5


In [3]:
# keras_tuner is not part of default google colab, so had to be installed and then imported separately
import keras_tuner as kt

In [4]:
#  Import and read the charity_data.csv.
char_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
char_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [5]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.

charities_df = char_df.copy()

charities_df.drop(['EIN', 'NAME'], axis=1, inplace=True)

charities_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [6]:
# Determine the number of unique values in each column.
unique_counts = charities_df.nunique()
print(unique_counts)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [7]:
# Look at APPLICATION_TYPE value counts for binning

app_type_counts = charities_df['APPLICATION_TYPE'].value_counts()
app_type_counts

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [8]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`

# for optimization, increase cutoff to result in two fewer bins
cutoff = 528

application_types_to_replace = app_type_counts[app_type_counts < cutoff].index.tolist()

# Replace in dataframe
for app in application_types_to_replace:
    charities_df['APPLICATION_TYPE'] = charities_df['APPLICATION_TYPE'].replace(app, "Other")

# Check to make sure binning was successful
charities_df['APPLICATION_TYPE'].value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [9]:
# Look at CLASSIFICATION value counts for binning
classification_counts = charities_df['CLASSIFICATION'].value_counts()
classification_counts

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [10]:
# You may find it helpful to look at CLASSIFICATION value counts > 1
for cls_count in classification_counts:
  if cls_count > 1 :
    print(cls_count)

17326
6074
4837
1918
1883
777
287
194
116
114
104
95
75
58
50
36
34
32
32
30
20
18
16
15
15
14
11
10
10
9
9
7
6
6
6
5
5
3
3
3
2
2
2
2
2


In [11]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`

classification_cutoff = 1000

classification_to_replace = classification_counts[classification_counts < classification_cutoff].index.tolist()

# Replace in dataframe
for cls in classification_to_replace:
    charities_df['CLASSIFICATION'] = charities_df['CLASSIFICATION'].replace(cls, "Other")

# Check to make sure binning was successful
charities_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [12]:
# For optimization, examine additional columns for possible binning.  After examination, leave "INCOME_AMT" groupings as is.
income_counts = charities_df['INCOME_AMT'].value_counts()
income_counts

0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: INCOME_AMT, dtype: int64

In [13]:
# For optimization, examine the "SPECIAL_CONSIDERATIONS" column for possible binning or deletion
status_counts = charities_df['STATUS'].value_counts()
status_counts

1    34294
0        5
Name: STATUS, dtype: int64

In [14]:
# for optimization, examine "SPECIAL_CONSIDERATIONS" column for possible binning or deletion
special_counts = charities_df['SPECIAL_CONSIDERATIONS'].value_counts()
special_counts

N    34272
Y       27
Name: SPECIAL_CONSIDERATIONS, dtype: int64

In [15]:
# Drop the 'SPECIAL_CONSIDERATIONS' column because its 27 "yes" members may create skewing or bias in the 34,999 member dataset
# where every other member is "no".  These 27 would have to be examined separately, not as part of a general model.
# Drop the "STATUS" column for a similar reason.   The five items with a value of "0" can be examined separately and should not be used to
# make general predications about the other items, all of which have a value of "1".

charities_df.drop(['SPECIAL_CONSIDERATIONS', 'STATUS'], axis=1, inplace=True)
charities_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


In [16]:
# For optimization, examine additional column "USE_CASE" for possible binning.
use_counts = charities_df['USE_CASE'].value_counts()
use_counts

Preservation     28095
ProductDev        5671
CommunityServ      384
Heathcare          146
Other                3
Name: USE_CASE, dtype: int64

In [17]:
# For optimization, examine additional column "ORGANIZATION" for possible binning.
org_counts = charities_df['ORGANIZATION'].value_counts()
org_counts

Trust           23515
Association     10255
Co-operative      486
Corporation        43
Name: ORGANIZATION, dtype: int64

In [18]:
# For optimization, examine additional column "AFFLIATION" for possible binning.
affiliation_counts = charities_df['AFFILIATION'].value_counts()
affiliation_counts

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

In [19]:
# Choose a cutoff value and create a list of affiliations to be replaced

aff_cutoff = 100

affiliation_to_replace = affiliation_counts[affiliation_counts < aff_cutoff].index.tolist()

# Replace in dataframe
for aff in affiliation_to_replace:
    charities_df['AFFILIATION'] = charities_df['AFFILIATION'].replace(aff, "Other")

# Check to make sure binning was successful
charities_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [20]:
# Redisplay for charting

charities_df.head(32000)

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1
...,...,...,...,...,...,...,...,...
31995,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,1
31996,T3,Independent,C1000,Preservation,Trust,1M-5M,2290949,1
31997,T3,Independent,C1000,Preservation,Trust,100000-499999,29612,0
31998,T3,Independent,C1000,Preservation,Association,0,5000,1


In [21]:
# Determine categorical data columns
charities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   APPLICATION_TYPE  34299 non-null  object
 1   AFFILIATION       34299 non-null  object
 2   CLASSIFICATION    34299 non-null  object
 3   USE_CASE          34299 non-null  object
 4   ORGANIZATION      34299 non-null  object
 5   INCOME_AMT        34299 non-null  object
 6   ASK_AMT           34299 non-null  int64 
 7   IS_SUCCESSFUL     34299 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 2.1+ MB


In [22]:
# Convert categorical data to numeric with `pd.get_dummies`
df_encoded = pd.get_dummies(charities_df, columns=['APPLICATION_TYPE',	'AFFILIATION',	'CLASSIFICATION', 	'USE_CASE', 	'ORGANIZATION', 'INCOME_AMT'])

In [23]:
# check the results
df_encoded.head()

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,108590,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,5000,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,6692,1,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,142590,1,0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [24]:
# Examine the encoded data
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   ASK_AMT                       34299 non-null  int64
 1   IS_SUCCESSFUL                 34299 non-null  int64
 2   APPLICATION_TYPE_Other        34299 non-null  uint8
 3   APPLICATION_TYPE_T10          34299 non-null  uint8
 4   APPLICATION_TYPE_T19          34299 non-null  uint8
 5   APPLICATION_TYPE_T3           34299 non-null  uint8
 6   APPLICATION_TYPE_T4           34299 non-null  uint8
 7   APPLICATION_TYPE_T5           34299 non-null  uint8
 8   APPLICATION_TYPE_T6           34299 non-null  uint8
 9   APPLICATION_TYPE_T7           34299 non-null  uint8
 10  APPLICATION_TYPE_T8           34299 non-null  uint8
 11  AFFILIATION_CompanySponsored  34299 non-null  uint8
 12  AFFILIATION_Independent       34299 non-null  uint8
 13  AFFILIATION_Other             3

In [25]:
# Split the  preprocessed data into our features and target arrays

y = df_encoded['IS_SUCCESSFUL'].values
X = df_encoded.drop(columns="IS_SUCCESSFUL").values


In [26]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [27]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [28]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

# get the number of input features
number_input_features = len(X_train[0])
print(f"Number of input features: {number_input_features}")

hidden_layer1 = 2 * number_input_features
hidden_layer2 = 2 * number_input_features
hidden_layer3 = 2 * number_input_features
hidden_layer4 = number_input_features

nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_layer2, activation="relu"))

# Third hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_layer3, activation="relu"))

# Fourth hidden layer
nn_model.add(tf.keras.layers.Dense(units=hidden_layer4, activation="sigmoid"))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

Number of input features: 37
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 74)                2812      
                                                                 
 dense_1 (Dense)             (None, 74)                5550      
                                                                 
 dense_2 (Dense)             (None, 74)                5550      
                                                                 
 dense_3 (Dense)             (None, 37)                2775      
                                                                 
 dense_4 (Dense)             (None, 1)                 38        
                                                                 
Total params: 16725 (65.33 KB)
Trainable params: 16725 (65.33 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________

In [29]:
# Compile the model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [30]:
# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [31]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - loss: 0.5608 - accuracy: 0.7242 - 543ms/epoch - 2ms/step
Loss: 0.5608066916465759, Accuracy: 0.7241982221603394


In [32]:
# Export the model to HDF5 file

nn_model.save('AlphabetSoupCharity_Optimization3.h5')


  saving_api.save_model(
