## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#  Import and read the charity_data.csv.
import pandas as pd 
df = pd.read_csv("../Resources/charity_data.csv").set_index('EIN')
df.head(2)

Unnamed: 0_level_0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
EIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1


In [3]:
df.nunique()

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [4]:
df['STATUS'].value_counts()

1    34294
0        5
Name: STATUS, dtype: int64

In [5]:
# Filter certain columns and keep only rows with the given value/s
# A status of 0 might mean ongoing loan, which might have bad data for training the model
filter_columns = [('STATUS', [1])]

# Create list of columns to drop.
# Reverse the decision from deel_learning2.  If CLASSIFICATION mostly explains each of the other parameters, the other parameters combined probably explain CLASSIFICATION
drop_columns = ['NAME', 'CLASSIFICATION', 'STATUS']


In [6]:
# First filter rows
for col, vals in filter_columns:
    df = df[df[col].isin(vals)]

# Then drop columns
# Drop columns to be dropped
df.drop(columns=drop_columns, inplace=True)


In [7]:
# Determine the number of unique values in each column.
df.nunique()

APPLICATION_TYPE            17
AFFILIATION                  6
USE_CASE                     5
ORGANIZATION                 4
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [8]:
# Look at APPLICATION_TYPE value counts for binning
vc = df['APPLICATION_TYPE'].value_counts()
vc

T3     27032
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T14        3
T25        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [9]:
# Choose a cutoff value and create a list of application types to be replaced
app_repl = vc.loc[lambda x: x<= vc[5]].reset_index()['index']
app_repl

0      T8
1      T7
2     T10
3      T9
4     T13
5     T12
6      T2
7     T14
8     T25
9     T29
10    T15
11    T17
Name: index, dtype: object

In [10]:
# Replace in dataframe
for val in app_repl:
    df['APPLICATION_TYPE'] = df['APPLICATION_TYPE'].replace(val,"Other")

# Check to make sure binning was successful
df['APPLICATION_TYPE'].value_counts()

T3       27032
Other     2266
T4        1542
T6        1216
T5        1173
T19       1065
Name: APPLICATION_TYPE, dtype: int64

In [11]:
df.head(2)

Unnamed: 0_level_0,APPLICATION_TYPE,AFFILIATION,USE_CASE,ORGANIZATION,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
EIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10520599,Other,Independent,ProductDev,Association,0,N,5000,1
10531628,T3,Independent,Preservation,Co-operative,1-9999,N,108590,1


In [12]:
# Convert categorical data to numeric with `pd.get_dummies`
dums = pd.get_dummies(df)
print(dums.shape)
dums.head(2)

(34294, 34)


Unnamed: 0_level_0,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
EIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10520599,5000,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10531628,108590,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [13]:
# Split our preprocessed data into our features and target arrays
y = dums['IS_SUCCESSFUL']
X = dums.drop(columns = 'IS_SUCCESSFUL')

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345)

In [14]:
scale_cols = ['ASK_AMT']

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer([
        ('ct', StandardScaler(), scale_cols)
     ], remainder='passthrough')

X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.fit_transform(X_test)

## Compile, Train and Evaluate the Model

In [15]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
layers = [
    (tf.keras.layers.Dense, {'units' : 100, 'activation': 'relu', 'input_dim': X_train_scaled.shape[1]}),
    (tf.keras.layers.Dense, {'units' : 50, 'activation': 'relu'}),
    (tf.keras.layers.Dense, {'units' : 50, 'activation': 'relu'}),
    (tf.keras.layers.Dense, {'units' : 50, 'activation': 'relu'}),
    (tf.keras.layers.Dropout, {'rate' : 0.3}),
    (tf.keras.layers.Dense, {'units' : 1, 'activation': 'sigmoid'})
]

In [16]:

nn = tf.keras.models.Sequential()

for layer, params in layers:
    nn.add(layer(**params))

# Check the structure of the model
nn.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3400      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Tota

In [17]:
# Compile the model
nn.compile(optimizer = 'adam', loss='mse', metrics=['accuracy'])

In [18]:
# Train the model
model = nn.fit(X_train_scaled, y_train, epochs = 200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

8574/8574 - 0s - loss: 0.1939 - acc: 0.7260
Loss: 0.19389839457525695, Accuracy: 0.7260321974754333


In [20]:
# Export our model to HDF5 file
nn.save('mse_dropout_layer_model.h5')