### Keras Exploration
___

In [1]:
# data source
from sklearn.datasets import load_diabetes

# data management
import numpy as np
import pandas as pd

# data visualization
import seaborn as sns

# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# model objects
from sklearn.neural_network import MLPRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# metrics & evaluation
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

# MISC.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

**Part 1: Implement a Perceptron**

In [2]:
# importing diabetes dataset
diabetes_ds = load_diabetes()
diabetes_df = pd.DataFrame(diabetes_ds.data, columns=diabetes_ds.feature_names)
diabetes_df['disease_progression'] = diabetes_ds.target

diabetes_df.head(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,disease_progression
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [3]:
# splitting data
X = diabetes_df.iloc[:, :-1]
y = diabetes_df['disease_progression']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)


X_train: (309, 10)
X_test: (133, 10)


In [4]:
# MLP regressor
mlpReg = MLPRegressor(random_state=1, max_iter=3500).fit(X_train, y_train)

y_train_pred = mlpReg.predict(X_train)
y_test_pred = mlpReg.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print('RMSE (Training Data):', rmse_train)
print('RMSE (Testing Data):', rmse_test)

RMSE (Training Data): 53.198991057883404
RMSE (Testing Data): 55.251977123821


**Part 2: Keras Classifier**

In [5]:
# importing and cleaning the titanic dataset (same cleaning from assignment 3)
titanic_df = sns.load_dataset('titanic')
titanic_df.drop(columns=['deck', 'alive', 'class'], inplace=True)  # deck (too many nulls) & alive/class (redundant columns)

# median age per class for imputations
medians = list(titanic_df.groupby(['pclass'])['age'].median())

# function for age imputations
def impute_age(row):
    if pd.isna(row[0]):
        if row[1] == 1:
            return medians[0]
        elif row[1] == 2:
            return medians[1]
        elif row[1] == 3:
            return medians[2]
    else:
        return row[0]

titanic_df['age'] = titanic_df[['age', 'pclass']].apply(impute_age, axis=1)

titanic_df.dropna(inplace=True)  # dropping any remaing records with nulls

# create dummy variables for sex, embarked, who, adult_male, embark_town, alone
titanic_df = pd.get_dummies(titanic_df, columns=['sex', 'embarked', 'who', 'adult_male', 
                                                 'embark_town', 'alone'], drop_first=True)

# create 'label' from survived column
labels = titanic_df['survived']

# create 'features' from dropping survived
features = titanic_df.drop(columns=['survived'])


# 70/30 split for data
X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    test_size=0.3, 
                                                    random_state=17)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

X_train: (622, 14)
X_test: (267, 14)


In [6]:
# creating model
model = Sequential()
model.add(Dense(30, input_dim=len(X_train.columns)))  # input layer
model.add(Dense(20, activation="relu"))  # hidden layer 1
model.add(Dense(5, activation="relu"))  # hidden layer 2
model.add(Dense(1, activation="sigmoid"))  # output layer

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 30)                450       
                                                                 
 dense_1 (Dense)             (None, 20)                620       
                                                                 
 dense_2 (Dense)             (None, 5)                 105       
                                                                 
 dense_3 (Dense)             (None, 1)                 6         
                                                                 
Total params: 1,181
Trainable params: 1,181
Non-trainable params: 0
_________________________________________________________________


In [7]:
# compiling model
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

# fitting model
model.fit(X_train, y_train, epochs=40, verbose=0)

# predicting on the training data
y_train_proba = model.predict(X_train)
y_train_pred = np.round(y_train_proba).astype(int)

# predicting on the test data
y_test_proba = model.predict(X_test)
y_test_pred = np.round(y_test_proba).astype(int)

print('Accuracy (Training Data):', accuracy_score(y_train, y_train_pred))
print('F1 (Training Data):', f1_score(y_train, y_train_pred))
print('\nAccuracy (Testing Data):', accuracy_score(y_test, y_test_pred))
print('F1 (Testing Data):', f1_score(y_test, y_test_pred))

Accuracy (Training Data): 0.7942122186495176
F1 (Training Data): 0.7077625570776256

Accuracy (Testing Data): 0.8277153558052435
F1 (Testing Data): 0.787037037037037


**Part 3: Keras Regressor**

In [8]:
# loading bike share data
bike_df = pd.read_csv('bike_share_hour.csv')  # predicting cnt

# convert columns to categorical
cat_vars = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
bike_df[cat_vars] = bike_df[cat_vars].astype('category')

# scale numerical features with StandardScaler() (temp, atemp, hum, windspeed)
numerical_cols = ['temp', 'atemp', 'hum', 'windspeed']
bike_df[numerical_cols] = StandardScaler().fit_transform(bike_df[numerical_cols])

# dropping unused columns
bike_df = bike_df.drop(columns=['instant', 'dteday', 'casual', 'registered'])

# OHE for categorical columns
bike_df = pd.get_dummies(bike_df, columns=cat_vars, drop_first=True)

# split into training and test data (70/30 split)
features = bike_df.drop('cnt', axis=1)
labels = bike_df['cnt']
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.3,
                                                    random_state=17)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

X_train: (12165, 53)
X_test: (5214, 53)


In [9]:
# create model
model = Sequential()
model.add(Dense(30, input_dim=len(X_train.columns), activation="relu"))  # input layer
model.add(Dense(20, activation="relu"))  # hidden layer 1
model.add(Dense(5, activation="relu"))  # hidden layer 2
model.add(Dense(1))  # output layer

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 30)                1620      
                                                                 
 dense_5 (Dense)             (None, 20)                620       
                                                                 
 dense_6 (Dense)             (None, 5)                 105       
                                                                 
 dense_7 (Dense)             (None, 1)                 6         
                                                                 
Total params: 2,351
Trainable params: 2,351
Non-trainable params: 0
_________________________________________________________________


In [10]:
# compiling model
model.compile(loss='mean_squared_error', optimizer="adam", metrics=['mse'])

# fitting model
model.fit(X_train, y_train, epochs=40, verbose=0)

# predicting on the training data
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))

# predicting on the test data
y_test_pred = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print('RMSE (Training Data):', rmse_train)
print('RMSE (Testing Data):', rmse_test)

RMSE (Training Data): 41.7425021316964
RMSE (Testing Data): 45.39439708941542


**Part 4: Tuning the Keras Regressor**

In [11]:
# create model function
def build_model(optimizer="adam"):
    model = Sequential()
    model.add(Dense(30, input_dim=len(X_train.columns), activation="relu"))  # input layer
    model.add(Dense(20, activation="relu"))  # hidden layer 1
    model.add(Dense(5, activation="relu"))  # hidden layer 2
    model.add(Dense(1))  # output layer
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

    return model

model = KerasRegressor(build_fn=build_model, verbose=0)

optimizers = ['SGD', 'RMSprop', 'Adam', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl']
param_grid = dict(optimizer=optimizers)

gs = GridSearchCV(estimator=model,
                  param_grid=param_grid,
                  cv=5,
                  n_jobs=-1,
                  scoring='neg_mean_squared_error',
                  verbose=0)

gs_model = gs.fit(X_train, y_train, epochs=40)

print("--GridSearch Results--")
print("Best RMSE:", np.sqrt(-gs_model.best_score_))
print("Best Parameters", gs_model.best_params_)

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/usr/local/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/usr/local/lib/python3.8/site-packages/sklearn/metrics/_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/usr/local/lib/python3.8/site-packages/sklearn/metrics/_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "/usr/local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 800, in check_array
    _assert_all_finite(array, allow_nan=force_all_finite == "allow-na

--GridSearch Results--
Best RMSE: 47.255342964272145
Best Parameters {'optimizer': 'RMSprop'}


In [12]:
# Test Data
y_test_pred = gs_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test Data")
print("RMSE:", rmse_test)

Test Data
RMSE: 44.95981888524411
