In [1]:
import os
import sys
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # filter out tf warnings - training in a loop causes a retracing efficiency warning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Add parent directory to path to allow import of config.py
sys.path.append("..")

import config as conf

In [2]:
# Read the formatted dataset from the path configured in config.py
data_file = f'{conf.DATA_PATH}{conf.FORMATTED_DATAFILE}'
data = pd.read_csv(data_file)

# Show column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Census Enrollment                14 non-null     float64
 1   Census Fill Rate                 14 non-null     float64
 2   Count of Migrated Data           14 non-null     float64
 3   Current Enrollment               14 non-null     float64
 4   Current Fill Rate                14 non-null     float64
 5   Enrollment Capacity              14 non-null     float64
 6   Ftef                             14 non-null     float64
 7   Ftes                             14 non-null     float64
 8   Ftes/Ftef                        14 non-null     float64
 9   Number Retained                  14 non-null     float64
 10  Number Successful                14 non-null     float64
 11  Retention Rate                   14 non-null     float64
 12  Sec. Count              

In [3]:
# Randomly pick the training rows; every row not picked goes to the test set
training_data = data.sample(frac=conf.TRAIN_TEST_SPLIT)
test_data = data.drop(training_data.index)

# Work on copies so the label column can be popped without touching
# training_data / test_data
training_features, test_features = training_data.copy(), test_data.copy()

# Separate the dependent variable (labels) from the independent variables
# (features): pop() removes the target column from each feature frame and
# returns it
training_labels, test_labels = (
    features.pop(conf.TARGET_VARIABLE)
    for features in (training_features, test_features)
)

In [4]:
# Adapt a normalization layer to the training features only, so that no
# test-set rows contribute to the normalization statistics
training_feature_array = np.array(training_features)

normalizer = preprocessing.Normalization()
normalizer.adapt(training_feature_array)

In [5]:
# Assemble the deep neural network: the adapted normalizer, two ReLU hidden
# layers of conf.UNITS units each, and a single-unit output layer
hidden_layers = [layers.Dense(conf.UNITS, activation='relu') for _ in range(2)]
DNN_model = keras.Sequential([normalizer, *hidden_layers, layers.Dense(1)])

# Optimize mean absolute error with Adam at the configured learning rate
DNN_model.compile(
    optimizer=tf.keras.optimizers.Adam(conf.LEARNING_RATE),
    loss='mean_absolute_error'
)

DNN_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 23)                47        
_________________________________________________________________
dense (Dense)                (None, 64)                1536      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 5,808
Trainable params: 5,761
Non-trainable params: 47
_________________________________________________________________


In [6]:
# Do training run
%%time

history = DNN_model.fit(
    training_features, 
    training_labels,
    verbose=0,
    epochs=conf.TRAINING_EPOCHS
)

UsageError: Line magic function `%%time` not found.


In [None]:
# Plot the training-set loss recorded at each epoch of the fit
loss_per_epoch = history.history['loss']

plt.plot(loss_per_epoch, label='loss')
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Error')

In [None]:
# Evaluate the trained model on the held-out test rows; the model was
# compiled with mean absolute error as its loss, so that is what is reported
test_score = DNN_model.evaluate(
    x=test_features,
    y=test_labels,
    verbose=0
)
print(f'Test MAE: {int(test_score)}')

In [None]:
# Predict enrollment for the test rows and pair the result with the
# actual enrollment numbers for comparison
predicted_enrollment = DNN_model.predict(test_features)
actual_enrollment = test_labels

# Square figure so both axes are drawn at the same scale
sns.set(rc={'figure.figsize':(8,8)})

# Scatter actual (x) against predicted (y) with a fitted regression line
# and no confidence band
ax = sns.regplot(
    x=actual_enrollment,
    y=predicted_enrollment,
    scatter_kws={'s':60},
    fit_reg=True,
    ci=False
)

# Title and axis labels
ax.set_title('Predicted vs actual enrollment', fontsize=18)
ax.set_xlabel('Actual enrollment', fontsize=14)
ax.set_ylabel('Predicted enrollment', fontsize=15)
ax.tick_params(labelsize=12)

# Same fixed range on both axes; y tick labels printed without decimals
axis_range = (4000, 27000)
ax.set_xlim(*axis_range)
ax.set_ylim(*axis_range)
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

Looks pretty good! Still very worried about overfitting with such a small dataset. To augment this approach I am going to try something a little unconventional - bootstrap aggregation! This dataset is so small and our network is so simple, training completes on the order of seconds. We will use the speed of training to our advantage and train multiple times with different randomly chosen subsets of the data. Then we will use the ensemble of trained networks to make predictions. It will make more sense when you see it (I hope)...

First thing is to encapsulate the major operations above into functions so we can easily build and train models in a loop.

In [None]:
def create_datasets(data, training_data_fraction, label_column_name, random_state=None):
    '''Takes master data as pandas dataframe, splits it randomly into
    train and test features and labels, returns a dict of dataframes.

    Args:
        data: pandas dataframe holding the feature columns and the
            label column. It is not modified.
        training_data_fraction: fraction of the rows sampled (without
            replacement) into the training set; all remaining rows form
            the test set.
        label_column_name: name of the column to split off as labels.
        random_state: optional seed forwarded to DataFrame.sample so a
            split can be reproduced. The default (None) gives a
            different random split on every call.

    Returns:
        dict with keys 'training_features', 'test_features',
        'training_labels' and 'test_labels'. The features are
        dataframes without the label column, the labels are the
        matching label column.
    '''

    # Split data randomly into training and testing sets
    training_data = data.sample(
        frac=training_data_fraction,
        random_state=random_state
    )
    test_data = data.drop(training_data.index)

    # Separate dependent and independent variables. Work on copies so
    # popping the label column leaves the sampled frames intact.
    training_features = training_data.copy()
    test_features = test_data.copy()

    training_labels = training_features.pop(label_column_name)
    test_labels = test_features.pop(label_column_name)

    return {
        'training_features': training_features,
        'test_features': test_features,
        'training_labels': training_labels,
        'test_labels': test_labels
    }

In [None]:
def create_normalization_layer(training_features):
    '''Build a tf.keras normalization layer and adapt it to the given
    training features (a pandas dataframe). Returns the adapted layer.'''

    feature_array = np.array(training_features)

    adapted_layer = preprocessing.Normalization()
    adapted_layer.adapt(feature_array)

    return adapted_layer

In [None]:
def build_and_compile_model(normalizer, units, learning_rate):
    '''Build a sequential network from a pre-adapted normalization
    layer, two ReLU hidden layers of `units` units each and a
    single-unit output layer, compile it with mean absolute error loss
    and an Adam optimizer at `learning_rate`, and return the model'''

    # Layers are created in the order they appear in the network
    layer_stack = [normalizer]
    layer_stack.extend(
        layers.Dense(units, activation='relu') for _ in range(2)
    )
    layer_stack.append(layers.Dense(1))

    network = keras.Sequential(layer_stack)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mean_absolute_error'
    )

    return network

In [None]:
# Containers for the ensemble, one entry per trained model: the model
# itself, its training history object and its test-set score
trained_DNN_models, training_history_objects, test_set_scores = [], [], []

# Outer training loop: every pass draws a new random train/test split
# and trains a fresh model on it
for i in range(conf.N_MODELS):
    # Make dataset
    dataset = create_datasets(
        data=data,
        training_data_fraction=conf.TRAIN_TEST_SPLIT,
        label_column_name=conf.TARGET_VARIABLE
    )

    # Adapt normalization layer to this split's training features only
    normalizer = create_normalization_layer(dataset['training_features'])

    # Build model
    dnn_model = build_and_compile_model(
        normalizer=normalizer,
        units=conf.UNITS,
        learning_rate=conf.LEARNING_RATE
    )

    print(f'Training model {i}')

    # Train model (verbose=0 keeps per-epoch logging out of the output)
    history = dnn_model.fit(
        x=dataset['training_features'],
        y=dataset['training_labels'],
        epochs=conf.TRAINING_EPOCHS,
        verbose=0
    )

    # Store results
    trained_DNN_models.append(dnn_model)
    training_history_objects.append(history)

    # Evaluate model on the rows this split held out
    test_score = dnn_model.evaluate(
        x=dataset['test_features'],
        y=dataset['test_labels'],
        verbose=0
    )
    test_set_scores.append(test_score)

    print(f'Model {i} test MAE: {test_score}\n')


In [None]:
# Kernel density estimate of the test-set MAE scores, one score per model
sns.displot(data=test_set_scores, kind="kde")

plt.xlabel('Mean absolute error', fontsize=14)
plt.tick_params(labelsize=12)

plt.show()

OK, still higher than I'd like - but much, much better than the linear bagging model. There the error distribution had its major peak around 10000 - and that was for the whole dataset. This distribution is true test set error for each model.

Let's try to use it to make some predictions. Strategy here will be to loop through the list of trained models, use each one to make a list of predictions and then average them at the end.

In [None]:
# Split master data into features and labels. The label column comes from
# conf.TARGET_VARIABLE, the same setting used when the models were trained,
# so the feature columns passed to predict() match the training features.
X = data.drop(columns=conf.TARGET_VARIABLE)
y = data[conf.TARGET_VARIABLE]

# One array of predictions per trained model. The comprehension keeps its
# loop variable local, so the single DNN_model defined earlier in the
# notebook is not rebound to the last ensemble member.
# NOTE: X is the full dataset, so each model also predicts rows it was
# trained on - this is not a held-out evaluation.
predictions = [trained_model.predict(X) for trained_model in trained_DNN_models]

# Average across models (axis 0) to get one ensemble prediction per row
predicted_enrollment = np.mean(np.array(predictions), axis=0)

In [None]:
# Actual enrollment numbers to compare the ensemble predictions against
actual_enrollment = y

# Square figure so both axes are drawn at the same scale
sns.set(rc={'figure.figsize':(8,8)})

# Scatter actual (x) against predicted (y) with a fitted regression line
# and no confidence band
ax = sns.regplot(
    x=actual_enrollment,
    y=predicted_enrollment,
    scatter_kws={'s':60},
    fit_reg=True,
    ci=False
)

# Title and axis labels
ax.set_title('Predicted vs actual enrollment', fontsize=18)
ax.set_xlabel('Actual enrollment', fontsize=14)
ax.set_ylabel('Predicted enrollment', fontsize=15)
ax.tick_params(labelsize=12)

# Same fixed range on both axes; y tick labels printed without decimals
axis_range = (4000, 27000)
ax.set_xlim(*axis_range)
ax.set_ylim(*axis_range)
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))