In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm

In [19]:
def get_model(nodes_in_hidden_layers, dropout, nr_of_inputs):
    """Build and compile a fully connected regression network.

    Args:
        nodes_in_hidden_layers: Sequence with the number of units of each
            hidden layer, in order.
        dropout: Dropout rate inserted in front of every hidden layer except
            the first one. A value of 0 disables dropout entirely.
        nr_of_inputs: Number of input features; used as the input shape of
            the first hidden layer.

    Returns:
        A compiled Keras ``Sequential`` model with ReLU hidden layers, a single
        linear output unit, Adam (learning rate 0.001) as optimizer and mean
        squared error as both loss and reported metric.
    """
    model = Sequential()
    for layer_index, units in enumerate(nodes_in_hidden_layers):
        if layer_index == 0:
            # The first hidden layer also declares the input shape.
            model.add(Dense(units, activation='relu', input_shape=(nr_of_inputs,)))
            continue
        # Dropout is only placed between hidden layers, never before the first.
        if dropout != 0:
            model.add(Dropout(dropout))
        model.add(Dense(units, activation='relu'))
    model.add(Dense(1, activation="linear"))  # Output layer
    # `metrics` must be a real list: ("mean_squared_error") is just a string.
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss="mean_squared_error",
        metrics=["mean_squared_error"],
    )
    return model

In [3]:
def cross_validation(x_train, y_train, nodes, epochs, dropout, patience):
    """Estimate the validation MSE of a network configuration with 5-fold CV.

    For every fold a fresh model is built with ``get_model`` and trained on
    the training part while the held-out part is passed as validation data.
    Early stopping and the per-fold score are both based on the validation
    metric, so the returned value reflects performance on unseen rows rather
    than the training error.

    Args:
        x_train: 2-D array of features, indexable by integer index arrays.
        y_train: 1-D array of targets aligned with ``x_train``.
        nodes: Hidden layer sizes forwarded to ``get_model``.
        epochs: Maximum number of epochs per fold.
        dropout: Dropout rate forwarded to ``get_model``.
        patience: Early-stopping patience in epochs.

    Returns:
        The mean over all folds of the lowest validation MSE seen in the fold.
    """
    kf = KFold(n_splits=5)
    fold_errors = []
    for train_index, validation_index in kf.split(x_train):
        model = get_model(nodes, dropout, len(x_train[0]))
        # Keras logs validation metrics under the "val_" prefix.
        es = EarlyStopping(monitor='val_mean_squared_error', mode='min', verbose=1, patience=patience)
        history = model.fit(
            x_train[train_index],
            y_train[train_index],
            validation_data=(x_train[validation_index], y_train[validation_index]),
            epochs=epochs,
            verbose=1,
            callbacks=[es],
        )
        fold_errors.append(min(history.history['val_mean_squared_error']))
    return sum(fold_errors) / len(fold_errors)

In [48]:
def get_all_features(data):
    """One-hot encode every calendar/station column into one feature matrix.

    Reads the columns "Base Station", "Year", "Month", "Day", "Week",
    "Weekday" and "Hour" from ``data``. The hour is not emitted directly; it
    is collapsed into six four-hour "daytime" buckets.

    Returns:
        A 2-D float array with 19 + 4 + 12 + 31 + 53 + 7 + 6 = 132 columns per
        row, in the order station, year, month, day, week, weekday, daytime.
    """
    # Row k of an identity matrix is the one-hot vector for category k.
    blocks = [
        np.eye(19)[data["Base Station"]],
        np.eye(4)[data["Year"] - 2015],
        np.eye(12)[data["Month"] - 1],
        np.eye(31)[data["Day"] - 1],
        np.eye(53)[data["Week"] - 1],
        np.eye(7)[data["Weekday"] - 1],
    ]

    # Group the 24 hour columns into 6 consecutive groups of 4 and add each
    # group up, giving a one-hot vector over the four-hour periods.
    hour_one_hot = np.eye(24)[data["Hour"]]
    daytime = hour_one_hot.reshape(len(data), 6, 4).sum(axis=2)
    blocks.append(daytime)

    return np.concatenate(blocks, axis=1)

In [24]:
def get_features(data, period_hours=4):
    """Return the raw (non one-hot) feature matrix, with hours bucketed.

    Side effect: a "Daytime" column is added to (or overwritten in) ``data``.

    Args:
        data: DataFrame with the columns "Base Station", "Year", "Month",
            "Day", "Week", "Weekday" and "Hour".
        period_hours: Length of one daytime period in hours. The default of 4
            yields the periods 0..5.

    Returns:
        A 2-D array with the columns "Base Station", "Year", "Month", "Day",
        "Week", "Weekday", "Daytime" in that order.
    """
    periods = 24 // period_hours

    def get_daytime(hour):
        # Hours inside the covered range map to their period index. Anything
        # else (negative, too large, NaN) maps to the first unused index, so
        # the fallback never collides with a real period.
        if 0 <= hour < periods * period_hours:
            return int(hour // period_hours)
        return periods

    # Map the Hour column to the Daytime column using get_daytime
    data["Daytime"] = data["Hour"].apply(get_daytime)

    return data[["Base Station", "Year", "Month", "Day", "Week", "Weekday", "Daytime"]].values

In [5]:
def revert_all_features(data):
    """Decode a one-hot feature matrix back into per-row category values.

    ``data`` is sliced by fixed column ranges; within each range the position
    of the largest value is taken and shifted by a constant.

    Returns:
        A list with one tuple per row:
        (base_station, year, month, day, week, weekday, daytime).
    """
    # (first column, one-past-last column, value added to the argmax)
    layout = (
        (0, 19, 0),        # base station
        (19, 23, 2015),    # year
        (23, 35, 1),       # month
        (35, 66, 1),       # day
        (66, 119, 1),      # week
        (119, 126, 1),     # weekday
        (126, None, 0),    # daytime: all remaining columns
    )
    decoded = [
        np.argmax(data[:, start:stop], axis=1) + shift
        for start, stop, shift in layout
    ]
    return list(zip(*decoded))

In [27]:
def revert_features(data):
    """Decode a matrix with one-hot groups and a scalar year column.

    Columns 0-18 are read as a one-hot base station, column 19 as a year
    offset (2015 is added), and the following ranges as one-hot month, day,
    week, weekday and daytime.

    NOTE(review): this layout does not match the seven raw columns returned by
    get_features in this notebook — confirm which encoder it belongs to.

    Returns:
        A list with one tuple per row:
        (base_station, year, month, day, week, weekday, daytime).
    """
    def decode(start, stop, shift=0):
        # Position of the largest value inside the column range, plus a shift.
        return np.argmax(data[:, start:stop], axis=1) + shift

    columns = (
        decode(0, 19),           # base station
        data[:, 19] + 2015,      # year is stored as an offset, not one-hot
        decode(20, 32, 1),       # month
        decode(32, 63, 1),       # day
        decode(63, 116, 1),      # week
        decode(116, 123, 1),     # weekday
        decode(123, None),       # daytime: all remaining columns
    )
    return list(zip(*columns))

In [6]:
def revert_selected_features(data):
    """Decode a reduced one-hot matrix holding only month, weekday and hour.

    Columns 0-11 are read as the month, columns 12-18 as the weekday and all
    remaining columns as the hour.

    Returns:
        A list with one (month, weekday, hour) tuple per row.
    """
    month_block = data[:, :12]
    weekday_block = data[:, 12:19]
    hour_block = data[:, 19:]

    # Month and weekday are 1-based, the hour is 0-based.
    months = np.argmax(month_block, axis=1) + 1
    weekdays = np.argmax(weekday_block, axis=1) + 1
    hours = np.argmax(hour_block, axis=1)

    return [row for row in zip(months, weekdays, hours)]

In [20]:
def preprocess(data, scaler):
    """Split a raw DataFrame into a feature matrix and a target vector.

    Args:
        data: DataFrame containing the feature columns used by get_features
            and an "Incidents" column.
        scaler: Accepted for interface compatibility but currently unused;
            the target is returned as raw counts without log or scaler
            transformation.

    Returns:
        Tuple ``(x, y)`` of the array produced by ``get_features(data)`` and
        the "Incidents" column as a NumPy array.
    """
    features = get_features(data)
    targets = data["Incidents"].to_numpy()
    return features, targets

In [10]:
DISTRIBUTION_FILE = "data/incidents_distribution_station_count.csv"
DISTRIBUTION_TEST_FILE = "data/incidents_distribution_station_count_test.csv"

# Train and test files are read with identical parser options, train first.
data, data_test = (
    pd.read_csv(csv_path, encoding='utf-8', escapechar='\\', parse_dates=True)
    for csv_path in (DISTRIBUTION_FILE, DISTRIBUTION_TEST_FILE)
)

In [25]:
# Build the train and test sets from their separate CSV files.
# Both calls receive the same scaler instance: preprocess() currently ignores
# it, but a shared instance keeps train and test consistent should target
# scaling be enabled there.
scaler = MinMaxScaler()
X_train, Y_train = preprocess(data, scaler)
X_test, Y_test = preprocess(data_test, scaler)

In [17]:
# Alternative to the file-based split: hold out a random 20% of the training
# file. Running this cell rebinds scaler, X_train, X_test, Y_train and Y_test.
scaler = MinMaxScaler()
x, y = preprocess(data, scaler)
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Train a four-hidden-layer network; training stops early as soon as the
# training MSE fails to improve for one epoch.
layers = [16, 32, 16, 8]
model = get_model(
    nodes_in_hidden_layers=layers,
    dropout=0.4,
    nr_of_inputs=len(X_train[0]),
)
es = EarlyStopping(
    monitor='mean_squared_error',
    mode='min',
    verbose=1,
    patience=1,
)
model.fit(X_train, Y_train, epochs=10, verbose=1, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping


<keras.callbacks.History at 0x14a86c8eb10>

In [16]:
# Poisson regression of the incident counts on the feature matrix.
# The features are passed as-is; no constant column is added in this cell.
model = sm.GLM(endog=Y_train, exog=X_train, family=sm.families.Poisson()).fit()

# Show the fitted coefficients and fit statistics.
print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:               663024
Model:                            GLM   Df Residuals:                   663017
Model Family:                 Poisson   Df Model:                            6
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -6.7596e+05
Date:                Thu, 06 Apr 2023   Deviance:                   7.9049e+05
Time:                        19:09:12   Pearson chi2:                 9.05e+05
No. Iterations:                     5   Pseudo R-squ. (CS):            0.02381
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Base Station     0.0150      0.000     49.555   

In [27]:
# NOTE(review): `model` is assigned both by the Keras training cell and by the
# Poisson GLM cell, so which model produces these predictions depends on the
# order in which the cells were executed — confirm before trusting the output.
predictions = model.predict(X_test)



In [29]:
# Number of (prediction, target, features) rows to print; printing every test
# row floods the cell output.
PREVIEW_ROWS = 20

mse = mean_squared_error(Y_test, predictions)
print("Mean squared error scaled: ", mse)

# Inverse-transform path, kept for reference while target scaling is disabled:
#predictions_inversed = np.expm1(scaler.inverse_transform(predictions))
#Y_test_inversed = np.expm1(scaler.inverse_transform(Y_test.reshape(-1, 1)))
#mse = mean_squared_error(Y_test, predictions_inversed)
#print("Mean squared error: ", mse)

#X_test_reverted = revert_features(X_test)

for prediction, target, features in zip(
    predictions[:PREVIEW_ROWS], Y_test[:PREVIEW_ROWS], X_test[:PREVIEW_ROWS]
):
    print(prediction, target, features)

# Disabled report on the inverse-transformed values. Kept as comments rather
# than a string literal, which would be echoed as the cell's output.
# print("pred target station year month day week weekday hour")
# for i in range(len(predictions_inversed)):
#     p = round(predictions_inversed[i][0], 3)
#     t = round(Y_test_inversed[i][0], 3)
#     print(p, t, X_test_reverted[i])


Mean squared error scaled:  0.73867397398514
[0.55967236] 0 [   0 2017    8    7   32    1    0]
[0.55967236] 0 [   0 2017    8    7   32    1    0]
[0.55967236] 0 [   0 2017    8    7   32    1    0]
[0.55967236] 0 [   0 2017    8    7   32    1    0]
[0.55967236] 0 [   0 2017    8    7   32    1    1]
[0.55967236] 0 [   0 2017    8    7   32    1    1]
[0.55967236] 0 [   0 2017    8    7   32    1    1]
[0.55967236] 1 [   0 2017    8    7   32    1    1]
[0.55967236] 0 [   0 2017    8    7   32    1    2]
[0.55967236] 1 [   0 2017    8    7   32    1    2]
[0.55967236] 0 [   0 2017    8    7   32    1    2]
[0.55967236] 1 [   0 2017    8    7   32    1    2]
[0.55967236] 0 [   0 2017    8    7   32    1    3]
[0.55967236] 0 [   0 2017    8    7   32    1    3]
[0.55967236] 1 [   0 2017    8    7   32    1    3]
[0.55967236] 1 [   0 2017    8    7   32    1    3]
[0.55967236] 1 [   0 2017    8    7   32    1    4]
[0.55967236] 0 [   0 2017    8    7   32    1    4]
[0.55967236] 0 [   

' print("pred target station year month day week weekday hour")\nfor i in range(len(predictions_inversed)):\n    p = round(predictions_inversed[i][0], 3)\n    t = round(Y_test_inversed[i][0], 3)\n    print(p, t, X_test_reverted[i]) '

In [None]:
# Cross-validate the current layer configuration.
avg_ce = cross_validation(
    x_train=X_train,
    y_train=Y_train,
    nodes=layers,
    epochs=10,
    dropout=0.2,
    patience=5,
)