In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.losses import Huber
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Last inn data

In [4]:
# Load the grouped origin-destination trip data.
data = pd.read_csv('Data/data_grouped.csv')
# Shorten the zone column names and give the leisure column a readable name.
data = data.rename(columns={'fra-sone-6': 'fra-sone', 'til-sone-6': 'til-sone', 'fri_hen_priv': 'fritid'})

# Lookup table with exactly one population value per zone. De-duplicating on
# the zone key (rather than on the zone/population pair) keeps the merge below
# from multiplying rows if a zone ever appears with more than one population.
pop_lookup = (
    data[['fra-sone', 'befolkning']]
    .drop_duplicates(subset='fra-sone')
    .rename(columns={'fra-sone': 'sone', 'befolkning': 'til_befolkning'})
)

# Attach the destination-zone population. `validate` makes pandas raise if the
# lookup key is not unique, so the merge cannot change the number of rows.
# Destination zones that never occur as an origin get NaN in `til_befolkning`.
df = data.merge(pop_lookup, left_on='til-sone', right_on='sone', how='left', validate='many_to_one')
df = df.drop(columns='sone')

data_grouped = df

### Forbered til DNN

In [None]:

# Work on a copy so the loaded frame stays untouched.
data_prep = data_grouped.copy()

total_rows = len(data_prep)


# Integer-encode the zones and the time period (used only as embedding indices).
label_encoder_fra = LabelEncoder()
data_prep['fra-sone-encoded'] = label_encoder_fra.fit_transform(data_prep['fra-sone'])

label_encoder_til = LabelEncoder()
data_prep['til-sone-encoded'] = label_encoder_til.fit_transform(data_prep['til-sone'])

label_encoder_tid = LabelEncoder()
data_prep['tid-encoded'] = label_encoder_tid.fit_transform(data_prep['tid'])

# Categorical model inputs.
X_fra_sone = data_prep['fra-sone-encoded'].values
X_til_sone = data_prep['til-sone-encoded'].values
X_tid = data_prep['tid-encoded'].values

# Numeric model inputs. The column names must match what the loading cell
# produces: it renames `fri_hen_priv` to `fritid` and names the merged
# destination population `til_befolkning`.
num_features = ['befolkning', 'til_befolkning', 'arbeidsplasser', 'handel', 'fritid', 'distanse']
missing_cols = [col for col in num_features if col not in data_prep.columns]
if missing_cols:
    raise KeyError(f"Missing numeric feature columns in data_prep: {missing_cols}")
X_num = data_prep[num_features].to_numpy(dtype=float)
# The left merge leaves NaN for destination zones without a population entry;
# NaN inputs would turn the training loss into NaN, so stop early instead.
if np.isnan(X_num).any():
    raise ValueError("Numeric features contain NaN; impute or drop those rows before training.")

y = data_prep['reiser'].values  # Target: number of trips

# Optional target transforms that were tried for the skewed target (disabled):
#y = np.log1p(y)  # log(1 + y) avoids log(0)
#pt_y = PowerTransformer()
#y = pt_y.fit_transform(y.reshape(-1, 1))

# Split every input array and the target with one shared shuffle.
X_fra_train, X_fra_test, X_til_train, X_til_test, X_tid_train, X_tid_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_fra_sone, X_til_sone, X_tid, X_num, y, test_size=0.2, random_state=42
)



In [None]:
# Standardise the numeric features. The scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train)
X_num_test = scaler.transform(X_num_test)


def _embedding_branch(name, n_classes, output_dim):
    """Return ``(input, flattened_embedding)`` for one integer-coded categorical feature."""
    cat_input = Input(shape=(1,), name=f"{name}_input")
    embedded = Embedding(
        input_dim=n_classes + 1,  # one spare row beyond the encoded labels, to avoid index errors
        output_dim=output_dim,
        name=f"{name}_embedding",
    )(cat_input)
    return cat_input, Flatten()(embedded)


def build_model():
    """Build and compile a fresh, untrained model.

    Every layer is created inside this function, so each call returns a model
    with its own newly initialised weights. (Creating the layers once at cell
    level and only wrapping them in ``Model`` here would make all returned
    models share and keep training the same weights.)

    Reads ``label_encoder_fra``, ``label_encoder_til``, ``label_encoder_tid``
    and ``X_num_train`` from the notebook namespace to size the inputs.

    Returns:
        A compiled Keras ``Model`` taking
        ``[fra_sone, til_sone, tid, numerical_features]`` as inputs.
    """
    fra_sone_input, fra_sone_flat = _embedding_branch("fra_sone", len(label_encoder_fra.classes_), 50)
    til_sone_input, til_sone_flat = _embedding_branch("til_sone", len(label_encoder_til.classes_), 50)
    tid_input, tid_flat = _embedding_branch("tid", len(label_encoder_tid.classes_), 3)

    num_input = Input(shape=(X_num_train.shape[1],), name="numerical_features")

    # Concatenate the embeddings with the numeric features.
    x = Concatenate()([fra_sone_flat, til_sone_flat, tid_flat, num_input])

    # Dense -> BatchNorm -> ReLU (-> Dropout) blocks.
    for units, dropout_rate in ((128, 0.3), (64, 0.2), (32, None)):
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        if dropout_rate is not None:
            x = Dropout(dropout_rate)(x)

    # ReLU on the output keeps the predictions non-negative.
    # NOTE(review): the previous comment claimed a 'linear' output while the
    # code used 'relu'; the code is kept as-is -- confirm which one is intended.
    output = Dense(1, activation="relu", name="output_layer")(x)

    model = Model(inputs=[fra_sone_input, til_sone_input, tid_input, num_input], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0001),
                  loss='mean_squared_error', metrics=["mae", "mse"])
    return model


model = build_model()

# Print the model overview.
model.summary()

### Tren

In [None]:
import time

# Early stopping against overfitting. The monitored validation loss comes from
# a slice held out of the *training* data: with `restore_best_weights=True` the
# validation data selects the final weights, so using the test set here would
# leak it into model selection and make the test metrics optimistic.
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

start_time = time.time()
# Train the model. `validation_split` takes the last 10 % of the (already
# shuffled) training arrays as validation data.
model.fit(
    [X_fra_train, X_til_train, X_tid_train, X_num_train], y_train,
    epochs=140, batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping]
)
end_time = time.time()
print(f"Total training time: {end_time - start_time} seconds")

### Resultater

In [None]:
# Predict on the test set and time the call.
test_inputs = [X_fra_test, X_til_test, X_tid_test, X_num_test]
st = time.time()
y_pred = model.predict(test_inputs)
en = time.time()
print(f"Total prediction time: {en - st} seconds")

# The target is currently modelled untransformed, so the test targets are used
# directly. If a log or power transform of the target is enabled, predictions
# and targets must be inverse-transformed here before scoring.
y_test_real = y_test

# Score the predictions on the original scale.
mse, mae, r2 = (
    metric(y_test_real, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2): {r2:.2f}")


### Plotte predikert mot faktiske verdier

In [None]:
# matplotlib is imported here because this is the first cell that plots; without
# it the cell fails with a NameError on a fresh kernel.
import matplotlib.pyplot as plt


def _plot_true_vs_pred(y_true, y_hat, title):
    """Scatter predictions against true values with a y = x reference line."""
    lo, hi = np.min(y_true), np.max(y_true)
    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_hat, alpha=0.5, label='Predicted vs True')
    plt.plot([lo, hi], [lo, hi], '--r', label='Perfect Prediction')
    plt.xlabel('True Values (y_test)')
    plt.ylabel('Predicted Values (y_pred)')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


# All test rows.
_plot_true_vs_pred(y_test, y_pred, 'True vs Predicted Values - DNN')

# Only rows with at most 10 000 true trips, to make the dense region readable.
mask = y_test <= 10000
filtered_y_test = y_test[mask]
filtered_y_pred = y_pred[mask]

_plot_true_vs_pred(filtered_y_test, filtered_y_pred, 'True vs Predicted Values (Filtered)')

### SHAP

In [None]:

# Take the first 1000 training rows of every input array as the SHAP sample.
X_fra_sample, X_til_sample, X_tid_sample, X_num_sample = (
    train_array[:1000]
    for train_array in (X_fra_train, X_til_train, X_tid_train, X_num_train)
)


In [None]:
def model_predict(X):
    """Adapter that lets SHAP call the multi-input model with one 2-D array.

    Args:
        X: Array of shape (n_rows, 3 + n_numeric). Columns 0-2 hold the encoded
            `fra-sone`, `til-sone` and `tid` values; the remaining columns hold
            the numeric features.

    Returns:
        1-D array with one prediction per row. The trailing output axis of the
        Keras prediction is flattened so the resulting SHAP values are
        (rows, features), which is what the later column slicing assumes.
    """
    predictions = model.predict(
        [X[:, 0].astype(int),   # fra_sone_input
         X[:, 1].astype(int),   # til_sone_input
         X[:, 2].astype(int),   # tid_input
         X[:, 3:]],             # numerical_features
        verbose=0,              # SHAP calls this many times; keep the output quiet
    )
    return np.asarray(predictions).ravel()


In [None]:
# Build one SHAP-compatible matrix: the three encoded categorical columns
# followed by the numeric feature columns.
X_shap_sample = np.concatenate(
    [
        X_fra_sample.reshape(-1, 1),
        X_til_sample.reshape(-1, 1),
        X_tid_sample.reshape(-1, 1),
        X_num_sample,
    ],
    axis=1,
)


In [None]:
# Explain the prediction wrapper, using the sample both as background data
# and as the rows to explain.
explainer = shap.Explainer(model_predict, masker=X_shap_sample)
shap_values = explainer(X_shap_sample)


In [None]:
import matplotlib.pyplot as plt

# SHAP summary plot over all nine model inputs; the x-axis is clipped so the
# bulk of the points stays readable.
shap.summary_plot(
    shap_values,
    X_shap_sample,
    feature_names=[
        'fra_sone', 'til_sone', 'tid',
        'befolkning', 'to_befolkning', 'arbeidsplasser',
        'handel', 'fri_hen_priv', 'distanse',
    ],
    show=False,
)
plt.gca().set_xlim(-2000, 2000)
plt.show()


In [20]:
# Labels for every column of X_shap_sample, in column order.
feature_names_all = [
    'fra_sone', 'til_sone', 'tid',
    'befolkning', 'befolkning i til-sone', 'arbeidsplasser',
    'handel', 'fri_hen_priv', 'distanse',
]



In [21]:
# The numeric features to show, and their column positions in the SHAP matrix.
selected_features = [
    'befolkning', 'befolkning i til-sone', 'arbeidsplasser',
    'handel', 'fri_hen_priv', 'distanse',
]
selected_indices = list(map(feature_names_all.index, selected_features))


In [22]:
# Restrict both the feature matrix and the SHAP values to the selected columns.
X_filtered = X_shap_sample[:, selected_indices]
shap_values_filtered = shap_values.values[:, selected_indices]
# Display labels for the plot (same order as the selection; `fritid` is the
# readable name for `fri_hen_priv`).
selected_features = [
    'befolkning', 'befolkning i til-sone', 'arbeidsplasser',
    'handel', 'fritid', 'distanse',
]

In [None]:
import matplotlib.pyplot as plt

# SHAP summary plot for the numeric features only, with a clipped x-axis.
shap.summary_plot(
    shap_values_filtered,
    features=X_filtered,
    feature_names=selected_features,
    show=False,
)
summary_ax = plt.gca()
summary_ax.set_xlim(-1000, 1000)
summary_ax.set_title('SHAP summary Plot - DNN')
plt.show()



### MAPE

In [None]:
# Absolute percentage error per test row. Rows whose true value is 0 are left
# out: the ratio is undefined there and a single such row would otherwise turn
# the mean into inf.
y_pred_flat = y_pred.flatten()
nonzero = y_test_real != 0
ape = np.abs((y_test_real[nonzero] - y_pred_flat[nonzero]) / y_test_real[nonzero]) * 100

mape = np.mean(ape)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# The median is a different statistic from MAPE, so it gets its own label.
med_mape = np.median(ape)
print(f"Median Absolute Percentage Error (MdAPE): {med_mape:.2f}%")



### Kryssvalidasjon

In [None]:

# 5-fold cross-validation on the full data set.
#
# All per-fold objects use fold-local names. Rebinding the notebook-wide
# `X_*_train`, `y_train`, `model` and `y_pred` here would make every later cell
# silently work with the last fold's data and model.
# NOTE(review): this assumes build_model() returns a model with freshly
# initialised weights on every call -- verify, otherwise folds are not independent.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_fra_sone), start=1):
    # Scale the numeric features per fold, fitted on the fold's training part
    # only, so the inputs match the scaled inputs used for the main model.
    fold_scaler = StandardScaler()
    fold_train_inputs = [
        X_fra_sone[train_index],
        X_til_sone[train_index],
        X_tid[train_index],
        fold_scaler.fit_transform(X_num[train_index]),
    ]
    fold_val_inputs = [
        X_fra_sone[val_index],
        X_til_sone[val_index],
        X_tid[val_index],
        fold_scaler.transform(X_num[val_index]),
    ]
    y_fold_train, y_fold_val = y[train_index], y[val_index]

    fold_model = build_model()
    fold_model.fit(fold_train_inputs, y_fold_train,
                   validation_data=(fold_val_inputs, y_fold_val),
                   epochs=100, batch_size=64, verbose=1)

    fold_pred = fold_model.predict(fold_val_inputs).ravel()

    # MAPE over rows with a non-zero true value (the ratio is undefined at 0).
    nonzero = y_fold_val != 0
    fold_mape = np.mean(np.abs((y_fold_val[nonzero] - fold_pred[nonzero]) / y_fold_val[nonzero])) * 100
    fold_mse = mean_squared_error(y_fold_val, fold_pred)
    fold_mae = mean_absolute_error(y_fold_val, fold_pred)
    fold_r2 = r2_score(y_fold_val, fold_pred)
    cv_scores.append((fold_mape, fold_mse, fold_mae, fold_r2))

    print(f"Fold {fold}")
    print(f"Mean Absolute Percentage Error (MAPE): {fold_mape:.2f}%")
    print("MSE:", fold_mse)
    print("MAE:", fold_mae)
    print("R2:", fold_r2)

# Average of each metric over the folds.
cv_mape, cv_mse, cv_mae, cv_r2 = np.mean(cv_scores, axis=0)
print(f"CV mean - MAPE: {cv_mape:.2f}%, MSE: {cv_mse:.2f}, MAE: {cv_mae:.2f}, R2: {cv_r2:.2f}")


### Overtilpasning

In [None]:
# Compare the model's fit on the training set with its fit on the test set.
y_train_pred = model.predict([X_fra_train, X_til_train, X_tid_train, X_num_train])
y_test_pred = model.predict([X_fra_test, X_til_test, X_tid_test, X_num_test])

mse_train, mae_train, r2_train = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
mse_test, mae_test, r2_test = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One print call, one line per metric and split.
print(
    "\nModellens ytelse med optimaliserte hyperparametere:",
    f"Mean Squared Error (MSE) - Treningssett: {mse_train:.2f}",
    f"Mean Squared Error (MSE) - Testsett: {mse_test:.2f}",
    f"Mean Absolute Error (MAE) - Treningssett: {mae_train:.2f}",
    f"Mean Absolute Error (MAE) - Testsett: {mae_test:.2f}",
    f"R-squared (R2) - Treningssett: {r2_train:.2f}",
    f"R-squared (R2) - Testsett: {r2_test:.2f}",
    sep="\n",
)

### Modellusikkerhet

In [None]:
# Train the same architecture with different seeds and measure how much the
# test predictions vary between the runs.
# NOTE(review): assumes X_*_train / y_train still hold the original (scaled)
# training split and that build_model() returns freshly initialised weights
# on every call -- verify both.
n_models = 10
member_preds = []

for seed in range(n_models):
    # Seeds Python's `random`, NumPy and TensorFlow in one call.
    tf.keras.utils.set_random_seed(seed)
    # A local name is used so the main `model` stays bound to the model trained above.
    member = build_model()
    member.fit([X_fra_train, X_til_train, X_tid_train, X_num_train], y_train,
               epochs=140, verbose=0, batch_size=64)  # reduce epochs on limited resources
    # Predict on the test set and keep the flat prediction vector.
    member_preds.append(
        member.predict([X_fra_test, X_til_test, X_tid_test, X_num_test], verbose=0).flatten()
    )

# Shape (n_models, n_test_rows); statistics are taken across the models.
all_preds = np.array(member_preds)
mean_preds = all_preds.mean(axis=0)
std_preds = all_preds.std(axis=0)


In [None]:
# Relative standard deviation per test row, as a fraction (std / mean).
# Rows whose mean prediction is 0 get NaN instead of a division warning and
# are ignored in the average.
rel_std = np.divide(
    std_preds,
    mean_preds,
    out=np.full(np.shape(std_preds), np.nan, dtype=float),
    where=mean_preds != 0,
)
# Multiply by 100 so the printed number really is a percentage.
print(f"Relativ standardavvik: {np.nanmean(rel_std) * 100:.2f}%")

In [None]:
# Mean absolute standard deviation across the ensemble. This is in the unit of
# the target, not a relative value and not a percentage.
print(f"Gjennomsnittlig standardavvik: {np.mean(std_preds):.2f}")


In [None]:
# Largest and smallest relative standard deviation. `rel_std` is a fraction, so
# it is scaled by 100 for the percentage output; NaN/inf entries (mean
# prediction of 0) are ignored.
finite_rel_std = rel_std[np.isfinite(rel_std)]
print(f"Maks relativ standardavvik: {np.max(finite_rel_std) * 100:.2f}%")
print(f"min relativ standardavvik: {np.min(finite_rel_std) * 100:.2f}%")

In [None]:
# Summary of the ensemble spread. Uses `std_preds` / `mean_preds` from the
# ensemble cell; the previously referenced `y_pred_std` / `y_pred_mean` are not
# defined anywhere in the notebook.
rel_std = np.divide(
    std_preds,
    mean_preds,
    out=np.full(np.shape(std_preds), np.nan, dtype=float),
    where=mean_preds != 0,
)
# Relative spread as a percentage, ignoring rows with a mean prediction of 0.
print(f"Relativ standardavvik: {np.nanmean(rel_std) * 100:.2f}%")
# Absolute spread, in the unit of the target.
print(f"Gjennomsnittlig standardavvik: {np.mean(std_preds):.2f}")

### Residualplott

In [None]:
# Residuals (true minus predicted) against the true values for the test set.
import matplotlib.pyplot as plt

residuals = y_test - y_pred.flatten()
plt.figure(figsize=(10, 5))
plt.scatter(y_test, residuals, alpha=0.5)
plt.xlabel('Faktiske verdier')  # fixed misspelling ("verider")
plt.ylabel('Residual')
plt.axhline(y=0, color='black', linewidth=2)  # zero-error reference line
plt.title('Residualer mot faktiske verdier')
plt.grid(True)
plt.show()





In [None]:
# Residuals (true minus predicted) for the test set.
residuals = y_test - y_pred.flatten()

# Keep only rows with at most 40 000 true trips.
mask = y_test <= 40000
y_test_filtered = y_test[mask]
residuals_filtered = residuals[mask]

# Same plot as for the full test set, restricted to the filtered rows.
plt.figure(figsize=(10, 5))
plt.scatter(y_test_filtered, residuals_filtered, alpha=0.5)
plt.xlabel('Faktiske verdier')  # same axis label as the unfiltered residual plot
plt.ylabel('Residual')
plt.axhline(y=0, color='black', linewidth=2)  # zero-error reference line
plt.title('Residualer mot Faktisk verdi (≤ 40 000)')
plt.grid(True)
plt.show()

### Analyse av rom og tid

In [None]:
def compute_mape_for_ranges(y_true, y_pred, low_threshold=1000, high_threshold=10000):
    """Print MAPE, MAE, MSE and R2 separately for low, medium and high true values.

    The three ranges are disjoint: ``y <= low_threshold``,
    ``low_threshold < y <= high_threshold`` and ``y > high_threshold``, so a
    value equal to ``high_threshold`` is counted in the medium range only.

    Args:
        y_true: True target values (array-like, flattened internally).
        y_pred: Predicted values with the same number of elements as ``y_true``.
        low_threshold: Upper bound (inclusive) of the low range.
        high_threshold: Upper bound (inclusive) of the medium range.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.

    A range without rows reports ``nan`` for every metric instead of raising.
    MAPE skips rows whose true value is 0, and R2 needs at least two rows.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    nan = float("nan")

    def _mape(true_vals, pred_vals):
        # Percentage error is undefined where the true value is 0.
        nonzero = true_vals != 0
        if not nonzero.any():
            return nan
        return np.mean(np.abs((true_vals[nonzero] - pred_vals[nonzero]) / true_vals[nonzero])) * 100

    # (label, bounds text used in the output, row mask)
    ranges = [
        ("lave verdier", f"≤ {low_threshold}", y_true <= low_threshold),
        ("mellomverdier", f"{low_threshold} < x <= {high_threshold}",
         (y_true > low_threshold) & (y_true <= high_threshold)),
        ("høye verdier", f"> {high_threshold}", y_true > high_threshold),
    ]

    rows = []
    for label, bounds, mask in ranges:
        true_vals, pred_vals = y_true[mask], y_pred[mask]
        n_rows = true_vals.size
        rows.append({
            "label": label,
            "bounds": bounds,
            "MAPE": _mape(true_vals, pred_vals),
            "MAE": mean_absolute_error(true_vals, pred_vals) if n_rows else nan,
            "MSE": mean_squared_error(true_vals, pred_vals) if n_rows else nan,
            "R2": r2_score(true_vals, pred_vals) if n_rows >= 2 else nan,
        })

    # Output is grouped per metric, with the three ranges listed under each one.
    for metric_name, suffix in (("MAPE", "%"), ("MAE", ""), ("MSE", ""), ("R2", "")):
        for row in rows:
            print(f"{metric_name} for {row['label']} ({row['bounds']}): {row[metric_name]:.2f}{suffix}")

# Range-wise metrics for the test-set predictions.
compute_mape_for_ranges(y_true=y_test_real, y_pred=y_pred.flatten())


In [None]:
# Evaluate the model on the subset of test rows whose `tid` equals 'HELG'.
#
# The frame is split again with the same `test_size` and `random_state` as the
# array split used for training, so that the original row labels of the test
# rows can be recovered.
# NOTE(review): this presumably yields the same rows in the same order as the
# earlier split -- verify; the NumPy test arrays below are indexed with a mask
# derived from this second split.
# NOTE(review): this cell rebinds `y`, `y_train` and `y_test` to pandas
# objects (earlier cells bound them to NumPy arrays).

# Before splitting, keep the row labels from `data_prep`
X = data_prep.drop(columns=["reiser"])  # Drop the target to get the feature frame
y = data_prep["reiser"]  # Target

# Store the row labels for later use
X["index"] = data_prep.index
y = y.reset_index(drop=True)  # Give `y` a fresh 0..n-1 index

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep the row labels of the test rows
X_test_indices = X_test["index"]  # Original row labels
X_test = X_test.drop(columns=["index"])  # Remove the helper column again


# Look up `tid` for the test rows via their original row labels
time_test = data_prep.loc[X_test_indices, "tid"] # Change to befolkning for further tests

# Boolean mask of the test rows with tid == 'HELG'
# NOTE(review): the original comment called this "uten-rush" data while the
# code selects 'HELG' -- confirm which subset is intended.
ur_mask = (time_test == 'HELG')

# Filter the test inputs (NumPy arrays indexed with the pandas boolean mask)
X_test_ur = [
    X_fra_test[ur_mask],
    X_til_test[ur_mask],
    X_tid_test[ur_mask],
    X_num_test[ur_mask]
]
y_test_ur = y_test.loc[ur_mask]  # boolean selection on the re-split target Series

# Predict for the selected subset only
y_pred_ur = model.predict(X_test_ur)


# Metrics for the subset
mae_ur = mean_absolute_error(y_test_ur, y_pred_ur)
mse_ur = mean_squared_error(y_test_ur, y_pred_ur)
r2_ur = r2_score(y_test_ur, y_pred_ur)
MAPE = np.mean(np.abs((y_test_ur - y_pred_ur.flatten()) / y_test_ur)) * 100

print(f"Mean Absolute Error (MAE): {mae_ur:.2f}")
print(f"Mean Squared Error (MSE): {mse_ur:.2f}")
print(f"R² Score: {r2_ur:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {MAPE:.2f}%")


