In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(10)

In [69]:
# Load the raw meal-duration records (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="../data/Meal Durations.csv")

In [70]:
# The conversion treats start_time as an HHMM integer (e.g. 830 -> 08:30,
# 1945 -> 19:45). Hours and minutes are split arithmetically rather than by
# slicing the decimal string: slicing misreads values below 100 (30 would be
# parsed as "3" hours and "0" minutes = 180) and raises ValueError on
# single-digit values, whose minute substring is empty.
# NOTE: this overwrites start_time in place, so re-running the cell without
# reloading df would convert already-converted values a second time.
hhmm = df['start_time']
# Minutes since midnight, divided by the minutes in a day, so valid times of
# day become a fraction of the day.
df['start_time'] = ((hhmm // 100) * 60 + hhmm % 100) / (24 * 60)

In [71]:
# Remove the 'day' column so it is not part of the feature matrix built below.
df = df.drop(columns=['day'])

In [72]:
# Shuffle the rows reproducibly (fixed random_state) before splitting.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data: 80% for training and 20% for testing.
# .copy() makes each split an independent frame, so adding a column to one of
# them later (e.g. a predictions column) does not trigger pandas'
# SettingWithCopyWarning or depend on view-vs-copy behaviour of the slice.
split_index = int(0.8 * len(shuffled_df))
train_df = shuffled_df.iloc[:split_index].copy()
test_df = shuffled_df.iloc[split_index:].copy()

# Every row must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df)

print(f"Total: {len(df)}, Train: {len(train_df)}, Test: {len(test_df)}")

Total: 84, Train: 67, Test: 17


In [73]:
# Target vector: the observed meal durations of the training rows.
y = train_df['meal_duration'].to_numpy()

# Design matrix: a leading column of ones (bias/intercept term) followed by
# every training column except the target.
X = np.concatenate(
    [np.ones((len(train_df), 1)), train_df.drop(columns='meal_duration').to_numpy()],
    axis=1,
)

# Poisson regression via gradient ascent
def compute_lambda(X, beta):
    """Return the Poisson rate for each row of X under a log link.

    Computes the linear predictor ``X @ beta`` and exponentiates it
    element-wise, so every returned rate is strictly positive.
    """
    linear_predictor = X @ beta
    return np.exp(linear_predictor)

def gradient_log_likelihood(X, y, beta):
    """Return the gradient of the Poisson log-likelihood with respect to beta.

    With rates ``lambda = exp(X @ beta)`` the gradient is ``X.T @ (y - lambda)``:
    each feature column weighted by the gap between observed and expected counts.
    """
    residual = y - compute_lambda(X, beta)
    return X.T @ residual

def poisson_regression_GD(X, y, lr=0.000001, num_iter=100000):
    """Fit Poisson regression coefficients by fixed-step gradient ascent.

    Parameters:
        X: design matrix; one coefficient is fitted per column.
        y: observed targets, one per row of X.
        lr: step size applied to the gradient at every iteration.
        num_iter: number of update steps; there is no early stopping.

    Returns:
        The coefficient vector after ``num_iter`` updates, starting from zeros.
    """
    n_coefficients = X.shape[1]
    coefficients = np.zeros(n_coefficients)
    for _ in range(num_iter):
        # Ascent, not descent: the log-likelihood is being maximised, so the
        # step is taken along the gradient.
        step = lr * gradient_log_likelihood(X, y, coefficients)
        coefficients += step
    return coefficients

# Fit the Poisson model on the training design matrix.
beta = poisson_regression_GD(X, y)

# Rank the coefficients by absolute magnitude, largest first. The features are
# not rescaled to a common range before fitting, so magnitudes are only
# loosely comparable across columns.
importance = np.abs(beta)
sorted_indices = np.argsort(importance)[::-1]
column_names = ['Bias', *train_df.drop(columns='meal_duration').columns]

print("Importance of parameters:")
for idx in sorted_indices:
    print(f"{column_names[idx]}: {importance[idx]}")

Importance of parameters:
Bias: 2.113513009351181
start_time: 0.45411827282303546
num_courses: 0.44957318306167865
is_holiday: 0.21839168037963952
location: 0.05939664788258471
busy_after: 0.048132348665203
hunger: 0.04662673900881657
meal_type: 0.04279068003190537


In [74]:
# Raw fitted coefficients (signed), in the column order of X: the intercept
# first, then the feature columns of train_df.
print(beta)

[ 2.11351301  0.45411827  0.21839168 -0.04279068 -0.05939665 -0.04813235
 -0.04662674  0.44957318]


In [75]:

# Build the test design matrix the same way as the training one: a leading
# column of ones (bias/intercept term) followed by the feature columns.
X_test = np.concatenate(
    [np.ones((len(test_df), 1)), test_df.drop(columns='meal_duration').to_numpy()],
    axis=1,
)

# Predicted durations: exponential of the linear predictor.
predictions = np.exp(X_test @ beta)

In [76]:
# test_df['predicted_duration'] = predictions

In [77]:
# Display the held-out test rows for a visual sanity check.
test_df

Unnamed: 0,start_time,is_holiday,meal_type,location,busy_after,hunger,num_courses,meal_duration
67,0.811111,1,5,0,0,3,2,37
68,0.546528,1,3,0,0,2,2,33
69,0.276389,0,0,0,0,3,1,14
70,0.390972,1,1,0,0,3,2,36
71,0.739583,0,5,0,1,2,2,21
72,0.334028,0,1,0,1,2,1,16
73,0.65,1,4,0,0,2,1,12
74,0.563194,0,3,0,1,2,2,31
75,0.500694,0,2,1,1,1,1,9
76,0.793056,0,5,0,1,3,2,24


In [78]:
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Both arguments may be any array-like of matching (or broadcastable) shape.
    They are converted to float64 before subtracting, so plain Python lists
    are accepted and unsigned or narrow integer arrays cannot wrap around
    during the subtraction or the squaring.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(errors ** 2)

def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute differences between targets and predictions.

    Both arguments may be any array-like of matching (or broadcastable) shape.
    They are converted to float64 before subtracting, so plain Python lists
    are accepted and unsigned integer arrays cannot wrap around (for example
    uint8 ``1 - 2`` would otherwise evaluate to 255).
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(errors))

# Ground-truth durations for the held-out rows.
y_test = test_df['meal_duration'].to_numpy()

# Score the Poisson model's predictions on the test set.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    sep="\n",
)


Mean Squared Error: 45.7383635107912
Mean Absolute Error: 5.893989274688731


In [79]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Baseline for comparison: ordinary least squares on the same train/test split.
# No column of ones is added here; LinearRegression fits its own intercept by default.
# NOTE: X_test is rebound here to the feature matrix *without* the intercept
# column that the Poisson model's design matrix carries.
X_train = train_df.drop(columns='meal_duration').to_numpy()
y_train = train_df['meal_duration'].to_numpy()

X_test = test_df.drop(columns='meal_duration').to_numpy()
y_test = test_df['meal_duration'].to_numpy()

# fit() returns the fitted estimator, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

mse_lr = mean_squared_error(y_test, lr_predictions)
mae_lr = mean_absolute_error(y_test, lr_predictions)

print(
    f"Linear Regression - Mean Squared Error: {mse_lr}",
    f"Linear Regression - Mean Absolute Error: {mae_lr}",
    sep="\n",
)


Linear Regression - Mean Squared Error: 40.594823610342914
Linear Regression - Mean Absolute Error: 5.643134346247568
