In [490]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's legacy global RNG so any np.random.* draws repeat across runs.
RANDOM_SEED = 110
np.random.seed(RANDOM_SEED)

In [491]:
# Load the raw meal log; the path is relative to the notebook's directory.
MEAL_DATA_PATH = "../data/Meal Durations.csv"
df = pd.read_csv(MEAL_DATA_PATH)

In [492]:
# Convert the integer clock reading (HHMM, e.g. 810 -> 08:10, 1430 -> 14:30)
# into the fraction of the day elapsed (minutes since midnight / 1440).
hhmm = df['start_time']
if not pd.api.types.is_integer_dtype(hhmm):
    # Guards against re-running this cell on the already-normalised (float)
    # column, which would otherwise be silently rescaled a second time.
    raise ValueError("start_time must hold integer HHMM values; has this cell already been run?")
# Integer arithmetic instead of slicing the decimal string: readings below 100
# (times in the 00:xx hour) were mis-parsed, or crashed on an empty slice, when
# the hour/minute split was chosen by character position.
minutes_since_midnight = (hhmm // 100) * 60 + hhmm % 100
df['start_time'] = minutes_since_midnight / (24 * 60)


In [493]:
# Left disabled on purpose: un-commenting the next line would remove the
# 'day' column from df before the train/test split.
# df = df.drop(['day'], axis=1)

In [494]:
# Shuffle every row with a fixed random_state so the split is reproducible,
# then renumber the index from 0.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first 80% of the shuffled rows train the model; the rest is held out.
n_rows = len(shuffled_df)
split_index = int(0.8 * n_rows)
train_df, test_df = shuffled_df.iloc[:split_index], shuffled_df.iloc[split_index:]

print(f"Total: {len(df)}, Train: {len(train_df)}, Test: {len(test_df)}")

Total: 84, Train: 67, Test: 17


In [495]:
train_df

Unnamed: 0,day,start_time,is_holiday,meal_type,location,busy_after,hunger,num_courses,meal_duration
0,2,0.340278,0,1,0,1,2,1,9
1,4,0.279167,0,0,0,1,3,1,12
2,6,0.659028,1,4,0,0,2,1,28
3,0,0.618056,0,4,0,1,2,1,12
4,6,0.305556,1,0,0,0,3,1,14
...,...,...,...,...,...,...,...,...,...
62,3,0.433333,0,2,1,1,2,1,3
63,2,0.739583,0,5,0,1,3,2,14
64,6,0.545139,1,3,0,0,1,2,38
65,2,0.422222,0,2,1,1,2,1,11


In [496]:
# Target vector: the observed meal durations of the training rows.
y = train_df['meal_duration'].to_numpy()

# Feature matrix: every remaining column, with a leading column of ones so the
# first coefficient acts as the bias/intercept term.
feature_frame = train_df.drop(columns='meal_duration')
intercept_column = np.ones((len(feature_frame), 1))
X = np.hstack([intercept_column, feature_frame.to_numpy()])

# Poisson regression via gradient ascent
def compute_lambda(X, beta):
    """Return the Poisson rate exp(X @ beta).

    X    -- design matrix (one row per observation) or a single feature row.
    beta -- coefficient vector, one entry per column of X.
    """
    log_rate = X @ beta  # linear predictor; the log link makes the rate positive
    return np.exp(log_rate)

def gradient_log_likelihood(X, y, beta):
    """Gradient of the Poisson log-likelihood with respect to beta.

    Computed as X^T (y - lambda), where lambda = compute_lambda(X, beta) is the
    predicted rate for each row of X and y holds the observed counts.
    """
    residual = y - compute_lambda(X, beta)
    return X.T @ residual

def poisson_regression_GD(X, y, lr=0.000001, num_iter=100000):
    """Fit Poisson-regression coefficients by fixed-step gradient ascent.

    X        -- design matrix; one coefficient is fitted per column.
    y        -- observed counts, one per row of X.
    lr       -- step size applied to the (un-averaged) gradient.
    num_iter -- number of ascent steps; there is no early stopping.

    Returns the coefficient vector after num_iter updates, starting from zeros.
    """
    n_coefficients = X.shape[1]
    beta = np.zeros(n_coefficients)
    for _ in range(num_iter):
        # Step uphill: the log-likelihood is being maximised, hence "+=".
        beta += lr * gradient_log_likelihood(X, y, beta)
    return beta

# Fit the coefficients on the training design matrix.
beta = poisson_regression_GD(X, y)

# Rank the coefficients by absolute magnitude, largest first.
# NOTE(review): no feature standardisation is visible before the fit, so these
# magnitudes are only a rough guide to relative importance.
importance = np.abs(beta)
sorted_indices = np.argsort(importance)[::-1]
feature_names = train_df.drop(columns='meal_duration').columns.tolist()
column_names = ['Bias', *feature_names]

print("Importance of parameters:")
for name, weight in ((column_names[k], importance[k]) for k in sorted_indices):
    print(f"{name}: {weight}")

Importance of parameters:
Bias: 2.011308596077432
num_courses: 0.44664781605865433
start_time: 0.411636556391895
is_holiday: 0.16279472615102814
location: 0.04930592299852654
hunger: 0.043611939405606905
day: 0.03708972322535852
meal_type: 0.03635493588882833
busy_after: 0.03145353505015056


In [499]:
# Sanity-check the fit on one training row: compare the model's expected
# duration with the observed meal_duration printed for that row.
print(beta)  # NOTE(review): the original carried an unexplained "# 37" marker here
train_id = 22
train_row = train_df.iloc[train_id]
print(train_row)
# Remove the target by name (as the training cell does) rather than slicing off
# the last position, so a different column order cannot leak the target into
# the feature vector.
train_x = train_row.drop(labels='meal_duration').to_numpy()
train_x = np.insert(train_x, 0, 1)  # intercept term, matching the leading ones column of X
# Reuse the model's own rate function instead of re-deriving exp(x @ beta) here.
prediction = compute_lambda(train_x, beta)
prediction

[ 2.0113086   0.03708972  0.41163656  0.16279473 -0.03635494 -0.04930592
 -0.03145354 -0.04361194  0.44664782]
day               3.000000
start_time        0.547222
is_holiday        1.000000
meal_type         3.000000
location          0.000000
busy_after        0.000000
hunger            2.000000
num_courses       2.000000
meal_duration    30.000000
Name: 22, dtype: float64


24.7205803357897

Based 