In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the engineered marathon dataset from its path relative to the notebook.
file_path = '../data/engineered_marathon_data.csv'
df = pd.read_csv(file_path)

# Show the header exactly as it was read, before any cleanup is applied.
print("Column names in the dataset:", df.columns, sep="\n")

# Normalise the header: remove whitespace at either end of every column name
# so that later lookups by name are not defeated by stray spaces.
df.columns = df.columns.str.strip()

# Show the header again so the effect of the cleanup can be compared.
print("\nCleaned Column names in the dataset:", df.columns, sep="\n")

# Preview the first rows as a quick sanity check of the loaded values.
print("\nFirst few rows of the dataset:", df.head(), sep="\n")

# Fail fast if the regression target is absent (the lookup is case-sensitive).
assert 'MarathonTime' in df.columns, (
    "Column 'MarathonTime' not found in the dataset."
)

# Separate features and target variable.
#
# Identifier-like columns are excluded from the feature matrix: `id` and the
# one-hot `Name_*` columns label individual rows rather than describe them,
# so a linear model can only memorise training rows through them. Any
# `Name_*` indicator that is non-zero only in the test split would receive an
# arbitrary coefficient and add noise to the evaluation.
# NOTE(review): `id` is assumed to be a row identifier based on its name —
# confirm against the feature-engineering step.
identifier_columns = [
    col for col in df.columns
    if col == 'id' or str(col).startswith('Name_')
]
X = df.drop(columns=['MarathonTime'] + identifier_columns)  # Features
y = df['MarathonTime']  # Target variable

# Guard against a dataset that contains nothing but the target and identifiers.
assert X.shape[1] > 0, "No feature columns left after dropping the target and identifier columns."

# Split the data into training and testing sets (80% / 20%); the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE) between the true and predicted targets
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error (MSE): {mse}")


Column names in the dataset:
Index(['id', 'km4week', 'sp4week', 'MarathonTime', 'Marathon_Prague17',
       'Name_????? ?????', 'Name_Ale? Kuchynka', 'Name_Andrej Madliak',
       'Name_Barry Sacher', 'Name_Blair MORGAN',
       ...
       'Wall21_1.93', 'Wall21_1.94', 'Wall21_1.97', 'Wall21_1.98',
       'Wall21_2.02', 'Wall21_2.05', 'CATEGORY_A', 'CATEGORY_B', 'CATEGORY_C',
       'CATEGORY_D'],
      dtype='object', length=160)

Cleaned Column names in the dataset:
Index(['id', 'km4week', 'sp4week', 'MarathonTime', 'Marathon_Prague17',
       'Name_????? ?????', 'Name_Ale? Kuchynka', 'Name_Andrej Madliak',
       'Name_Barry Sacher', 'Name_Blair MORGAN',
       ...
       'Wall21_1.93', 'Wall21_1.94', 'Wall21_1.97', 'Wall21_1.98',
       'Wall21_2.02', 'Wall21_2.05', 'CATEGORY_A', 'CATEGORY_B', 'CATEGORY_C',
       'CATEGORY_D'],
      dtype='object', length=160)

First few rows of the dataset:
         id   km4week   sp4week  MarathonTime  Marathon_Prague17  \
0 -1.712255  2.628775