In [4]:
# A2 Perform prediction on the test data and compare the metric values between train and test set.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Loading the dataset
df = pd.read_excel('/content/TRAINING_DATA.xlsx')

# Defining the feature and target variables
X = df[['embed_0', 'embed_1', 'embed_2']]  # feature variables
y = df['OUTPUT']  # target variable

# Spliting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating and train a multiple linear regression model
reg = LinearRegression()
reg.fit(X_train, y_train)

# Making predictions on the training data
y_train_pred = reg.predict(X_train)

# Calculating metrics for the training data
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = mse_train ** 0.5
mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print("Training metrics\n")
print(f"MSE: {mse_train:.2f}")
print(f"RMSE: {rmse_train:.2f}")
print(f"MAPE: {mape_train:.2f}%")
print(f"R2: {r2_train:.2f}")

# Making predictions on the test data
y_test_pred = reg.predict(X_test)

# Calculating metrics for the test data
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = mse_test ** 0.5
mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print("\nTest metrics\n")
print(f"MSE: {mse_test:.2f}")
print(f"RMSE: {rmse_test:.2f}")
print(f"MAPE: {mape_test:.2f}%")
print(f"R2: {r2_test:.2f}")

Training metrics

MSE: 1.32
RMSE: 1.15
MAPE: 136413459216867.97%
R2: 0.03

Test metrics

MSE: 1.37
RMSE: 1.17
MAPE: 61523974522849.21%
R2: 0.01
