### Ridge Regression

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [10]:
df = pd.read_csv("merged_df.csv")
df.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,StateHoliday_encoded,isHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,StoreType_encoded,Assortment_encoded,PromoInterval_encoded
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,558.4297,3.998341,5773.819,633.1459,0.8301067,0.3815145,0.1786467,0.04516279,0.8213533,5422.021,7.336981,2008.925,0.5005638,11.64767,1007.011,1.207467,0.9351412,0.3093396
std,321.9087,1.997391,3849.926,464.4117,0.3755392,0.4857586,0.3830564,0.2836559,0.3830564,7706.918,2.674456,4.99937,0.4999999,15.32393,1005.877,1.365376,0.9938011,1.421467
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,1900.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,280.0,2.0,3727.0,405.0,1.0,0.0,0.0,0.0,1.0,710.0,6.0,2008.0,0.0,0.0,0.0,0.0,0.0,-1.0
50%,558.0,4.0,5744.0,609.0,1.0,0.0,0.0,0.0,1.0,2325.0,8.0,2010.0,1.0,1.0,2009.0,0.0,0.0,1.0
75%,838.0,6.0,7856.0,837.0,1.0,1.0,0.0,0.0,1.0,6880.0,9.0,2011.0,1.0,22.0,2012.0,3.0,2.0,1.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,1.0,3.0,1.0,75860.0,12.0,2015.0,1.0,50.0,2015.0,3.0,2.0,3.0


In [11]:
# Convert date fields to datetime objects
df['Date'] = pd.to_datetime(df['Date'])
df['CompetitionOpenSinceDate'] = pd.to_datetime(df['CompetitionOpenSinceDate'])

# Calculate the duration of competition being open in months
df['CompetitionDurationMonths'] = (
    (df['Date'].dt.year - df['CompetitionOpenSinceDate'].dt.year) * 12 +
    (df['Date'].dt.month - df['CompetitionOpenSinceDate'].dt.month)
)

# Replace NaNs in 'CompetitionDurationMonths' with the median duration
df['CompetitionDurationMonths'].fillna(df['CompetitionDurationMonths'].median(), inplace=True)

# Drop the original date fields and other non-numeric or unnecessary columns
df = df.drop(['Date', 'CompetitionOpenSinceDate', 'Promo2Date'], axis=1)

# Convert categorical variables to dummy variables
df = pd.get_dummies(df, columns=['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'])

In [12]:
# drop sales and customers as they should not be part of the model
X = df.drop(['Sales', 'Customers'], axis=1)

# Define a range of alpha values to search
alphas = [0.1, 1, 10, 100]  # Add more values based on your requirements

In [13]:
# Define target for Sales
y_sales = df['Sales']

# Split the data for Sales
X_train_sales, X_test_sales, y_train_sales, y_test_sales = train_test_split(X, y_sales, test_size=0.2, random_state=42)

# Standardize the features for Sales
scaler_sales = StandardScaler()
X_train_sales_scaled = scaler_sales.fit_transform(X_train_sales)
X_test_sales_scaled = scaler_sales.transform(X_test_sales)

# Create and fit RidgeCV model for Sales using standardized features
ridge_cv_model_sales = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv_model_sales.fit(X_train_sales_scaled, y_train_sales)

# Access the optimal alpha for Sales
optimal_alpha_sales = ridge_cv_model_sales.alpha_
print(f'Optimal Alpha for Sales: {optimal_alpha_sales}')

# Predict on the test set for Sales
y_pred_sales = ridge_cv_model_sales.predict(X_test_sales_scaled)

# Evaluate performance for Sales
mse_sales = mean_squared_error(y_test_sales, y_pred_sales)
rmse_sales = np.sqrt(mse_sales)
rmspe_sales = rmse_sales / np.mean(y_test_sales) * 100

print(f'Mean Squared Error for Sales: {mse_sales}')
print(f'Root Mean Squared Error for Sales: {rmse_sales}')
print(f'Root Mean Squared Percentage Error for Sales: {rmspe_sales}')

Optimal Alpha for Sales: 0.1
Mean Squared Error for Sales: 6184401.960422605
Root Mean Squared Error for Sales: 2486.845785412237
Root Mean Squared Percentage Error for Sales: 43.16206982958382


In [14]:
# Define features and target for Customers
y_customers = df['Customers']

# Split the data for Customers
X_train_customers, X_test_customers, y_train_customers, y_test_customers = train_test_split(X, y_customers, test_size=0.2, random_state=42)

# Standardize the features for Customers
scaler_customers = StandardScaler()
X_train_customers_scaled = scaler_customers.fit_transform(X_train_customers)
X_test_customers_scaled = scaler_customers.transform(X_test_customers)

# Create and fit RidgeCV model for Customers using standardized features
ridge_cv_model_customers = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv_model_customers.fit(X_train_customers_scaled, y_train_customers)

# Access the optimal alpha for Customers
optimal_alpha_customers = ridge_cv_model_customers.alpha_
print(f'Optimal Alpha for Customers: {optimal_alpha_customers}')

# Predict on the test set for Customers
y_pred_customers = ridge_cv_model_customers.predict(X_test_customers_scaled)

# Evaluate performance for Customers
mse_customers = mean_squared_error(y_test_customers, y_pred_customers)
rmse_customers = np.sqrt(mse_customers)
rmspe_customers = rmse_customers / np.mean(y_test_customers) * 100

print(f'Mean Squared Error for Customers: {mse_customers}')
print(f'Root Mean Squared Error for Customers: {rmse_customers}')
print(f'Root Mean Squared Percentage Error for Customers: {rmspe_customers}')

Optimal Alpha for Customers: 0.1
Mean Squared Error for Customers: 91593.92623335756
Root Mean Squared Error for Customers: 302.6448846971605
Root Mean Squared Percentage Error for Customers: 47.88234197649414
