In [1]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Read the csv for further processing
def read_dataset(file_path):
    return pd.read_csv(file_path)

# Data preprocessing 
def preprocess_data(df):
    # Drop rows with missing values
    df.dropna(inplace=True)
    
    # Convert 'Date' column to datetime type
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Convert datetime to numeric representation
    df['Date'] = df['Date'].astype(np.int64) // 10**9 
    
    # Drop irrelevant columns if they exist
    irrelevant_columns = ['Customer ID', 'Invoice ID', 'Time', 'cogs', 'gross margin percentage', 'gross income', 'Rating']
    df = df.drop(columns=[col for col in irrelevant_columns if col in df.columns], errors='ignore')

    # Encode categorical variables
    le = LabelEncoder()
    df['Gender'] = le.fit_transform(df['Gender'])
    
    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['City', 'Customer type', 'Branch', 'Product line', 'Payment'], drop_first=True)
    
    return df

# Split data into features and target
def split_data(df):
    X = df.drop('Total', axis=1)
    y = df['Total']
    return train_test_split(X, y, test_size=0.3, random_state=42)

# Hyperparameter Tuning
def hyperparameter_tuning(model, params, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Print metrics for each model
def print_metrics(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared (R2) Score:", r2)
    print("*" * 60)

# Plotting feature importance if the model supports it 
def plot_feature_importance(features, importances, best_model):    
    if importances is None:
        print("Feature importances not available for the selected model.")
        return
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})
    feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(f'Feature Importance for {best_model}')
    plt.show()
    print("Feature importance plot generated successfully!")

#Executing the program
if __name__ == "__main__":
    # Reading the dataset
    df = read_dataset("supermarket_sales.csv")
    
    # Preprocessing the data
    df = preprocess_data(df)

    # Split the data
    X_train, X_test, y_train, y_test = split_data(df)

    # Defining the models being used (chosen based on the type of data - continuous)
    mlModels = {        
        "Linear Regression": LinearRegression(),
        "Random Forest": RandomForestRegressor(),
        "Gradient Boosting": GradientBoostingRegressor()
    }

   # Hyperparameter Tuning and best model choosing
    best_r2_score = -float('inf')
    best_model = None
    
    for name, model in mlModels.items():
        params = {}  # Defining the hyperparameters for each model
        mlModels[name] = hyperparameter_tuning(model, params, X_train, y_train)
        y_pred = mlModels[name].predict(X_test)
        r2_score_current = r2_score(y_test, y_pred)
        
        print(f"{name} R-squared (R2) Score:", r2_score_current)
        
        if r2_score_current > best_r2_score:
            best_r2_score = r2_score_current
            best_model = name
        print_metrics(y_test, y_pred) # Printing the metrics for each model

    # Predicted total  for each model vs the actual total in the dataset for comparison
    for name, model in mlModels.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Aggregate predicted and actual sales per month
            df_test = X_test.copy()
            df_test["Predicted Total"] = y_pred
            df_test["Actual Total"] = df["Total"]

            df_test["permonth"] = pd.to_datetime(df_test["Date"]).dt.to_period("M")
            print(f"Total Income for {name}:")
            print(df_test[["permonth", "Predicted Total", "Actual Total"]])
            

    # Print the best model and its R2 score
    print(f"Best Model: {best_model}, R-squared (R2) Score: {best_r2_score}")
    
    # Feature importance graph 
    best_clf = mlModels[best_model]
    if best_model in ['Random Forest', 'Gradient Boosting'] and hasattr(best_clf, 'feature_importances_'):
        feature_importance = best_clf.feature_importances_
        features = X_train.columns
        plot_feature_importance(features, feature_importance, best_model)
    else:
        print("Feature importance is not available for the selected model or the model does not support it.")

Linear Regression R-squared (R2) Score: 1.0
Mean Squared Error (MSE): 4.541694326203986e-17
Mean Absolute Error (MAE): 5.846804818077089e-09
R-squared (R2) Score: 1.0
************************************************************
Random Forest R-squared (R2) Score: 0.9999154708845687
Mean Squared Error (MSE): 5.573432897271934
Mean Absolute Error (MAE): 1.15694215000003
R-squared (R2) Score: 0.9999154708845687
************************************************************
Gradient Boosting R-squared (R2) Score: 0.9999383758639572
Mean Squared Error (MSE): 4.063191544527292
Mean Absolute Error (MAE): 1.4445446730840246
R-squared (R2) Score: 0.9999383758639572
************************************************************
Total Income for Linear Regression:
    permonth  Predicted Total  Actual Total
521  1970-01         523.9710      523.9710
737  1970-01         616.9800      616.9800
740  1970-01         408.7335      408.7335
660  1970-01         135.3555      135.3555
411  1970-01        