In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the sales-prediction CSV into the working DataFrame `df` used by every later cell.
df = pd.read_csv(filepath_or_buffer='/content/sales_prediction.csv')

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual sanity check.
df

In [None]:
# List the column labels of df; the modelling cells below expect a 'Weekly_Sales_log' target column.
df.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
def machine_learning(df, algorithm):
    """Fit one regressor on an 80/20 split of ``df`` and score it on the hold-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column ``'Weekly_Sales_log'``; every other
        column is used as a feature.
    algorithm : estimator
        Regressor exposing ``fit`` and ``predict``. It is fitted in place
        (no copy is made), so the caller's object is trained after the call.

    Returns
    -------
    dict
        Hold-out metrics keyed by ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    # Specify features (X) and target variable (y)
    X = df.drop('Weekly_Sales_log', axis=1)
    y = df['Weekly_Sales_log']

    # Split the data into training and testing sets (fixed seed => same split on every call)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate the selected algorithm
    model = algorithm
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE instead of passing `squared=False`: that flag was
    # deprecated in scikit-learn 1.4 and removed in later releases.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Collect the metrics under the keys the reporting loop reads
    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Candidate regressors, every one seeded with the same random_state
algorithms = [
    regressor_cls(random_state=42)
    for regressor_cls in (
        DecisionTreeRegressor,
        ExtraTreesRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
        GradientBoostingRegressor,
    )
]

# Score each candidate on the shared split and print a short report per model
for algorithm in algorithms:
    results = machine_learning(df, algorithm)
    print(f'Model: {type(algorithm).__name__}')
    print(f'Mean Absolute Error: {results["MAE"]:.4f}')
    print(f'Mean Squared Error: {results["MSE"]:.4f}')
    print(f'Root Mean Squared Error: {results["RMSE"]:.4f}')
    print(f'R^2 Score: {results["R2"]:.4f}')
    print('-' * 40)

In [None]:
# Evaluate the performance of the regression algorithms using standard metrics:
# Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and the R-squared (R2) score.
# Training and evaluating the selected algorithms takes around 10-15 minutes; please wait until it completes.

In [None]:
# ExtraTreesRegressor and RandomForestRegressor perform the best among the models,
# with lower MAE, MSE, and RMSE, indicating better prediction accuracy and precision.
# They also have higher R2 scores, suggesting a better fit to the data.

In [None]:
# Note: hyperparameter tuning is the process of selecting the best parameters from a set of candidate values to increase model performance.
# GridSearchCV is one of the methods for hyperparameter tuning.
# Selecting the best parameters takes around 2 hours; please wait until it completes.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Define the feature matrix (x) and target vector (y)
x = df.drop(['Weekly_Sales_log'], axis=1)
y = df['Weekly_Sales_log']

# Split the data into training and testing sets (same seed and ratio as the model comparison above)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define a reduced parameter grid: 3 * 3 * 3 * 2 = 54 combinations
param_grid_r = {
    'max_depth': [5, 10, 15],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV (54 combinations x 5 folds = 270 fits, run on all cores).
# The forest is seeded like every other estimator in this notebook so that the
# selected parameters and best score are reproducible between runs.
grid_search_r = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                             param_grid=param_grid_r,
                             cv=5,
                             n_jobs=-1)

# Fit GridSearchCV to training data only; the test split stays untouched for the final evaluation
grid_search_r.fit(x_train, y_train)


In [None]:
# Pull the winning parameter combination and its cross-validated score from the finished search
best_params, score = grid_search_r.best_params_, grid_search_r.best_score_

print("Best Parameters:", best_params)
print("Best Score:", score)

In [None]:
# Fit a forest with the chosen hyperparameters and compare train vs. test R^2 to check for overfitting.
# NOTE(review): the hard-coded values are presumably the best_params_ reported by the grid search — confirm.

x = df.drop(columns=['Weekly_Sales_log'])
y = df['Weekly_Sales_log']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# random_state is fixed (as everywhere else in this notebook) so the two scores are repeatable between runs
model = RandomForestRegressor(
    max_depth=15,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=10,
    random_state=42,
).fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

# A train score far above the test score would indicate overfitting
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
r2_train, r2_test

In [None]:
# Predict sales (log scale) with the tuned hyperparameters and score the hold-out set with standard metrics

x = df.drop(columns=['Weekly_Sales_log'])
y = df['Weekly_Sales_log']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Seeded so the reported metrics (and the model pickled later) are reproducible between runs
model = RandomForestRegressor(
    max_depth=15,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=10,
    random_state=42,
).fit(x_train, y_train)
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# All metrics are on the scale of 'Weekly_Sales_log', not of the raw sales values
metrics_r = {'R2': r2,
           'Mean Absolute Error': mae,
           'Mean Squared Error': mse,
           'Root Mean Squared Error': rmse}

metrics_r

In [None]:
# Show df again as a reference for the columns and value ranges used in the manual input below.
df

In [None]:
# Manually pass one row of user input and predict the department-wide sales (on the log scale of the target).
user_data = np.array([[1,2,118221,93,0,3.882,211.096358,5,2,2010,3.768384,0.000000,2.208934,10.595510]])
# The model was fitted on a DataFrame, so label the row with the training columns:
# this avoids scikit-learn's "X does not have valid feature names" warning and makes
# the positional order of the values explicit.
y_pred = model.predict(pd.DataFrame(user_data, columns=x.columns))
y_pred[0]

In [None]:
import pandas as pd
import numpy as np

# Column labels for the single-row input (placeholder list: replace with your actual feature names if they differ)
feature_names = [
    'Store', 'Type', 'Size', 'Dept', 'IsHoliday', 'Fuel_Price', 'CPI',
    'Day', 'Month', 'Year', 'Temperature_log', 'MarkDown_Total_log',
    'Unemployment_log', 'Expected_Sales',
]

# One observation, with values listed in the same order as feature_names
feature_values = [1, 2, 118221, 93, 0, 3.882, 211.096358, 5, 2, 2010, 3.768384, 0.000000, 2.208934, 10.595510]
user_data = pd.DataFrame([feature_values], columns=feature_names)

# Run the fitted model on the labelled row
y_pred = model.predict(user_data)

# Report the raw model output for that row
print("Predicted Department-Wide Sales:", y_pred[0])


In [None]:
# Apply the inverse log transformation (exp) to bring the prediction back to the original sales scale.
# NOTE(review): assumes the target was created with the natural log (np.log); if np.log1p was used,
# np.expm1 would be the matching inverse — confirm against the preprocessing step.
np.exp(y_pred[0])

In [None]:
# Persist the fitted regression model to disk with pickle
import pickle

with open('reg_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
import pickle
import numpy as np
import pandas as pd

# Load the pickled model from the same relative path the save cell writes to
# ('reg_model.pkl'), so the round trip works whatever the working directory is.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open('reg_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Your input data (replace this with your actual input data)
user_data = np.array([[1, 2, 118221, 93, 0, 3.882, 211.096358, 5, 2, 2010, 3.768384, 0.000000, 2.208934, 10.595510]])

# Feature names for your input data (replace this with actual feature names).
# They must match the columns the model was trained on, in the same order.
feature_names = ['Store', 'Type', 'Size', 'Dept', 'IsHoliday', 'Fuel_Price', 'CPI', 'Day', 'Month', 'Year', 'Temperature_log', 'MarkDown_Total_log', 'Unemployment_log','Expected_Sales']

# Create a DataFrame with the input data and feature names
user_df = pd.DataFrame(user_data, columns=feature_names)

# Make predictions by providing the input data (X) to the predict method
y_pred = model.predict(user_df)

# The model predicts on the log scale of the target, so exponentiate to get back to the original scale
predicted_price = np.exp(y_pred[0])

# Print the predicted department-wide sales
print("Predicted Department-Wide Sales:", predicted_price)
