<a href="https://colab.research.google.com/github/chaimaeelh/Data_Scientist_NY/blob/main/Linear_Regression_Forcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Train and Evaluate Linear Regression Model

In [43]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

def preprocess_data(data):
    data_Hoodie = data[['date', 'Hoodie']]
    data_Hoodie["sales_diff"] = data_Hoodie['Hoodie'].diff()
    data_Hoodie = data_Hoodie.dropna()

    for i in range(1, 13):
        col_name = 'sales_diff_week' + str(i)
        data_Hoodie[col_name] = data_Hoodie['sales_diff'].shift(i)

    supervised_data = data_Hoodie.dropna().reset_index(drop=True)
    supervised_data_test = supervised_data
    supervised_data = supervised_data.drop(columns=['date', 'Hoodie'])

    return supervised_data, supervised_data_test

def scale_data(train_data, test_data):
    # Extract features and target variable for training data
    x_train, y_train = train_data.iloc[:, 1:], train_data.iloc[:, 0]

    # Extract features and target variable for test data
    x_test, y_test = test_data.iloc[:, 1:], test_data.iloc[:, 0]

    # Apply MinMaxScaler to the features
    scaler = MinMaxScaler(feature_range=(-1, 1))
    x_train= pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
    x_test = pd.DataFrame(scaler.transform(x_test), columns=x_train.columns)

    return x_train, y_train.values.ravel(), x_test, y_test.values.ravel(), scaler

def train_linear_regression(x_train, y_train):
    lr_model = LinearRegression()
    lr_model.fit(x_train, y_train)
    return lr_model

def make_predictions(lr_model, x_test, scaler, act_sales):
    lr_pr = lr_model.predict(x_test)
    lr_pr = lr_pr.reshape(-1, 1)

    lr_pre_test_set = np.concatenate([lr_pr, x_test], axis=1)
    #lr_pre_test_set = scaler.inverse_transform(lr_pre_test_set)

    result_array = lr_pre_test_set[:, 0] + act_sales
    lr_pre_series = pd.Series(result_array, name="Linear Prediction")

    return lr_pre_series

def evaluate_model(predictions, actual_values):
    mse = np.sqrt(mean_squared_error(predictions, actual_values))
    mae = mean_absolute_error(predictions, actual_values)
    r2 = r2_score(predictions, actual_values)

    print("Linear Regression MSE:", mse)
    print("Linear Regression MAE:", mae)
    print("Linear Regression R^2:", r2)

def save_model(model, filename):
    joblib.dump(model, filename)
    print(f"Model saved to {filename}")

# Load data
csv_path = "/content/sales.csv"
data = pd.read_csv(csv_path, sep=",")

# Preprocess data
supervised_data, supervised_data_test = preprocess_data(data)

# Split data into train and test sets
train_data = supervised_data[:70]
test_data = supervised_data[70:]

# Scale data
x_train, y_train, x_test, y_test, scaler = scale_data(train_data, test_data)

# Extract actual sales for evaluation
act_sales = supervised_data_test['Hoodie'][70:].to_list()

# Train linear regression model
lr_model = train_linear_regression(x_train, y_train)

# Make predictions
lr_pre_series = make_predictions(lr_model, x_test, scaler, act_sales)

# Evaluate model
evaluate_model(lr_pre_series, supervised_data_test['Hoodie'][70:])
# Save model
joblib.dump(lr_model, 'linear_regression_model.joblib')


Linear Regression MSE: 984.0150697182573
Linear Regression MAE: 580.4605636668762
Linear Regression R^2: 0.7415122753525625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_Hoodie["sales_diff"] = data_Hoodie['Hoodie'].diff()


Forcasting about sales in the next 5 weeks

In [50]:
loaded_model = joblib.load('linear_regressionMmodel.joblib')


In [51]:

# Get the last row from the entire dataset
last_observation = supervised_data.iloc[-1, 1:].values.reshape(1, -1)
last_observation = scaler.transform(last_observation)
#last_observation = supervised_data.tail(5)

# Extract features for the last 5 rows
#last_observation = last_observation.iloc[:, 1:]

# Scale the features using the same MinMaxScaler used for training

#last_observation = pd.DataFrame(scaler.transform(last_observation), columns=last_observation.columns)

# Predict the next 5 weeks
predictions = []
for _ in range(5):
    # Predict the next week's sales
    next_week_pred = loaded_model.predict(last_observation)

    # Update the input features for the next prediction
    last_observation = np.roll(last_observation, shift=-1)
    last_observation[0, -1] = next_week_pred[0]

    # Append the predicted value to the result list
    predictions.append(next_week_pred[0])

# Create a DataFrame to store the predictions
prediction_dates = pd.date_range(start='2020-09-11', periods=5, freq='W')
prediction_df = pd.DataFrame({'date': prediction_dates, 'sales_diff_prediction': predictions})

# Display the predictions
print(prediction_df.tail())


        date  sales_diff_prediction
0 2020-09-13          -1.326205e+03
1 2020-09-20          -5.777632e+04
2 2020-09-27          -2.934244e+06
3 2020-10-04          -1.452466e+08
4 2020-10-11          -7.220039e+09


