<a href="https://colab.research.google.com/github/chaimaeelh/Data_Scientist_NY/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Time series with Decision Tree

In [2]:
# import necessary libraries

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from sklearn.tree import DecisionTreeRegressor
# Function to preprocess the data (to transform time series problem to a regression problem )
def preprocess_data(data):
    data_Hoodie = data[['date', 'Hoodie']]
    # Create a sale diff column to see the sale difference between each week
    data_Hoodie["sales_diff"] = data_Hoodie['Hoodie'].diff()
    data_Hoodie = data_Hoodie.dropna()
    # Create lag features (they are the 20 previous record of each point starting with Index=21)
    for i in range(1, 20):
        col_name = 'sales_diff_week' + str(i)
        data_Hoodie[col_name] = data_Hoodie['sales_diff'].shift(i)
    # Create a copy for testing purposes
    supervised_data = data_Hoodie.dropna().reset_index(drop=True)
    supervised_data_test = supervised_data
    supervised_data = supervised_data.drop(columns=['date', 'Hoodie'])

    return supervised_data, supervised_data_test


def train_decision_tree(x_train, y_train):
    lr_model = DecisionTreeRegressor()
    lr_model.fit(x_train, y_train)
    return lr_model

def make_predictions(lr_model, x_test, act_sales):
    lr_pr = lr_model.predict(x_test)
    lr_pr = lr_pr.reshape(-1, 1)

    lr_pre_test_set = np.concatenate([lr_pr, x_test], axis=1)


    result_array = lr_pre_test_set[:, 0] + act_sales
    lr_pre_series = pd.Series(result_array, name="Linear Prediction")

    return lr_pre_series

def evaluate_model(predictions, actual_values):
    mse = np.sqrt(mean_squared_error(predictions, actual_values))
    mae = mean_absolute_error(predictions, actual_values)
    r2 = r2_score(predictions, actual_values)

    print("Linear Regression MSE:", mse)
    print("Linear Regression MAE:", mae)
    print("Linear Regression R^2:", r2)

def save_model(model, filename):
    joblib.dump(model, filename)
    print(f"Model saved to {filename}")

# Load data
csv_path = "/content/sales.csv"
data = pd.read_csv(csv_path, sep=",")

# Preprocess data
supervised_data, supervised_data_test = preprocess_data(data)

# Split data into train and test sets
train_data = supervised_data[:70]
test_data = supervised_data[70:]

    # Extract features and target variable for training data
x_train, y_train = train_data.iloc[:, 1:], train_data.iloc[:, 0]

    # Extract features and target variable for test data
x_test, y_test = test_data.iloc[:, 1:], test_data.iloc[:, 0]

# Extract actual sales for evaluation
act_sales = supervised_data_test['Hoodie'][70:].to_list()

# Train linear regression model
lr_model =  train_decision_tree(x_train, y_train)

# Make predictions
lr_pre_series = make_predictions(lr_model, x_test, act_sales)

# Evaluate model
evaluate_model(lr_pre_series, supervised_data_test['Hoodie'][70:])
# Save model



Linear Regression MSE: 671.8901581474346
Linear Regression MAE: 573.1538461538462
Linear Regression R^2: 0.9531374717308561


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_Hoodie["sales_diff"] = data_Hoodie['Hoodie'].diff()


In [3]:
joblib.dump(lr_model, 'logistic_regression_model.joblib')

['logistic_regression_model.joblib']

Forcast for the next 5 sales

In [4]:
# Extract the last line of data as input of my model
future_weeks_data = supervised_data_test.iloc[-1, 3:].values.reshape(1, -1)


In [6]:
predictions = []
current_sales = supervised_data_test['Hoodie'].values[-1]
for _ in range(5):
    # Predict the next week's sales  NB: By reshaping future_weeks_data , we convert it into a 2D array with one row and x columns.
    next_week_pred = lr_model.predict(future_weeks_data.reshape(1, -1))

    # Update the input features for the next prediction
    future_weeks_data[0, :-1] = np.roll(future_weeks_data[0, :-1], shift=-1)
    future_weeks_data[0, -1] = next_week_pred[0]
    next_week_sales_pred = current_sales + next_week_pred[0]

    # Update current_sales for the next iteration
    current_sales = next_week_sales_pred
        # Update current_sales for the next iteration

    # Append the predicted value to the result list
    predictions.append(next_week_sales_pred)

# Create a DataFrame to store the predictions
prediction_dates = pd.date_range(start='2020-09-10', periods=5, freq='W')
prediction_df = pd.DataFrame({'date': prediction_dates, 'sales_diff_prediction': predictions})

# Display the predictions
print(prediction_df.tail())


        date  sales_diff_prediction
0 2020-09-13                 7130.0
1 2020-09-20                 6012.0
2 2020-09-27                 7419.0
3 2020-10-04                 8826.0
4 2020-10-11                 7979.0


