<a href="https://colab.research.google.com/github/guilhermelaviola/SalesPrediction/blob/main/NumberOfOrdersOrdersPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Importing all the necessary libraries:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
import lightgbm as ltb

In [2]:
# Loading the supplement-sales dataset from its public GitHub URL:
DATASET_URL = 'https://raw.githubusercontent.com/amankharwal/Website-data/master/supplement.csv'
data = pd.read_csv(DATASET_URL)
# Displaying the first 10 rows of the dataset:
data.head(10)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52
5,T1000006,249,S1,L3,R2,2018-01-01,1,Yes,39,34211.22
6,T1000007,248,S1,L1,R2,2018-01-01,1,Yes,40,35352.66
7,T1000008,247,S1,L1,R3,2018-01-01,1,Yes,64,52650.0
8,T1000009,246,S3,L1,R3,2018-01-01,1,Yes,62,42633.78
9,T1000010,254,S4,L1,R1,2018-01-01,1,Yes,87,62572.8


In [3]:
# Inspecting the structure of the dataset: column names, non-null counts,
# dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB


In [4]:
# Counting the missing values in each column of the dataset:
data.isna().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns:
data.describe()

Unnamed: 0,Store_id,Holiday,#Order,Sales
count,188340.0,188340.0,188340.0,188340.0
mean,183.0,0.131783,68.205692,42784.327982
std,105.366308,0.338256,30.467415,18456.708302
min,1.0,0.0,0.0,0.0
25%,92.0,0.0,48.0,30426.0
50%,183.0,0.0,63.0,39678.0
75%,274.0,0.0,82.0,51909.0
max,365.0,1.0,371.0,247215.0


In [6]:
# Exploring some of the important features from the dataset with a pie chart
# to know about the factors affecting the number of orders.
# The '#Order' column is summed per store type so that every slice is that
# store type's share of all orders; value_counts() would only count the rows
# (records) per store type, not the orders.
pie = (data.groupby('Store_Type')['#Order']
           .sum()
           .sort_values(ascending = False))
store = pie.index
orders = pie.values

# Plot from the small aggregated frame rather than the full dataset.
fig = px.pie(pie.reset_index(), values = '#Order', names = 'Store_Type',
             title = 'Share of orders by store type')
fig.show()

In [7]:
# Checking the distribution of the number of orders according to the location
# with a pie chart.
# The '#Order' column is summed per location type so that every slice is that
# location type's share of all orders; value_counts() would only count the
# rows (records) per location type, not the orders.
pie2 = (data.groupby('Location_Type')['#Order']
            .sum()
            .sort_values(ascending = False))
location = pie2.index
orders = pie2.values

# Plot from the small aggregated frame rather than the full dataset.
fig = px.pie(pie2.reset_index(), values = '#Order', names = 'Location_Type',
             title = 'Share of orders by location type')
fig.show()

In [8]:
# Checking the distribution of the number of orders according to the
# discount with a pie chart.
# The '#Order' column is summed per discount flag so that every slice is the
# share of all orders placed with / without a discount; value_counts() would
# only count the rows (records) per flag, not the orders.
pie3 = (data.groupby('Discount')['#Order']
            .sum()
            .sort_values(ascending = False))
discount = pie3.index
orders = pie3.values

# Plot from the small aggregated frame rather than the full dataset.
fig = px.pie(pie3.reset_index(), values = '#Order', names = 'Discount',
             title = 'Share of orders by discount')
fig.show()

In [9]:
# Checking how the holidays affect the number of orders with a pie chart.
# The '#Order' column is summed per holiday flag so that every slice is the
# share of all orders placed on holidays / non-holidays; value_counts() would
# only count the rows (records) per flag, not the orders.
pie4 = (data.groupby('Holiday')['#Order']
            .sum()
            .sort_values(ascending = False))
holiday = pie4.index
orders = pie4.values

# Plot from the small aggregated frame rather than the full dataset.
fig = px.pie(pie4.reset_index(), values = '#Order', names = 'Holiday',
             title = 'Share of orders by holiday flag')
fig.show()

In [10]:
# Preparing the data for the usage of a Machine Learning model
# for the task of the number of orders prediction: the categorical
# columns are encoded as integers, rows that could not be encoded are
# dropped, and the feature matrix `x` and target vector `y` are built.
DISCOUNT_CODES = {'No' : 0, 'Yes' : 1}
STORE_TYPE_CODES = {'S1' : 1, 'S2' : 2, 'S3' : 3, 'S4' : 4}
LOCATION_TYPE_CODES = {'L1' : 1, 'L2' : 2, 'L3' : 3, 'L4' : 4, 'L5' : 5}


def encode_labels(column, codes):
    """Replace the labels in `column` with their integer `codes`.

    Values that already are one of the codes pass through unchanged, so
    running this cell a second time leaves an encoded column as it is
    (a plain `column.map(codes)` would turn every value into NaN on its
    second application). Any other value becomes NaN.
    """
    known_codes = set(codes.values())
    return column.map(
        lambda value: codes.get(value,
                                value if value in known_codes else np.nan))


data['Discount'] = encode_labels(data['Discount'], DISCOUNT_CODES)
data['Store_Type'] = encode_labels(data['Store_Type'], STORE_TYPE_CODES)
data['Location_Type'] = encode_labels(data['Location_Type'],
                                      LOCATION_TYPE_CODES)

# dropna() returns a new DataFrame; without the assignment the call has no
# effect and rows holding NaN (e.g. unknown categories) would be kept.
data = data.dropna()

x = np.array(data[['Store_Type', 'Location_Type', 'Holiday', 'Discount']])
y = np.array(data['#Order'])

In [11]:
# Splitting the data into an 80% training set and a 20% test set
# (the fixed random_state keeps the split reproducible):
split = train_test_split(x, y, test_size = 0.2, random_state = 42)
x_train, x_test, y_train, y_test = split

In [13]:
# Fitting a LightGBM gradient boosting regressor (default hyper-parameters)
# on the training set; fit() returns the fitted estimator itself.
model = ltb.LGBMRegressor().fit(x_train, y_train)
model

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15
[LightGBM] [Info] Number of data points in the train set: 150672, number of used features: 4
[LightGBM] [Info] Start training from score 68.163401


In [16]:
# Predicting the number of orders for the test set and displaying the
# first predicted values.
# The predictions are stored under their own name: binding them to `data`
# would overwrite the dataset loaded at the top of the notebook, so earlier
# cells could no longer be re-run.
y_pred = model.predict(x_test)
predictions = pd.DataFrame({'Predicted Orders' : y_pred})
predictions.head()

   Predicted Orders
0         47.351897
1         97.068717
2         66.577788
3         85.143083
4         54.451098
