# Introduction

In this notebook we are going to try few models to predict how many units of a product will be sold given a price

## 1 - Importing the libraries

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import operator
import pickle

from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.simplefilter(action='ignore', category='FutureWarning')

import matplotlib.pyplot as plt
# import pylab

path = 'data/'

## 2 -  Data manipulation

In [59]:
df_sales = pd.read_csv(path + 'sales.csv')
df_sales['UNIT_VALUE'] = df_sales['REVENUE']/df_sales['QTY_ORDER']
# df_sales['PROD_CAT'] = df_sales['PROD_ID'].astype('category').cat.codes
df_sales.head()

Unnamed: 0,PROD_ID,DATE_ORDER,QTY_ORDER,REVENUE,UNIT_VALUE
0,P6,2015-08-02,1.0,1808.99,1808.99
1,P6,2015-08-17,1.0,1674.0,1674.0
2,P6,2015-08-17,1.0,1673.95,1673.95
3,P6,2015-08-11,1.0,1674.0,1674.0
4,P6,2015-08-17,1.0,1674.0,1674.0


In [60]:
df = df_sales.groupby(by=['DATE_ORDER', 'PROD_ID', 'UNIT_VALUE'])[['QTY_ORDER']].sum().reset_index()

Let's remove the outliers

In [61]:
df.describe()

Unnamed: 0,UNIT_VALUE,QTY_ORDER
count,9890.0,9890.0
mean,822.138054,38.298281
std,392.234913,165.165207
min,229.0,1.0
25%,548.0,1.0
50%,719.1,4.0
75%,859.9,13.0
max,2599.0,3978.0


In [62]:
products = sorted(df_sales["PROD_ID"].unique())

In [63]:
print("DF size before change: ", len(df))

DF size before change:  9890


In [64]:
qty_percentile = {}
for prod in products:
    data = df.loc[df['PROD_ID'] == prod]['QTY_ORDER']
    qty_percentile[prod] = np.percentile(data,99)
print(qty_percentile)

{'P1': 46.0, 'P2': 768.44, 'P3': 38.649999999999864, 'P4': 280.53999999999996, 'P5': 398.1500000000002, 'P6': 48.1599999999994, 'P7': 1483.8999999999983, 'P8': 646.7500000000002, 'P9': 519.2599999999989}


In [65]:
for prod in products:
    df = df.loc[~((df['PROD_ID'] == prod) & (df['QTY_ORDER'] > qty_percentile[prod]))]

In [66]:
print("DF size after change: ",len(df))

DF size after change:  9788


## 3 - Creating the models

In [85]:
def create_model(df, prod_cat):
    df = df.loc[df['PROD_ID'] == prod]

    # Splitting the data
    test_data_size = 0.2
    x_train, x_test, y_train, y_test = train_test_split(df[['UNIT_VALUE']], df['QTY_ORDER'], test_size=test_data_size, random_state=0)

    # Standardization
    z_mean = x_train['UNIT_VALUE'].mean()
    z_std = x_train['UNIT_VALUE'].std()
    
    x_train['VALUE_STAND'] = x_train.apply(lambda row: stand(row['UNIT_VALUE']), axis=1)
    x_test['VALUE_STAND'] = x_test.apply(lambda row: stand(row['UNIT_VALUE']), axis=1)
    x_train = x_train[['VALUE_STAND']]
    x_test = x_test[['VALUE_STAND']]
    
    # Transform data
    polyformer  = PolynomialFeatures(degree=2, include_bias=True)
    x_train_model = polyformer.fit_transform(x_train)
    x_test_model = polyformer.fit_transform(x_test)
    model = LinearRegression()
    
    # Train model
    model.fit(x_train_model , y_train)  
    y_train_pred = model.predict(x_train_model)
    y_pred = model.predict(x_test_model)
    
    score = model.score(x_test_model, y_test)

    mse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    r2_train = r2_score(y_train, y_train_pred)

    mse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    print("Model: {}".format('Polynomial linear regression'))
    print("Model Score: {}".format(score))
    print("\n")
    print("Performace on the training set")
    print("Mean squared error: {}".format(mse_train))
    print("RS score: {}".format(r2_train))
    print("\n")
    print("Performace on the test set")
    print("Mean squared error: {}".format(mse_test))
    print("RS score: {}".format(r2_test))
    
    return model, z_mean, z_std

In [68]:
model, z_mean, z_std = create_model(df, 'P1')

Model: Polynomial linear regression
Model Score: 0.026493026579735734


Performace on the training set
Mean squared error: 46.73870009092017
RS score: 0.020251038505376018


Performace on the test set
Mean squared error: 45.71437053005557
RS score: 0.026493026579735734


In [69]:
model, z_mean, z_std = create_model(df, 'P2')

Model: Polynomial linear regression
Model Score: 0.026493026579736175


Performace on the training set
Mean squared error: 46.73870009092017
RS score: 0.020251038505376018


Performace on the test set
Mean squared error: 45.714370530055554
RS score: 0.02649302657973618


In [57]:
teste = [[1]]
polyformer  = PolynomialFeatures(degree=2, include_bias=True)
x_test_model = polyformer.fit_transform(teste)
x_test_model.shape

model.predict(x_test_model)

array([8.52267231])

## 4 -  Saving model

In [79]:
z_scores = pd.DataFrame(columns=[['PROD_ID', 'MEAN', 'STD']])
z_scores

Unnamed: 0,PROD_ID,MEAN,STD


In [80]:
# save the model
for prod in products:
    filename = 'trained_model_{}.sav'.format(prod)
    model, z_mean, z_std = create_model(df, prod)
    pickle.dump(model, open(filename, 'wb'))
    z_scores.loc[len(z_scores)] = [prod, z_mean, z_std]

# load the model
# model = pickle.load(open(filename, 'rb'))

Model: Polynomial linear regression
Model Score: 0.06949321231933914


Performace on the training set
Mean squared error: 7.9975561591715
RS score: 0.06667363959577954


Performace on the test set
Mean squared error: 8.20658306981184
RS score: 0.06949321231933914
Model: Polynomial linear regression
Model Score: 0.010166278806196027


Performace on the training set
Mean squared error: 64.93039683427185
RS score: 0.003880943663556735


Performace on the test set
Mean squared error: 68.1426509887581
RS score: 0.010166278806196027
Model: Polynomial linear regression
Model Score: 0.06250966631797483


Performace on the training set
Mean squared error: 5.123279090857451
RS score: 0.06658234248982142


Performace on the test set
Mean squared error: 4.326502264536097
RS score: 0.06250966631797483
Model: Polynomial linear regression
Model Score: -0.009466458376125919


Performace on the training set
Mean squared error: 36.204516034961735
RS score: 0.009534339137095116


Performace on the test s

In [84]:
print(z_scores)
z_scores.to_csv("z_table.csv", index=False)

  PROD_ID         MEAN         STD
0      P1  1417.677429   87.583247
1      P2   713.876863   79.946288
2      P3  1300.186243  115.602554
3      P4   504.974353   54.222466
4      P5   813.295453  131.287904
5      P6  1757.779065  191.944093
6      P7   744.071136   75.508464
7      P8   444.339749   61.367843
8      P9   443.214940   64.258734


In [95]:
prod = 'P1'
df = df.loc[df['PROD_ID'] == prod]

# Splitting the data
test_data_size = 0.2
x_train, x_test, y_train, y_test = train_test_split(df[['UNIT_VALUE']], df['QTY_ORDER'], test_size=test_data_size, random_state=0)

# Standardization
z_mean = x_train['UNIT_VALUE'].mean()
z_std = x_train['UNIT_VALUE'].std()

x_train['VALUE_STAND'] = x_train.apply(lambda row: stand(row['UNIT_VALUE']), axis=1)
x_test['VALUE_STAND'] = x_test.apply(lambda row: stand(row['UNIT_VALUE']), axis=1)
x_train = x_train[['VALUE_STAND']]
x_test = x_test[['VALUE_STAND']]

# Transform data
polyformer  = PolynomialFeatures(degree=2, include_bias=True)
x_train_model = polyformer.fit_transform(x_train)
x_test_model = polyformer.fit_transform(x_test)
model = LinearRegression()

# Train model
model.fit(x_train_model , y_train)  
y_train_pred = model.predict(x_train_model)
y_pred = model.predict(x_test_model)
