In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score

import warnings
warnings.simplefilter(action='ignore', category='FutureWarning')

path = 'data/'

In [2]:
df_sales = pd.read_csv(path + 'sales.csv')
df_sales['UNIT_VALUE'] = df_sales['REVENUE']/df_sales['QTY_ORDER']
df_sales.head()

Unnamed: 0,PROD_ID,DATE_ORDER,QTY_ORDER,REVENUE,UNIT_VALUE
0,P6,2015-08-02,1.0,1808.99,1808.99
1,P6,2015-08-17,1.0,1674.0,1674.0
2,P6,2015-08-17,1.0,1673.95,1673.95
3,P6,2015-08-11,1.0,1674.0,1674.0
4,P6,2015-08-17,1.0,1674.0,1674.0


In [3]:
df_sales['PROD_CAT'] = df_sales['PROD_ID'].astype('category').cat.codes
df_sales.head()

Unnamed: 0,PROD_ID,DATE_ORDER,QTY_ORDER,REVENUE,UNIT_VALUE,PROD_CAT
0,P6,2015-08-02,1.0,1808.99,1808.99,5
1,P6,2015-08-17,1.0,1674.0,1674.0,5
2,P6,2015-08-17,1.0,1673.95,1673.95,5
3,P6,2015-08-11,1.0,1674.0,1674.0,5
4,P6,2015-08-17,1.0,1674.0,1674.0,5


Without date

In [4]:
df = df_sales.groupby(by=['PROD_CAT', 'UNIT_VALUE'])[['QTY_ORDER']].sum().reset_index()

Models
- Linear Regression
- XGboost

In [5]:
test_data_size = 0.3
x_train, x_test, y_train, y_test = train_test_split(df[['PROD_CAT','UNIT_VALUE']], df['QTY_ORDER'], test_size=test_data_size, random_state=0)

## Standardization

In [6]:
z1 = x_train.groupby(['PROD_CAT'], as_index=False).agg({'UNIT_VALUE':['mean','std']}).reset_index(drop=True)

In [7]:
def stand(z, product, value):
    z_mean = z.loc[z['PROD_CAT'] == product]['UNIT_VALUE']['mean'].values[0]
    z_std =  z.loc[z['PROD_CAT'] == product]['UNIT_VALUE']['std'].values[0]
    
    if z_std == 0:
        return value
    
    return (value-z_mean)/z_std

In [8]:
x_train['VALUE_STAND'] = x_train.apply(lambda row: stand(z1, row['PROD_CAT'], row['UNIT_VALUE']), axis=1)
x_test['VALUE_STAND'] = x_test.apply(lambda row: stand(z1, row['PROD_CAT'], row['UNIT_VALUE']), axis=1)

In [9]:
x_train = x_train[['PROD_CAT', 'VALUE_STAND']]
x_test = x_test[['PROD_CAT', 'VALUE_STAND']]

## Linear Regression

In [10]:
model = LinearRegression()
model.fit(x_train , y_train)  

y_train_pred = model.predict(x_train)
y_pred = model.predict(x_test)

model.score(x_test, y_test)

-0.00016040785146853587

In [11]:
mse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

mse_test = np.sqrt(mean_squared_error(y_test, y_pred))
r2_test = r2_score(y_test, y_pred)


print("Performace on the training set")
print("Mean squared error: {}".format(mse_train))
print("RS score: {}".format(r2_train))
print("\n")
print("Performace on the test set")
print("Mean squared error: {}".format(mse_test))
print("RS score: {}".format(r2_test))


# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Performace on the training set
Mean squared error: 1289.7242179843995
RS score: 0.0004867231585152787


Performace on the test set
Mean squared error: 2580.5162272552693
RS score: -0.00016040785146853587


## Polynomial Regression

In [12]:
transformer  = PolynomialFeatures(degree=2, include_bias=True)

In [13]:
x_train_trans = transformer.fit_transform(x_train)
x_test_trans = transformer.fit_transform(x_test)

In [14]:
model = LinearRegression(fit_intercept=False)
model.fit(x_train_trans , y_train)  

y_train_pred = model.predict(x_train_trans)
y_pred = model.predict(x_test_trans)

model.score(x_test_trans, y_test)

-0.0002964684580395449

## XGBoost

In [20]:
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)
model.fit(x_train, y_train)

TypeError: issubclass() arg 2 must be a class or tuple of classes

## To do:
- Fix the XGBoost
- Try other models
- Analyse the data
- Do the graphs
