# Machine Learning Model

In [59]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
import joblib

In [39]:
def pipeline(path):
  """Read the CSV at `path` and return a cleaned frame with calendar features.

  Steps:
    1. Parse the `date` column as datetimes while reading.
    2. Remove duplicate rows and rows containing any missing value.
    3. Discard rows where `invested` or `returned` is below 1.
    4. Capitalize `sales_channel` (first letter upper-case, rest lower-case).
    5. Append `month` (month name), `day_of_week` (day name) and
       `day_of_month` (day number), all derived from `date`.

  Parameters
  ----------
  path : anything accepted by `pd.read_csv` (file path or file-like object).

  Returns
  -------
  pd.DataFrame with a fresh 0..n-1 index.
  """
  raw = pd.read_csv(path, parse_dates=['date'])
  deduped = raw.drop_duplicates().dropna()

  # rows with a non-positive investment or return are treated as invalid
  has_bad_amount = (deduped['invested'] < 1) | (deduped['returned'] < 1)
  kept = deduped.loc[~has_bad_amount]

  dates = kept['date']
  enriched = kept.assign(
    sales_channel=kept['sales_channel'].str.capitalize(),
    month=dates.dt.month_name(),
    day_of_week=dates.dt.day_name(),
    day_of_month=dates.dt.day,
  )

  return enriched.reset_index(drop=True)

In [40]:
# Load the raw CSV and run it through pipeline() (cleaning + calendar features).
df = pipeline('data.csv')

In [41]:
df

Unnamed: 0,date,invested,returned,sales_channel,month,day_of_week,day_of_month
0,2020-01-01,74025.0,108610.0,Print advertising,January,Wednesday,1
1,2020-01-02,33993.0,137278.0,Tv,January,Thursday,2
2,2020-01-03,92217.0,28635.0,Social media,January,Friday,3
3,2020-01-04,79190.0,34487.0,Radio,January,Saturday,4
4,2020-01-05,40134.0,43405.0,Social media,January,Sunday,5
...,...,...,...,...,...,...,...
897,2022-12-27,34306.0,14006.0,Tv,December,Tuesday,27
898,2022-12-28,42139.0,37726.0,Print advertising,December,Wednesday,28
899,2022-12-29,87553.0,47059.0,Radio,December,Thursday,29
900,2022-12-30,20543.0,13637.0,Tv,December,Friday,30


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902 entries, 0 to 901
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           902 non-null    datetime64[ns]
 1   invested       902 non-null    float64       
 2   returned       902 non-null    float64       
 3   sales_channel  902 non-null    object        
 4   month          902 non-null    object        
 5   day_of_week    902 non-null    object        
 6   day_of_month   902 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 49.5+ KB


In [43]:
# The raw timestamp is no longer needed: month, day_of_week and day_of_month
# were already derived from it inside pipeline().
df = df.drop(columns='date')

In [44]:
# Keep a separate frame holding only the string-typed (object) columns.
categorical_features = df.select_dtypes(include='object')
categorical_features

Unnamed: 0,sales_channel,month,day_of_week
0,Print advertising,January,Wednesday
1,Tv,January,Thursday
2,Social media,January,Friday
3,Radio,January,Saturday
4,Social media,January,Sunday
...,...,...,...
897,Tv,December,Tuesday
898,Print advertising,December,Wednesday
899,Radio,December,Thursday
900,Tv,December,Friday


In [45]:
# Fit one LabelEncoder per categorical column. Fitting and transforming both
# read the original strings kept in `categorical_features`, not the columns of
# `df` that this cell overwrites, so re-running the cell reproduces the same
# string -> integer mapping instead of refitting on already-encoded integers.
labelEncoder_fit = [
  (feature, LabelEncoder().fit(categorical_features[feature]))
  for feature in categorical_features.columns
]

# joblib.dump(labelEncoder_fit, './assets/labelEncoder_fit.jbl')
# (disabled) exports the fitted encoders so the deployed model can reuse
# the same mapping later on

for feature, fit in labelEncoder_fit:
  # replace each categorical column's strings with integer codes 0, 1, 2, ...
  df[feature] = fit.transform(categorical_features[feature])

df.head()

Unnamed: 0,invested,returned,sales_channel,month,day_of_week,day_of_month
0,74025.0,108610.0,0,4,6,1
1,33993.0,137278.0,3,4,4,2
2,92217.0,28635.0,2,4,0,3
3,79190.0,34487.0,1,4,2,4
4,40134.0,43405.0,2,4,3,5


In [46]:
df.describe()

Unnamed: 0,invested,returned,sales_channel,month,day_of_week,day_of_month
count,902.0,902.0,902.0,902.0,902.0,902.0
mean,60731.542129,75300.872506,1.810421,5.506652,2.990022,15.832594
std,23634.155846,37787.897986,1.062206,3.443771,1.99024,8.843799
min,20047.0,10154.0,0.0,0.0,0.0,1.0
25%,40787.25,43062.75,1.0,2.25,1.0,8.0
50%,59914.0,75735.0,2.0,5.5,3.0,16.0
75%,82429.5,107867.0,3.0,9.0,5.0,23.0
max,99997.0,139917.0,3.0,11.0,6.0,31.0


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902 entries, 0 to 901
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   invested       902 non-null    float64
 1   returned       902 non-null    float64
 2   sales_channel  902 non-null    int64  
 3   month          902 non-null    int64  
 4   day_of_week    902 non-null    int64  
 5   day_of_month   902 non-null    int64  
dtypes: float64(2), int64(4)
memory usage: 42.4 KB


In [48]:
# for categorical_feature in categorical_features.columns:
#   df[categorical_feature] = df[categorical_feature].astype('category')

In [49]:
# df.info()

In [50]:
# `returned` is the regression target; every other column is a feature.
target = 'returned'
y = df[target]
x = df.drop(columns=[target])

In [51]:
x

Unnamed: 0,invested,sales_channel,month,day_of_week,day_of_month
0,74025.0,0,4,6,1
1,33993.0,3,4,4,2
2,92217.0,2,4,0,3
3,79190.0,1,4,2,4
4,40134.0,2,4,3,5
...,...,...,...,...,...
897,34306.0,3,2,5,27
898,42139.0,0,2,6,28
899,87553.0,1,2,4,29
900,20543.0,3,2,0,30


In [52]:
y

0      108610.0
1      137278.0
2       28635.0
3       34487.0
4       43405.0
         ...   
897     14006.0
898     37726.0
899     47059.0
900     13637.0
901     23371.0
Name: returned, Length: 902, dtype: float64

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.2,
  random_state=7,
)

In [54]:
# Sanity-check the sizes of the four split parts.
print(
  f'x_train = {x_train.shape} | y_train = {y_train.shape} | '
  f'x_test = {x_test.shape} | y_test = {y_test.shape}'
)

x_train = (721, 5) | y_train = (721,) | x_test = (181, 5) | y_test = (181,)


In [55]:
# XGBoost regressor constructed with no arguments, i.e. the library's defaults.
model = xgb.XGBRegressor()

In [58]:
model.objective

'reg:squarederror'

In [61]:
# 5-fold cross-validation on the training split. The scorer is the *negated*
# RMSE, so the mean shown below is negative and values closer to 0 are better.
cv_scores = cross_val_score(
  model,
  x_train,
  y_train,
  cv=5,
  scoring='neg_root_mean_squared_error',
)
cv_scores.mean()

-45362.88399628062