# Testing the trained models

## Problem 2 
### Part 1: Predict gross income using multiple linear regression with Lasso regularization

In [25]:
import joblib 

# Load the test-set features for model 1 (Problem 2) and display them.
# NOTE: joblib.load unpickles the file, which can run arbitrary code; only
# load .pkl files that come from a trusted source.

X_test_M1 = joblib.load('test_sets/X_test_M1.pkl')
X_test_M1

Unnamed: 0,Product line,Unit price,Quantity,Day of Week,Time
686,Sports and travel,64.83,2,1,1
195,Fashion accessories,74.29,1,6,4
374,Home and lifestyle,67.09,5,3,2
86,Fashion accessories,76.52,5,0,1
40,Home and lifestyle,86.72,1,3,3
...,...,...,...,...,...
609,Food and beverages,57.89,2,3,1
54,Home and lifestyle,16.16,2,3,1
580,Food and beverages,27.22,3,0,1
988,Electronic accessories,82.34,10,4,4


In [28]:
# Load the test-set targets for model 1 (Problem 2) and display them.
t_test_M1 = joblib.load('test_sets/t_test_M1.pkl')
t_test_M1

686     6.4830
195     3.7145
374    16.7725
86     19.1300
40      4.3360
        ...   
609     5.7890
54      1.6160
580     4.0830
988    41.1700
743     3.7690
Name: gross income, Length: 200, dtype: float64

In [30]:
# Load the saved Lasso pipeline for Problem 2 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M1_lasso = joblib.load('pipelines/pipeline_problem2_lasso.pkl')
pipeline_M1_lasso

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Unit price', 'Quantity']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line',
                                                   'Day of Week', 'Time'])])),
                ('lasso_reg', Lasso(alpha=0.07581981981981982))])

In [5]:
from sklearn.metrics import r2_score

# Coefficient of determination (r2) of the loaded Lasso pipeline's predictions on the test set.
r2_score(t_test_M1, pipeline_M1_lasso.predict(X_test_M1))

0.8762544491843862

In [32]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated r2 scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.
scores_1 = cross_val_score(pipeline_M1_lasso, X_test_M1, t_test_M1,
                        cv=10,
                        scoring='r2')

scores_1

array([0.83039798, 0.6730198 , 0.89708989, 0.87412847, 0.87158382,
       0.72770541, 0.92813058, 0.81338891, 0.91059734, 0.87875687])

In [33]:
from scipy import stats
import numpy as np
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_1.size - 1,
        loc=np.mean(scores_1),
        scale=np.std(scores_1, ddof=1) / np.sqrt(scores_1.size),
    ),
)

95% CI:  (0.7816033670686401, 0.8993564445512715)


### Part 2: Predict gross income using multiple linear regression without Lasso regularization

In [9]:
# Load the saved plain linear-regression pipeline (no Lasso) for Problem 2 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M1_noLasso = joblib.load('pipelines/pipeline_problem2_noLasso.pkl')
pipeline_M1_noLasso

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Unit price', 'Quantity']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line',
                                                   'Day of Week', 'Time'])])),
                ('lin_reg', LinearRegression(fit_intercept=False))])

In [10]:
# Coefficient of determination (r2) of the loaded no-Lasso pipeline's predictions on the test set.
r2_score(t_test_M1, pipeline_M1_noLasso.predict(X_test_M1))

0.8767654460097892

In [11]:
# 10-fold cross-validated r2 scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.
scores_2 = cross_val_score(pipeline_M1_noLasso, X_test_M1, t_test_M1,
                        cv=10,
                        scoring='r2')

scores_2

array([0.83519447, 0.66135093, 0.90034951, 0.87620638, 0.87235645,
       0.64192114, 0.91118906, 0.77767678, 0.91237427, 0.86237491])

In [12]:
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_2.size - 1,
        loc=np.mean(scores_2),
        scale=np.std(scores_2, ddof=1) / np.sqrt(scores_2.size),
    ),
)

95% CI:  (0.7537052484687681, 0.8964935272226313)


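Based on these results, the two models perform almost identically on the test set. The second model, where Lasso regularization is not used, has a marginally higher r2 score (0.8768 vs. 0.8763), but its 95% CI is wider and sits lower, (0.754, 0.896) versus (0.782, 0.899) for the Lasso model. The second model is therefore the better choice only on test-set r2, and only by a negligible margin.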

## Problem 3
### Part 1: Predict Unit price using multiple linear regression with Lasso Regularization

In [34]:
# Load the test-set features for model 2 (Problem 3) and display them.
# NOTE: joblib.load unpickles the file, which can run arbitrary code; only
# load .pkl files that come from a trusted source.

X_test_M2 = joblib.load('test_sets/X_test_M2.pkl')
X_test_M2

Unnamed: 0,Product line,Quantity,gross income,Day of Week,Time
479,Electronic accessories,1,1.9300,1,1
876,Fashion accessories,1,2.6175,1,3
285,Health and beauty,5,16.6700,2,3
419,Electronic accessories,7,8.8270,0,1
825,Sports and travel,4,4.4020,1,3
...,...,...,...,...,...
841,Electronic accessories,1,3.0150,3,3
132,Sports and travel,4,6.9680,6,3
438,Food and beverages,4,3.4080,4,4
129,Sports and travel,9,40.6260,4,1


In [35]:
# Load the test-set targets for model 2 (Problem 3) and display them.
t_test_M2 = joblib.load('test_sets/t_test_M2.pkl')
t_test_M2

479    38.60
876    52.35
285    66.68
419    25.22
825    22.01
       ...  
841    60.30
132    34.84
438    17.04
129    90.28
632    83.77
Name: Unit price, Length: 200, dtype: float64

In [38]:
# Load the saved Lasso pipeline for Problem 3 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M2_lasso = joblib.load('pipelines/pipeline_problem3_lasso.pkl')
pipeline_M2_lasso

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['gross income', 'Quantity']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line',
                                                   'Day of Week', 'Time'])])),
                ('lasso_reg', Lasso(alpha=0.1))])

In [39]:
# Coefficient of determination (r2) of the loaded Lasso pipeline's predictions on the test set.
r2_score(t_test_M2, pipeline_M2_lasso.predict(X_test_M2))

0.7841479066711223

In [40]:
# 10-fold cross-validated r2 scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.
scores_3 = cross_val_score(pipeline_M2_lasso, X_test_M2, t_test_M2,
                        cv=10,
                        scoring='r2')

scores_3

array([0.86950075, 0.8031451 , 0.73594587, 0.85515594, 0.93647798,
       0.53195232, 0.83182797, 0.60362848, 0.87909811, 0.8164887 ])

In [41]:
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_3.size - 1,
        loc=np.mean(scores_3),
        scale=np.std(scores_3, ddof=1) / np.sqrt(scores_3.size),
    ),
)

95% CI:  (0.6949934932240026, 0.8776507514097328)


### Part 2: Predict Unit price using Multiple Linear Regression without Lasso Regularization

In [42]:
# Load the saved plain linear-regression pipeline (no Lasso) for Problem 3 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M2_noLasso = joblib.load('pipelines/pipeline_problem3_noLasso.pkl')
pipeline_M2_noLasso

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['gross income', 'Quantity']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line',
                                                   'Day of Week', 'Time'])])),
                ('lin_reg', LinearRegression(fit_intercept=False))])

In [43]:
# Coefficient of determination (r2) of the loaded no-Lasso pipeline's predictions on the test set.
r2_score(t_test_M2, pipeline_M2_noLasso.predict(X_test_M2))

0.778630710608738

In [44]:
# 10-fold cross-validated r2 scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.
scores_4 = cross_val_score(pipeline_M2_noLasso, X_test_M2, t_test_M2,
                        cv=10,
                        scoring='r2')

scores_4

array([0.87010075, 0.79054475, 0.72368755, 0.85453321, 0.92904932,
       0.51938385, 0.81663286, 0.60859172, 0.86151834, 0.80724193])

In [45]:
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_4.size - 1,
        loc=np.mean(scores_4),
        scale=np.std(scores_4, ddof=1) / np.sqrt(scores_4.size),
    ),
)

95% CI:  (0.6872761590283674, 0.8689806978946246)


Since the first model, where Lasso regularization is used, has a higher r2 score (0.7841 vs. 0.7786) and a 95% CI that sits slightly higher, (0.695, 0.878) versus (0.687, 0.869), it is the better choice, although the two intervals overlap almost entirely.

## Problem 6
### Part 1: Predict the day of purchase using a Decision Tree Classifier

In [51]:
# Load the test-set features for model 3 (Problem 6) and display them.
# NOTE: joblib.load unpickles the file, which can run arbitrary code; only
# load .pkl files that come from a trusted source.

X_test_M3 = joblib.load('test_sets/X_test_M3.pkl')
X_test_M3

Unnamed: 0,Product line,Unit price,Quantity,gross income,Time
686,Sports and travel,64.83,2,6.4830,1
195,Fashion accessories,74.29,1,3.7145,4
374,Home and lifestyle,67.09,5,16.7725,2
86,Fashion accessories,76.52,5,19.1300,1
40,Home and lifestyle,86.72,1,4.3360,3
...,...,...,...,...,...
609,Food and beverages,57.89,2,5.7890,1
54,Home and lifestyle,16.16,2,1.6160,1
580,Food and beverages,27.22,3,4.0830,1
988,Electronic accessories,82.34,10,41.1700,4


In [52]:
# Load the test-set targets for model 3 (Problem 6) and display them.
t_test_M3 = joblib.load('test_sets/t_test_M3.pkl')
t_test_M3

686    1
195    6
374    3
86     0
40     3
      ..
609    3
54     3
580    0
988    4
743    2
Name: Day of Week, Length: 200, dtype: int64

In [53]:
# Load the saved decision-tree pipeline for Problem 6 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M3_DT = joblib.load('pipelines/pipeline_problem6_DT.pkl')
pipeline_M3_DT

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Unit price', 'Quantity',
                                                   'gross income']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line', 'Time'])])),
                ('DT',
                 DecisionTreeClassifier(criterion='entropy', max_depth=9,
                                        min_samples_leaf=4, random_state=0))])

In [54]:
# Accuracy of the loaded decision-tree pipeline's predictions on the test set.
from sklearn.metrics import accuracy_score

accuracy_score(t_test_M3, pipeline_M3_DT.predict(X_test_M3))

0.16

In [55]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.

scores_5 = cross_val_score(pipeline_M3_DT, X_test_M3, t_test_M3,
                        cv=10,
                        scoring='accuracy')

scores_5

array([0.15, 0.25, 0.2 , 0.3 , 0.1 , 0.1 , 0.05, 0.2 , 0.1 , 0.05])

In [56]:
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_5.size - 1,
        loc=np.mean(scores_5),
        scale=np.std(scores_5, ddof=1) / np.sqrt(scores_5.size),
    ),
)

95% CI:  (0.08920635295607461, 0.21079364704392545)


### Part 2: Predict the day of purchase using a Random Forest Classifier

In [66]:
# Load the saved random-forest pipeline for Problem 6 and display its steps.
# NOTE: this is a pickle load; only use pipeline files from a trusted source.

pipeline_M3_RF = joblib.load('pipelines/pipeline_problem6_rf.pkl')
pipeline_M3_RF

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Unit price', 'Quantity',
                                                   'gross income']),
                                                 ('cat',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder())]),
                                                  ['Product line', 'Time'])])),
                ('RF',
                 RandomForestClassifier(max_depth=9, min_samples_split=3,
                                        n_estimators=50, random_state=0))])

In [67]:
# Accuracy of the loaded random-forest pipeline's predictions on the test set.

accuracy_score(t_test_M3, pipeline_M3_RF.predict(X_test_M3))

0.155

In [68]:
# 10-fold cross-validated accuracy scores on the test data; used in the next cell to build the 95% CI.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a fresh clone of
# the pipeline on each split of the data it is given, so these scores come from
# models refit on ~180 of the 200 test rows, not from the loaded pipeline --
# confirm this is the intended way to estimate the CI.

scores_6 = cross_val_score(pipeline_M3_RF, X_test_M3, t_test_M3,
                        cv=10,
                        scoring='accuracy')

scores_6

array([0.3 , 0.1 , 0.05, 0.2 , 0.05, 0.1 , 0.15, 0.  , 0.1 , 0.15])

In [69]:
# 95% Student-t confidence interval for the mean of the fold scores:
# n - 1 degrees of freedom, centred on the sample mean, scaled by the
# standard error of the mean (sample std with ddof=1 over sqrt(n)).
confidence = 0.95

print(
    '95% CI: ',
    stats.t.interval(
        confidence,
        scores_6.size - 1,
        loc=np.mean(scores_6),
        scale=np.std(scores_6, ddof=1) / np.sqrt(scores_6.size),
    ),
)

95% CI:  (0.058740494441899556, 0.18125950555810044)


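Both classifiers have a similar accuracy score of around 0.16 (0.16 for the decision tree and 0.155 for the random forest), which is close to the roughly 1/7 ≈ 0.14 expected from guessing among seven days of the week. Their 95% CIs, (0.089, 0.211) for the decision tree and (0.059, 0.181) for the random forest, have almost the same width and overlap heavily, so these results do not clearly favour either classifier; if anything, the decision tree is marginally ahead.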