### Import Libraries (Exercise 3: Training a Classification Model)

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Allow up to 400 rows to be printed before pandas truncates a displayed frame or series.
pd.options.display.max_rows = 400

### Load data and explore

In [2]:
# Load the exercise data. The path is relative, so the CSV has to be in the
# notebook's working directory.
df = pd.read_csv("lab4q2.csv")

In [3]:
# Peek at the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,Date,Day,Sales,Temperature,Flyers,rainfall_log_value,Price,Profitable
0,1/1/2017,Sunday,10,-2.085514,-1.921262,0.866486,0,0
1,2/1/2017,Monday,13,-1.968042,-1.921262,0.622386,0,0
2,3/1/2017,Tuesday,15,-1.621808,-1.009448,0.622386,0,0
3,4/1/2017,Wednesday,17,-1.028266,-0.933464,0.480947,0,0
4,5/1/2017,Thursday,18,-1.133372,-0.553542,0.451754,0,0


In [4]:
# Column dtypes and non-null counts, as a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
Date                  365 non-null object
Day                   365 non-null object
Sales                 365 non-null int64
Temperature           365 non-null float64
Flyers                365 non-null float64
rainfall_log_value    365 non-null float64
Price                 365 non-null int64
Profitable            365 non-null int64
dtypes: float64(3), int64(3), object(2)
memory usage: 22.9+ KB


In [5]:
# Class balance of the target column.
df["Profitable"].value_counts()

0    189
1    176
Name: Profitable, dtype: int64

### Transform data for training

In [6]:
# Drop only the Date column; Day is kept because it is label-encoded further down.
df = df.drop(columns=["Date"])

In [7]:
# Show the frame after the drop to confirm Date is gone.
df

Unnamed: 0,Day,Sales,Temperature,Flyers,rainfall_log_value,Price,Profitable
0,Sunday,10,-2.085514,-1.921262,0.866486,0,0
1,Monday,13,-1.968042,-1.921262,0.622386,0,0
2,Tuesday,15,-1.621808,-1.009448,0.622386,0,0
3,Wednesday,17,-1.028266,-0.933464,0.480947,0,0
4,Thursday,18,-1.133372,-0.553542,0.451754,0,0
...,...,...,...,...,...,...,...
360,Wednesday,19,-1.114824,-0.553542,0.451754,0,0
361,Thursday,16,-1.417778,-0.629526,0.585268,0,0
362,Friday,15,-1.312672,-1.769293,0.585268,0,0
363,Saturday,13,-1.844387,-1.389371,0.665762,0,0


In [8]:
# Pull out the one categorical feature (weekday names) as a Series.
X_cat = df.loc[:, "Day"]

In [9]:
# Preview the weekday names.
X_cat.head()

0       Sunday
1       Monday
2      Tuesday
3    Wednesday
4     Thursday
Name: Day, dtype: object

In [10]:
# Display the whole Day series, not just the head.
X_cat

0         Sunday
1         Monday
2        Tuesday
3      Wednesday
4       Thursday
         ...    
360    Wednesday
361     Thursday
362       Friday
363     Saturday
364       Sunday
Name: Day, Length: 365, dtype: object

In [11]:
# Label encoder for the Day feature: maps each weekday name to an integer code.
# NOTE(review): the codes follow the sorted class names (see the le.classes_ output
# below), so a linear model treats Day as an ordered number. OneHotEncoder is imported
# at the top but not used in the cells shown; consider it for this nominal feature.
le = LabelEncoder()

In [12]:
# Fit the encoder on Day and display the integer codes. The result is not stored
# here; a later cell calls fit_transform again and keeps the output.
le.fit_transform(X_cat)

array([3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3,
       1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1,
       5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5,
       6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6,
       4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4,
       0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2,
       3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3,
       1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1,
       5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5,
       6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6,
       4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4,
       0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0,
       2, 3, 1, 5, 6, 4, 0, 2, 3, 1, 5, 6, 4, 0, 2,

In [13]:
# Learned categories; a day's integer code is its position in this array.
le.classes_

array(['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday'], dtype=object)

In [14]:
# Encode Day as integer codes (a NumPy array at this point).
X_encoded = le.fit_transform(X_cat)

In [16]:
# Wrap the codes in a single-column DataFrame so they can be concatenated with df.
# The column gets the default integer label 0. Note that the name X_encoded is
# rebound from an array to a DataFrame here.
X_encoded = pd.DataFrame(X_encoded)

In [17]:
# Inspect the encoded column (labelled 0).
X_encoded

Unnamed: 0,0
0,3
1,1
2,5
3,6
4,4
...,...
360,6
361,4
362,0
363,2


In [18]:
# Append the encoded Day codes to the original frame as an extra column.
# concat aligns on the row index, which is the default 0..n-1 range on both inputs.
df2 = pd.concat([df, X_encoded], axis="columns")

In [22]:
# Inspect the combined frame; the encoded codes appear as a trailing column labelled 0.
df2

Unnamed: 0,Day,Sales,Temperature,Flyers,rainfall_log_value,Price,Profitable,0
0,Sunday,10,-2.085514,-1.921262,0.866486,0,0,3
1,Monday,13,-1.968042,-1.921262,0.622386,0,0,1
2,Tuesday,15,-1.621808,-1.009448,0.622386,0,0,5
3,Wednesday,17,-1.028266,-0.933464,0.480947,0,0,6
4,Thursday,18,-1.133372,-0.553542,0.451754,0,0,4
5,Friday,11,-2.19062,-1.313386,0.710103,0,0,0
6,Saturday,13,-1.720732,-1.617324,0.710103,0,0,2
7,Sunday,15,-1.436326,-0.933464,0.550787,0,0,3
8,Monday,17,-1.39923,-1.54134,0.550787,0,0,1
9,Tuesday,18,-1.071545,-0.553542,0.480947,0,0,5


In [23]:
# Remove the original text Day column; the integer-coded copy (column 0) stays.
df2 = df2.drop(columns="Day")

In [24]:
# Confirm the text Day column is gone.
df2.head()

Unnamed: 0,Sales,Temperature,Flyers,rainfall_log_value,Price,Profitable,0
0,10,-2.085514,-1.921262,0.866486,0,0,3
1,13,-1.968042,-1.921262,0.622386,0,0,1
2,15,-1.621808,-1.009448,0.622386,0,0,5
3,17,-1.028266,-0.933464,0.480947,0,0,6
4,18,-1.133372,-0.553542,0.451754,0,0,4


In [30]:
# Give the encoded column (default label 0) its proper name, Day.
# Rebinding the result instead of using inplace=True avoids mutating the frame
# in place and keeps the cell a plain transformation of its input.
df2 = df2.rename(columns={0: "Day"})

In [31]:
# Confirm column 0 is now called Day.
df2.head()

Unnamed: 0,Sales,Temperature,Flyers,rainfall_log_value,Price,Profitable,Day
0,10,-2.085514,-1.921262,0.866486,0,0,3
1,13,-1.968042,-1.921262,0.622386,0,0,1
2,15,-1.621808,-1.009448,0.622386,0,0,5
3,17,-1.028266,-0.933464,0.480947,0,0,6
4,18,-1.133372,-0.553542,0.451754,0,0,4


In [32]:
# Sanity-check the dimensions as (rows, columns).
df2.shape

(365, 7)

In [34]:
# Reorder the columns: the six features first, the Profitable target last.
df2 = df2.loc[
    :,
    [
        'Sales',
        'Temperature',
        'Flyers',
        'rainfall_log_value',
        'Price',
        'Day',
        'Profitable',
    ],
]

In [35]:
# Confirm the new column order, with Profitable last.
df2.head()

Unnamed: 0,Sales,Temperature,Flyers,rainfall_log_value,Price,Day,Profitable
0,10,-2.085514,-1.921262,0.866486,0,3,0
1,13,-1.968042,-1.921262,0.622386,0,1,0
2,15,-1.621808,-1.009448,0.622386,0,5,0
3,17,-1.028266,-0.933464,0.480947,0,6,0
4,18,-1.133372,-0.553542,0.451754,0,4,0


In [36]:
# Separate features from the target by column label rather than by position, so the
# selection no longer depends on the exact column order set up above.
# X: every column except Profitable; y: the Profitable target.
X = df2.drop(columns="Profitable")
y = df2["Profitable"]

In [38]:
# Feature matrix preview.
X.head()

Unnamed: 0,Sales,Temperature,Flyers,rainfall_log_value,Price,Day
0,10,-2.085514,-1.921262,0.866486,0,3
1,13,-1.968042,-1.921262,0.622386,0,1
2,15,-1.621808,-1.009448,0.622386,0,5
3,17,-1.028266,-0.933464,0.480947,0,6
4,18,-1.133372,-0.553542,0.451754,0,4


In [39]:
# Target preview.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Profitable, dtype: int64

In [40]:
# Underlying NumPy arrays of the features and the target.
X.values, y.values

(array([[10.        , -2.08551378, -1.92126193,  0.86648611,  0.        ,
          3.        ],
        [13.        , -1.96804175, -1.92126193,  0.62238571,  0.        ,
          1.        ],
        [15.        , -1.62180842, -1.00944838,  0.62238571,  0.        ,
          5.        ],
        ...,
        [15.        , -1.31267152, -1.769293  ,  0.58526796,  0.        ,
          0.        ],
        [13.        , -1.84438699, -1.38937069,  0.66576208,  0.        ,
          2.        ],
        [ 7.        , -2.82125961, -2.3771687 ,  1.        ,  0.        ,
          3.        ]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 1, 1, 0, 0, 1, 1

### Model Training

In [41]:
# Hold out 30% of the rows for testing. Rows are shuffled before splitting and the
# seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

In [42]:
# Row/column counts of the train and test feature sets.
X_train.shape, X_test.shape

((255, 6), (110, 6))

In [43]:
# Name the solver explicitly. The recorded repr below shows solver='warn', meaning this
# notebook ran on a scikit-learn release whose implicit default was liblinear and which
# warned that the default would change to lbfgs. Pinning liblinear silences that
# warning and keeps the fit consistent with the recorded outputs on newer releases.
model = LogisticRegression(solver="liblinear")

In [44]:
# Fit on the training split. fit returns the estimator itself, which is why the cell
# echoes the model's parameters.
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
# Predict class labels (0 or 1) for the held-out test rows.
y_predict = model.predict(X_test)

In [46]:
y_predict  # Predicted class labels, one per test row

array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1],
      dtype=int64)

In [57]:
# True test labels as a column vector, for eyeballing against the predictions above.
y_test.values.reshape(-1,1)

array([[1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
    

### Model Evaluation

In [59]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print('Accuracy Score is', accuracy)

Accuracy Score is 0.9545454545454546


In [61]:
# Confusion matrix: rows are the true classes, columns the predicted classes
# (scikit-learn convention), both in label order 0, 1.
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
print(cm)

[[52  2]
 [ 3 53]]


In [63]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95        54
           1       0.96      0.95      0.95        56

    accuracy                           0.95       110
   macro avg       0.95      0.95      0.95       110
weighted avg       0.95      0.95      0.95       110

