## Model

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Read the labelled sales data from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/online_sales.csv')
df.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited,converted
0,25,1,1,0
1,23,1,5,0
2,28,1,4,0
3,39,1,5,0
4,30,1,6,0


In [3]:
# Count rows per class of the target column to see how imbalanced it is.
df['converted'].value_counts()

converted
0    306000
1     10200
Name: count, dtype: int64

In [4]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316200 entries, 0 to 316199
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   age                  316200 non-null  int64
 1   new_user             316200 non-null  int64
 2   total_pages_visited  316200 non-null  int64
 3   converted            316200 non-null  int64
dtypes: int64(4)
memory usage: 9.6 MB


In [5]:
# Name the target column once, then treat every other column as a feature.
output_cols = 'converted'
input_cols = [name for name in df.columns if name != output_cols]
print(input_cols)

['age', 'new_user', 'total_pages_visited']


In [6]:
# Features as a plain NumPy array; the target stays a pandas Series.
X = df[input_cols].to_numpy()
y = df[output_cols]

In [7]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(316200, 3) (316200,)


In [8]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(252960, 3) (252960,)
(63240, 3) (63240,)


In [9]:
# The target is heavily imbalanced, so weight the classes inversely to their
# frequency instead of letting the majority class dominate the fit.
classifier = LogisticRegression(class_weight='balanced')
lr = classifier.fit(X_train, y_train)

In [10]:
# Score on the held-out split; this is the same figure as the accuracy row of
# the classification report below, so read it alongside per-class recall.
lr.score(X_test, y_test)

0.9353573687539531

In [11]:
# Predicted class labels for the held-out split.
preds = lr.predict(X_test)

In [12]:
# Per-class precision / recall / F1 on the held-out split, with readable
# labels for class 0 and class 1.
report = classification_report(
    y_test,
    preds,
    target_names=['not converted', 'converted'],
)
print(report)

               precision    recall  f1-score   support

not converted       1.00      0.94      0.97     61200
    converted       0.32      0.93      0.48      2040

     accuracy                           0.94     63240
    macro avg       0.66      0.93      0.72     63240
 weighted avg       0.98      0.94      0.95     63240



## Export the Trained Model

In [13]:
import pickle

In [14]:
# Persist the fitted model to disk. The context manager closes the file even
# if pickling raises, so a failed dump cannot leave an open handle behind.
with open("logreg.pkl", "wb") as pickle_out:
    pickle.dump(lr, pickle_out)

## Test Model

In [15]:
# Reload the model from disk to confirm the exported file is usable.
# The context manager closes the handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("logreg.pkl", "rb") as pickle_in:
    model = pickle.load(pickle_in)

In [16]:
# Smoke test on one hand-written row, ordered like the training features:
# [age, new_user, total_pages_visited].
model.predict([[45,0,5]])[0]

0

In [17]:
# Read the unlabelled rows to score and preview the first five.
df_test = pd.read_csv(filepath_or_buffer='data/test_data.csv')
df_test.head(n=5)

Unnamed: 0,age,new_user,total_pages_visited
0,36,1,5
1,18,0,6
2,21,1,14
3,31,1,3
4,27,1,2


In [18]:
# Select the features by name so they reach the model in the same order used
# for training, even if the CSV has reordered or additional columns.
prediction = model.predict(df_test[input_cols].to_numpy())
# .tolist() converts NumPy scalars to plain Python ints, so the printed list
# reads [0, 1, ...] regardless of the installed NumPy version's scalar repr.
print(prediction.tolist())

[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
