# Logistic Regression (L1 Regularization)

#### Load the packages and import the data

In [1]:
import pandas as pd
import numpy as np

# Read the sample CSV into a DataFrame and preview its first rows.
csv_path = "./Data Files/Log_Reg_Sample_File.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Country,Gender,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,Tunisia,Male,3/27/16 0:53,No
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,Nauru,Male,4/4/16 1:39,No
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,San Marino,Male,3/13/16 20:35,No
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,Italy,Male,1/10/16 2:31,No
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,Iceland,Male,6/3/16 3:36,No


In [2]:
# Show the column labels of the loaded DataFrame.
data.columns

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'City', 'Country', 'Gender',
       'Timestamp', 'Clicked on Ad'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
# Columns used as predictors; Gender is still in its raw form at this point.
feature_columns = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Gender',
]
X = data[feature_columns]

# Target column the model will learn to predict.
y = data["Clicked on Ad"]

#### Encode the categorical variable (Gender) as dummy variables

In [4]:
# One-hot encode Gender, dropping the first level so the remaining indicator
# column(s) are not collinear with the intercept.
# - Reading from `data` instead of `X` keeps this cell re-runnable even after
#   the raw Gender column has been removed from X.
# - dtype=int pins the indicators to 0/1 integers, independent of the default
#   dummy dtype of the installed pandas version.
X_dummies = pd.get_dummies(data[["Gender"]], drop_first=True, dtype=int)

In [5]:
# Attach the dummy column(s) to X and remove the raw Gender column.
# Dummy columns already present in X are dropped before concatenating, and a
# missing Gender column is tolerated, so re-running this cell produces the same
# X instead of duplicating columns and raising a KeyError.
X = pd.concat(
    [X.drop(columns=list(X_dummies.columns), errors="ignore"), X_dummies],
    axis=1,
)
X = X.drop(columns=["Gender"], errors="ignore")

#### Split the data into training and test sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1111)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Preview the first five rows of the held-out feature matrix.
X_test.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Gender_Male
293,81.61,33,62667.51,228.76,1
123,37.75,36,35466.8,225.24,0
939,60.83,19,40478.83,185.46,0
38,50.43,46,57425.87,119.32,1
831,42.83,34,54324.73,132.38,0


#### Fit the Logistic Regression Model

In [8]:
from sklearn.linear_model import LogisticRegression

# An L1 penalty needs a solver that supports it. scikit-learn's default solver
# ("lbfgs" since version 0.22) rejects penalty="l1" with a ValueError, so
# "liblinear" is requested explicitly. random_state fixes the seed liblinear
# uses internally so repeated fits give the same coefficients.
log_model = LogisticRegression(penalty="l1", solver="liblinear", random_state=1111)
log_model.fit(X_train, y_train)

# Tabulate the fitted intercept followed by one coefficient per feature,
# labelled with the columns of the frame the model was trained on.
coef = np.append(log_model.intercept_, log_model.coef_)
col_names = np.append("Intercept", X_train.columns)
pd.DataFrame(coef, index=col_names, columns=["Coefficients"])

Unnamed: 0,Coefficients
Intercept,18.532499
Daily Time Spent on Site,-0.149633
Age,0.164214
Area Income,-9.6e-05
Daily Internet Usage,-0.04658
Gender_Male,0.5618


#### Predict on Test Set

In [9]:
# Predicted labels and per-class probabilities for the held-out rows.
y_pred = log_model.predict(X_test)
y_prob = log_model.predict_proba(X_test)

# Test features next to the actual label, the predicted label, and the
# probability from column 1 of predict_proba rounded to two decimals.
pred_summary = X_test.assign(
    **{
        y.name: y_test,
        "y_pred": y_pred,
        "y_prob": np.round(y_prob[:, 1], 2),
    }
)
pred_summary.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Gender_Male,Clicked on Ad,y_pred,y_prob
293,81.61,33,62667.51,228.76,1,No,No,0.01
123,37.75,36,35466.8,225.24,0,Yes,Yes,0.99
939,60.83,19,40478.83,185.46,0,No,Yes,0.5
38,50.43,46,57425.87,119.32,1,Yes,Yes,1.0
831,42.83,34,54324.73,132.38,0,Yes,Yes,1.0


#### Evaluate the Logistic Regression Model

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Sorted set of labels that occur in either the true or the predicted values;
# passed explicitly so the matrix rows/columns and their captions stay in sync.
class_labels = np.union1d(y_test, y_pred)

# Label both axes so the table is readable on its own
# (rows = actual class, columns = predicted class).
conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=pd.Index(class_labels, name="Actual"),
    columns=pd.Index(class_labels, name="Predicted"),
)

print("Logistic Regression Model (L1 Regularization)", "\n")
print(conf_matrix, "\n")
# Accuracy gets its own line instead of trailing the last row of the matrix.
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3), "\n")
print(classification_report(y_test, y_pred))

Logistic Regression Model (L1 Regularization) 

     0    1
0  147    3
1    6  144       Accuracy: 0.97 

             precision    recall  f1-score   support

         No       0.96      0.98      0.97       150
        Yes       0.98      0.96      0.97       150

avg / total       0.97      0.97      0.97       300

