In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
# Load the Social Network Ads CSV into a pandas DataFrame (df)
df = pd.read_csv(filepath_or_buffer='Social_Network_Ads.csv')
# preview the first five rows
df.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


## Data preparation

In [4]:
# Convert the pandas DataFrame into NumPy arrays for scikit-learn.
# Feature columns by position: 1 = Gender, 2 = Age, 3 = EstimatedSalary.
# Target column: 4 = Purchased. Column 0 (User ID) is an identifier and is not used.

# NOTE! run this cell as a whole: x is rebuilt from df and then encoded 'in-one-go'
x = df.iloc[:, 1:4].values
y = df.iloc[:, 4].values

# import encoders from scikit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# encode gender labels to integer codes (classes are sorted: Female -> 0, Male -> 1)
gender_encoder = LabelEncoder()
x[:, 0] = gender_encoder.fit_transform(x[:, 0])

# One-hot encode the gender column (position 0) and pass Age / EstimatedSalary through.
# OneHotEncoder's `categorical_features` argument was deprecated in scikit-learn 0.20
# and removed in 0.22, so the column selection is done with ColumnTransformer instead.
# Output column order is unchanged: [gender dummies..., Age, EstimatedSalary].
gender_boolean = ColumnTransformer(
    transformers=[('gender', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array, so no .toarray() is needed
)
# execute conversion; x is an object array at this point (mixed column types),
# so cast the result to float explicitly for the downstream estimators
x = gender_boolean.fit_transform(x).astype(float)

In [12]:
# Sanity-check the encoding by viewing the feature matrix as a DataFrame
# (columns 0-1: gender dummies, 2: age, 3: estimated salary)
df_x = pd.DataFrame(data=x)
df_x.head(n=5)

Unnamed: 0,0,1,2,3
0,0.0,1.0,19.0,19000.0
1,0.0,1.0,35.0,20000.0
2,1.0,0.0,26.0,43000.0
3,1.0,0.0,27.0,57000.0
4,0.0,1.0,19.0,76000.0


In [6]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Feature scaling - standardize every column to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scale_x = StandardScaler()
# learn mean / standard deviation from the training rows only ...
scale_x.fit(x_train)
# ... then apply that same transformation to both the training and the test rows
x_train = scale_x.transform(x_train)
x_test = scale_x.transform(x_test)

In [17]:
# Inspect the scaled training features as a DataFrame
df_x = pd.DataFrame(data=x_train)
df_x.head(n=5)

Unnamed: 0,0,1,2,3
0,0.980196,-0.980196,0.581649,-0.886707
1,-1.020204,1.020204,-0.606738,1.461738
2,0.980196,-0.980196,-0.012544,-0.567782
3,0.980196,-0.980196,-0.606738,1.896635
4,0.980196,-0.980196,1.373907,-1.408584


In [18]:
# Inspect the training labels (y_train) as a DataFrame
df_y = pd.DataFrame(data=y_train)
df_y.head(n=5)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1


## Logistic regression model
We fit the training set to a logistic regression model, using the `LogisticRegression` class from scikit-learn's `linear_model` module. The class accepts a number of parameters; in this example we set only `random_state`, so that the same results are reproduced on each execution.

In [20]:
from sklearn.linear_model import LogisticRegression
# Build the logistic regression model; random_state is fixed for reproducible results
classifier = LogisticRegression(random_state=0)
# Train on the scaled training set (fit returns the estimator, which the cell displays)
classifier.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Predict the labels of the held-out test rows
y_pred = classifier.predict(x_test)
# Put predicted (column 0) and actual (column 1) labels side by side and summarize them
y_df = pd.DataFrame(data=[y_pred, y_test]).transpose()
y_df.describe()

Unnamed: 0,0,1
count,100.0,100.0
mean,0.29,0.32
std,0.456048,0.468826
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [32]:
# Evaluate the predictions against the actual test labels with a confusion matrix
from sklearn.metrics import confusion_matrix
# rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# the diagonal holds the correct predictions: here 65 + 26 out of 100
cm

array([[65,  3],
       [ 6, 26]])