# Logistic Regression

#### 1. Load libraries

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
import statistics
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score


#### 2. Load in data and Select Model Features

In [2]:
# Download the churn dataset, keep only the columns used in this notebook, and
# cast the outcome column to an integer dtype in one chained expression.
# Chaining avoids assigning into a column-subset frame (`df.churn = ...`),
# which can trigger pandas' SettingWithCopyWarning.
data_path = 'https://raw.githubusercontent.com/kvinlazy/Dataset/master/ChurnData.csv'
df = (
    pd.read_csv(data_path)
    .loc[:, ['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless', 'churn']]
    .astype({'churn': 'int'})  # Need to coerce outcome variable to numeric
)

In [3]:
# Summarize the frame: row/column counts followed by the column names.
df_names = list(df.columns)
size_summary = f'There are {len(df)} rows and {len(df.columns)} columns in the data.'
names_summary = f'The columns are named: {", ".join(df_names)}'
print(size_summary, names_summary)


There are 200 rows and 10 columns in the data. The columns are named: tenure, age, address, income, ed, employ, equip, callcard, wireless, churn


#### 3. Data preprocessing

We'll need to select our target and features, as well as standardize our data.

In [4]:
# Feature matrix: the seven predictor columns as a 2-D NumPy array.
X = df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']].to_numpy()
# Target vector: the churn outcome as a 1-D NumPy array.
y = df['churn'].to_numpy()

In [5]:
# Standardize each feature column to zero mean and unit variance.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. For a strict evaluation, fit the scaler on the training split only.
X = preprocessing.StandardScaler().fit_transform(X)

# Check every feature column individually: a pooled mean/std over all values
# can look correct even when a single column is badly scaled.
column_means = X.mean(axis=0)
column_stds = X.std(axis=0)
assert np.allclose(column_means, 0.0), 'feature means are not ~0 after scaling'
# A constant column is left at 0 by StandardScaler, so a std of 0 is accepted too.
assert np.all(np.isclose(column_stds, 1.0) | np.isclose(column_stds, 0.0)), \
    'feature standard deviations are not ~1 after scaling'

# Pooled summary over all values in row-major order (sample standard
# deviation, ddof=1, matching what statistics.stdev reports).
flattened = X.ravel()
print(
    f'The mean is {np.round(np.mean(flattened), 1)} and the standard deviation is',
    f' {np.round(np.std(flattened, ddof=1), 1)}'
)

The mean is -0.0 and the standard deviation is  1.0


#### 4. Modeling

Split the data into training and testing sets.

In [6]:
# Hold out part of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (160, 7) (160,)
Test set: (40, 7) (40,)


Fit the model.

In [7]:
# Build the classifier with the liblinear solver and C=0.01, then fit it on
# the training split only.
logit = LogisticRegression(solver='liblinear', C=0.01)
logit.fit(X_train, y_train)

Make predictions.

In [8]:
# Predicted class label for every held-out observation.
yhat = logit.predict(X_test)

While the above predicted the class of an observation, we can also retrieve the probability that an individual will be in each class by calling the `predict_proba` method.

In [29]:
# Per-class probabilities for every held-out observation; show the first ten rows.
yhat_prob = logit.predict_proba(X_test)
yhat_prob[:10]

array([[0.40130629, 0.59869371],
       [0.54455897, 0.45544103],
       [0.49631135, 0.50368865],
       [0.5432107 , 0.4567893 ],
       [0.50668766, 0.49331234],
       [0.51800365, 0.48199635],
       [0.70742493, 0.29257507],
       [0.60829538, 0.39170462],
       [0.49611884, 0.50388116],
       [0.37389734, 0.62610266]])

The above returns two columns because there are two possible classes. The first column gives the probability of being in class 0 and the second gives the probability of being in class 1. `sklearn` returns one column per class, ordered by class label. The probabilities in each row sum to 1.

In [20]:
# Column 0 holds the class-0 probability and column 1 the class-1 probability
# for each test row; slice the columns directly instead of looping over every
# element and dispatching on its index. Results stay plain Python lists.
class0_probabilities = list(yhat_prob[:, 0])
class1_probabilities = list(yhat_prob[:, 1])


#### 5. Model Evaluation

In [31]:
# Jaccard similarity between true and predicted labels, scored for class 0.
jaccard_score(y_test, yhat, pos_label=0)

0.6666666666666666

In [37]:
# Accuracy: fraction of test observations whose predicted class matches the true class.
(y_test == yhat).mean()

0.725

In [43]:
# Accuracy computed by explicit counting: number of correct predictions divided
# by the number of test observations. The denominator uses the label vector
# itself rather than the unrelated probability array.
np.count_nonzero(y_test == yhat) / len(y_test)

0.725