# Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interactive

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import add_dummy_feature

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

from sklearn.datasets import load_iris

from sklearn.tree import plot_tree

In [None]:
# One seeded RandomState shared by every cell that needs randomness (the
# train/test splits, the scikit-learn estimators and the hand-written SGD).
# NOTE(review): because the same generator object is consumed by each of them
# in turn, results depend on the order in which the cells are run - restart and
# run from the top to reproduce the numbers quoted in the text.
rng = np.random.RandomState(2)

## Logistic Regression

### Read in students dataset and split it

In [None]:
import os

# Locate the datasets folder. Outside Colab it sits next to the notebook;
# inside Colab, Google Drive has to be mounted first and the folder is looked
# up under the Colab Notebooks directory.
if 'google.colab' not in str(get_ipython()):
    base_dir = "."
else:
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the students dataset from the datasets folder located above.
df = pd.read_csv(os.path.join(dataset_dir, "cs1109.csv"))

In [None]:
# The two columns used as input features.
features = ["lect", "lab"]

X = df[features]

# Encode the "outcome" column as integers; the fitted encoder is kept so that
# predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["outcome"])

In [None]:
# Show which original outcome labels the encoded values 0 and 1 stand for.
label_encoder.inverse_transform([0, 1])

In [None]:
# Hold out 20% of the examples as a test set, stratifying on the outcome column.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=df["outcome"], random_state=rng)

### What is Logistic Regression trying to do?

In [None]:
def show_linear_model(b, w):
    """Plot the training examples together with the line lab = b + w * lect.

    b: intercept of the line.
    w: slope of the line.

    Reads the module-level X_train and y_train. Both axes are fixed to the
    range 0..100 so the picture does not rescale as b and w change.
    """
    fig, ax = plt.subplots()
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    # Both layers are drawn on the same explicitly-named axes rather than
    # relying on whichever axes happens to be current.
    sns.scatterplot(data=X_train, x="lect", y="lab", hue=y_train, style=y_train, ax=ax)
    xvals = np.array([0, 100])
    sns.lineplot(x=xvals, y=b + w * xvals, color='g', ax=ax)

# Build slider controls for the plot: b from 0 to 200 in steps of 2 and
# w from -10 to 10 in steps of 0.05. The bare name on the last line displays the widget.
interactive_plot = interactive(show_linear_model, b=(0, 200, 2), w=(-10,10,0.05))
interactive_plot

There is no line that perfectly separates the blue dots and orange circles - so what can we say about this dataset?

### Rolling our own

In [None]:
# Fit the scaler on the training set only, then apply that same fitted
# transformation to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Add a constant dummy feature to both sets so that the intercept can be
# learned as just another parameter. Note that this overwrites the scaled
# arrays in place, so running this cell twice adds a second dummy column.
X_train_scaled = add_dummy_feature(X_train_scaled)
X_test_scaled = add_dummy_feature(X_test_scaled)

In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    z: a scalar or NumPy array.

    The value is computed as exp(-log(1 + exp(-z))) using np.logaddexp, which
    is mathematically the same as the textbook formula but does not overflow
    in exp(-z) when z is a large negative number.
    """
    return np.exp(-np.logaddexp(0, -z))

In [None]:
def J(X, y, params):
    """Return the mean cross-entropy loss of a logistic regression model.

    X: feature matrix, one row per example (including any dummy feature).
    y: 0/1 labels, one per row of X.
    params: parameter vector, one entry per column of X.

    With z = X.dot(params) and h = sigmoid(z), the loss
    -mean(y * log(h) + (1 - y) * log(1 - h)) simplifies to
    mean(log(1 + exp(z)) - y * z). It is evaluated in that form so the result
    stays finite even when h would round to exactly 0 or 1.
    """
    z = X.dot(params)
    return np.mean(np.logaddexp(0, z) - y * z)

In [None]:
def stochastic_gradient_descent(X, y, alpha, num_epochs):
    """Fit logistic regression parameters by stochastic gradient descent.

    X: feature matrix, one row per example.
    y: labels, one per row of X.
    alpha: learning rate applied to every single-example update.
    num_epochs: number of passes over the examples.

    Returns (params, Jvals): the learned parameter vector and the loss J on
    the whole of X and y recorded after each individual update.

    Uses the module-level rng for the initial parameters and for the order in
    which examples are visited in each epoch.
    """
    num_examples, num_features = X.shape
    params = rng.standard_normal(num_features)
    Jvals = np.zeros(num_epochs * num_examples)

    step = 0
    for _ in range(num_epochs):
        # Visit every example once per epoch, in a fresh random order.
        for i in rng.permutation(num_examples):
            # Slice rather than index so the example keeps its row dimension.
            row = slice(i, i + 1)
            x_i, y_i = X[row], y[row]
            error = sigmoid(x_i.dot(params)) - y_i
            params -= alpha * x_i.T.dot(error)
            Jvals[step] = J(X, y, params)
            step += 1

    return params, Jvals

In [None]:
# Run the Stochastic Gradient Descent on the scaled training data
# (the version that already includes the dummy feature).
params, Jvals = stochastic_gradient_descent(X_train_scaled, y_train, alpha = 0.003, num_epochs = 500)

In [None]:
def plot_loss(Jvals):
    """Scatter-plot recorded loss values against their 1-based iteration number.

    Jvals: NumPy array of loss values in the order they were recorded.
    """
    num_points = Jvals.size
    iterations = np.linspace(1, num_points, num_points)

    # A wide figure, since there is typically one point per SGD update.
    plt.subplots(figsize=(20, 8))
    axes = sns.scatterplot(x=iterations, y=Jvals)
    axes.set_xlabel("Number of iterations")
    axes.set_ylabel("J")
    axes.set_title("J during learning")
    plt.show()

In [None]:
# Plot only the first 40,000 recorded loss values.
plot_loss(Jvals[:40000])

In [None]:
# Display the params it learned: one entry per column of X_train_scaled,
# i.e. the dummy feature plus the scaled "lect" and "lab" features.

params

Remember these parameter values are for scaled data.

In [None]:
# Let's make a prediction for a student whose lecture attendance was 60 and lab attendance was 25.
# The student goes into a one-row DataFrame with the training column names so
# that the already-fitted scaler can transform it.
students = pd.DataFrame({"lect": [60], "lab": [25]})

students_scaled = scaler.transform(students)

students_scaled

In [None]:
# Probability of the positive class for the student above, computed from the
# learned params and the scaled features rather than from hand-copied numbers,
# so it stays in step with whatever the training cell produced. The dummy
# feature is added first to match the layout the parameters were trained on.
# The result gets its own name so that it does not overwrite the label vector y.
p_pass = sigmoid(add_dummy_feature(students_scaled).dot(params))[0]
p_pass

So the probability of the positive class (that this student passes) is approximately 0.015. In other words, we predict they will fail!

### Logistic Regression in scikit-learn

We don't need to 'roll our own'. scikit-learn has a class that we can use. By default it is fitted with the L-BFGS solver rather than with plain Stochastic Gradient Descent (the `solver` argument selects the optimiser). It has many arguments, e.g. `max_iter` controls the maximum number of iterations the solver runs. We will discuss the penalty argument in a later lecture.

In [None]:
# Chain the scaler and the classifier so that the same scaling is applied when
# fitting and when predicting. penalty=None means no regularisation penalty is applied.
logistic_model = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", LogisticRegression(penalty=None, random_state=rng))])

Training: fit the model to the scaled data.

In [None]:
# The pipeline does the scaling itself, so it is given the unscaled training data.
logistic_model.fit(X_train, y_train)

Inference: make some predictions

In [None]:
# Three students to predict for, each given as a [lect, lab] pair.
students = pd.DataFrame([[60, 25], [80, 90], [95, 70]], columns=["lect", "lab"])

In [None]:
# Class probabilities: one row per student, one column per class.
logistic_model.predict_proba(students)

In [None]:
# Predict the encoded classes and map them back to the original outcome labels.
label_encoder.inverse_transform(logistic_model.predict(students))

Error estimation - How good is it?

In [None]:
# Accuracy on the held-out test set. accuracy_score expects the true labels
# first and the predictions second.
accuracy_score(y_test, logistic_model.predict(X_test))

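scikit-learn also has an `SGDClassifier` class, which trains a logistic regression model by Stochastic Gradient Descent when its `loss` argument is set to `"log_loss"`. We would use it instead of `LogisticRegression` if we wanted finer-grained control. For example, it allows us to set the learning rate.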

## Multiclass Classification

### Read in the iris dataset - shuffle - split

In [None]:
# We don't need to read in a csv. scikit-learn has a copy of the dataset and a function to load it.
# as_frame=True asks for the data as pandas objects rather than NumPy arrays.
iris = load_iris(as_frame=True)

In [None]:
# Peek at the first few rows.
iris.frame.head()

In [None]:
# (number of rows, number of columns)
iris.frame.shape

In [None]:
# Column names, dtypes and non-null counts.
iris.frame.info()

In [None]:
# Summary statistics for every column.
iris.frame.describe(include="all")

In [None]:
# Features and target for the iris dataset. These reuse the names X and y,
# replacing the students data from the first half of the notebook.
X = iris.data
y = iris.target

In [None]:
# Number of examples in each class.
y.value_counts()

We can see that this dataset needs shuffling:

In [None]:
# Display the target values in the order in which they are stored.
y

Happily, by default, `train_test_split` shuffles before it splits.

In [None]:
# Hold out 20% of the examples as a test set, stratifying on the class.
# This reuses the names X_train, X_test, y_train and y_test from the students data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=rng)

In [None]:
# There are four features. We'll choose two of them for our scatter plot.
# Both the colour and the marker style encode the class.
sns.scatterplot(data=X_train, x="petal length (cm)", y="petal width (cm)", hue=y_train, style=y_train)

What do you notice from the diagram?

### Decision Tree

In [None]:
# A tree limited to depth 3.
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=rng)

In [None]:
# Fit directly on the unscaled training features (no scaler in front of the
# tree, unlike the kNN and logistic regression pipelines).
decision_tree.fit(X_train, y_train)

In [None]:
# Draw the fitted tree on a wide figure, labelling the nodes with the iris
# feature names and class names.
fig = plt.figure(figsize=(20,8))
plot_tree(decision_tree, feature_names=iris.feature_names, class_names=iris.target_names, fontsize=12)
plt.show()

### kNN

In [None]:
# Scale the features, then classify using the 3 nearest neighbours.
knn = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsClassifier(n_neighbors=3))])

In [None]:
# The pipeline does the scaling itself, so it is given the unscaled training data.
knn.fit(X_train, y_train)

### Multinomial Logistic Regression

In [None]:
# The same pipeline as for the students data, now to be fitted to the iris
# training set. It reuses the name logistic_model, replacing the earlier model.
logistic_model = Pipeline([
    ("scaler", StandardScaler()),
    ("predictor", LogisticRegression(penalty=None, random_state=rng))])

In [None]:
# The pipeline does the scaling itself, so it is given the unscaled training data.
logistic_model.fit(X_train, y_train)

### Error estimation - evaluate on the test set

In [None]:
# Test-set accuracy of each model, in the order: decision tree, kNN, logistic
# regression. accuracy_score expects the true labels first and the predictions second.
[accuracy_score(y_test, model.predict(X_test)) for model in [decision_tree, knn, logistic_model]]

Of course, long before we evaluated on the test set, we should have used a validation set to choose hyperparameter values. Maybe Decision Trees and kNN would have done better if we had chosen their hyperparameters using a grid search.