# Logistic Regression for Feature Selection

Want:
- Find an embedding for discrete labels of the shrooms data
- Use the embedding as logistic regression features to predict edible?
- Use L1 penalty
- Select nonzero coeffs after gradient descent for logistic regression

In [21]:
import numpy as np
import sklearn as sk
import sklearn.preprocessing
import sklearn.metrics 
import matplotlib.pyplot as plt

with open("scripts/data/mushrooms.csv", "r") as f_in:
    data = np.array([ line.split(",") for line in f_in.read().split("\n")])
    labels = data[0]
    data = data[1:]
    x_cat,y_cat = data[:, 1:], data[:, 0]

In [2]:

y = (y_cat == "e")
enc = sk.preprocessing.OneHotEncoder()
enc.fit(x_cat)
x = enc.transform(x_cat).toarray()

n, d = x.shape

num_train = int(n * 0.8)

x_train, x_test = x[:num_train], x[num_train:]
y_train, y_test = y[:num_train], y[num_train:]

In [3]:
x_ = np.copy(x_train)
x_[y_train==0] = -x_[y_train==0]

def logistic(x):
    return 1 / (1 + np.exp(-x))

def grad(theta, lmbda):
    return np.sum((1 - logistic(x_ @ theta)) * x_, axis=0)[:, None] - lmbda * np.sign(theta)

itrs = 1000
lr = 0.001
l1_penalty = 10

theta = np.random.normal(size=(d, 1), scale=0.1)
for i in range(itrs):
    theta = theta + lr * grad(theta, l1_penalty)

In [17]:
def evaluate(theta_thresh=0):
    theta_thresh = theta * (theta > theta_thresh)
    y_prob_thresh = logistic(x_test @ theta_thresh)
    y_est_thresh = (y_prob_thresh > 0.5)
    conf = sk.metrics.confusion_matrix(y_test, y_est_thresh)
    print("Confusion Matrix:")
    print(conf)
    print(f"Accuracy: {np.sum(np.diag(conf)) / np.sum(conf)}")
    print(f"Number of features: {np.sum(theta_thresh != 0)}")
    print(f"Important feature indices: {np.where(theta_thresh.flatten() != 0)}")
    return theta_thresh, np.where(theta_thresh.flatten() != 0)[0]

In [18]:
theta_thresh, features = evaluate(theta_thresh=1)

Confusion Matrix:
[[1110    8]
 [   0  507]]
Accuracy: 0.9950769230769231
Number of features: 3
Important feature indices: (array([22, 25, 27]),)


In [19]:
fs = [np.zeros((1,d)) for _ in range(len(features))]
for i, feat in enumerate(features):
    fs[i][0, feat] = 1
    print(enc.inverse_transform(fs[i]))
print(enc.inverse_transform(np.zeros((1,d))))

[['b' 'f' 'b' 'f' 'a' 'a' 'c' 'b' 'b' 'e' '?' 'f' 'f' 'b' 'b' 'p' 'n' 'n'
  'e' 'b' 'a' 'd']]
[['b' 'f' 'b' 'f' 'l' 'a' 'c' 'b' 'b' 'e' '?' 'f' 'f' 'b' 'b' 'p' 'n' 'n'
  'e' 'b' 'a' 'd']]
[['b' 'f' 'b' 'f' 'n' 'a' 'c' 'b' 'b' 'e' '?' 'f' 'f' 'b' 'b' 'p' 'n' 'n'
  'e' 'b' 'a' 'd']]
[['b' 'f' 'b' 'f' 'a' 'a' 'c' 'b' 'b' 'e' '?' 'f' 'f' 'b' 'b' 'p' 'n' 'n'
  'e' 'b' 'a' 'd']]


In [22]:
print(labels[4])

bruises


In [26]:
print(np.unique(data[:, 5]))

['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']


In [27]:
print(labels[5])

odor


In [31]:
print(y[x_cat[:, 5] == "a"])

[ True  True  True  True  True  True  True False  True  True  True  True
  True  True False  True  True  True  True  True  True  True  True  True
  True  True False  True  True False False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True False  True  True  True  True  True  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True False  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True False  True  True  True  T

In [33]:
print(y[x_cat[:, 5] == "l"])

[]
