# Linear Classification

## Fisher's linear discriminant with equal class covariance

In [4]:
%matplotlib inline
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import linear_model 

## Linear discriminant analysis (LDA)

In [5]:
# Dataset
n_samples, n_features = 100, 2
mean0, mean1 = np.array([0, 0]), np.array([0, 2])
Cov = np.array([[1, .8],[.8, 1]])

np.random.seed(42)

X0 = np.random.multivariate_normal(mean0, Cov, n_samples)
X1 = np.random.multivariate_normal(mean1, Cov, n_samples)
X = np.vstack([X0, X1])
y = np.array([0] * X0.shape[0] + [1] * X1.shape[0])

# LDA with scikit-learn
lda = LDA()
proj = lda.fit(X, y).transform(X)
y_pred_lda = lda.predict(X)
errors = y_pred_lda != y
print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred_lda)))

Nb errors=10, error rate=0.05


## Logistic regression

In [6]:
logreg = linear_model.LogisticRegression(C=1e8, solver='lbfgs')
# This class implements regularized logistic regression. C is the Inverse of regularization strength.
# Large value => no regularization.
logreg.fit(X, y)
y_pred_logreg = logreg.predict(X)
errors = y_pred_logreg != y
print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred_logreg)))
print(logreg.coef_)

Nb errors=10, error rate=0.05
[[-5.1516729   5.57303883]]


## Overfitting


## Ridge Fisher's linear classification ($l_2$-regularization)

## Ridge logistic regression ($l_2$-regularization)

In [7]:
# Dataset
# Build a classification task using 3 informative features
from sklearn import datasets
X, y = datasets.make_classification(n_samples=100,n_features=20, n_informative=3, n_redundant=0, 
                                    n_repeated=0, n_classes=2, random_state=0, shuffle=False)

In [8]:
lr = linear_model.LogisticRegression(C=1, solver='lbfgs')
# This class implements regularized logistic regression. C is the Inverse of regularization strength.
# Large value => no regularization.

lr.fit(X, y)
y_pred_lr = lr.predict(X)
errors = y_pred_lr != y

print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y)))
print(lr.coef_)

Nb errors=26, error rate=0.26
[[-0.12899737  0.7579822  -0.01228473 -0.11412421  0.25491221  0.4329847
   0.14564739  0.16763962  0.85071394  0.02116803 -0.1611039  -0.0146019
  -0.03399884  0.43127728 -0.05831644 -0.0812323   0.15877844  0.29387389
   0.54659524  0.03376169]]


## Lasso logistic regression ($l_1$-regularization)

In [9]:
lrl1 = linear_model.LogisticRegression(penalty='l1', solver='saga')
# This class implements regularized logistic regression. C is the Inverse of regularization strength.
# Large value => no regularization.

lrl1.fit(X, y)
y_pred_lrl1 = lrl1.predict(X)
errors = y_pred_lrl1 != y

print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred_lrl1)))
print(lrl1.coef_)

Nb errors=25, error rate=0.25
[[-0.12618164  0.71723918  0.         -0.00534429  0.2042112   0.38247243
   0.07004365  0.06301009  0.76849544  0.         -0.10630664  0.
   0.          0.34015127  0.          0.          0.08043682  0.19202706
   0.47989086  0.        ]]


## Ridge linear SVM ($l_2$-regularization)

In [10]:
from sklearn import svm
svmlin = svm.LinearSVC()
# Remark: by default LinearSVC uses squared_hinge as loss
svmlin.fit(X, y)
y_pred_svmlin = svmlin.predict(X)
errors = y_pred_svmlin != y
print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred_svmlin)))
print(svmlin.coef_)

Nb errors=26, error rate=0.26
[[-0.056121    0.31189589  0.00271961 -0.05149163  0.09940419  0.17726429
   0.06519484  0.08921394  0.3533912   0.00601045 -0.06201172 -0.00741169
  -0.02156861  0.18272446 -0.02162812 -0.04061099  0.07204358  0.13083485
   0.23721453  0.0082412 ]]




## Lasso SVM ($l_1$-regularization)

In [11]:
svmlinl1 = svm.LinearSVC(penalty='l1', dual=False)
# Remark: by default LinearSVC uses squared_hinge as loss
svmlinl1.fit(X, y)
y_pred_svmlinl1 = svmlinl1.predict(X)
errors = y_pred_svmlinl1 != y
print("Nb errors=%i, error rate=%.2f" % (errors.sum(), errors.sum() / len(y_pred_svmlinl1)))
print(svmlinl1.coef_)

Nb errors=26, error rate=0.26
[[-0.0533391   0.29934574  0.         -0.03541637  0.09261429  0.16763337
   0.05808022  0.07587758  0.3406516   0.         -0.0555901  -0.00194174
  -0.01312517  0.16866053 -0.01450499 -0.02500558  0.06073932  0.11738939
   0.22485446  0.00473282]]


## Elastic-net classification ($l_1$-$l_2$-regularization)

In [12]:
from sklearn import linear_model as lm
X, y = datasets.make_classification(n_samples=100, n_features=20, n_informative=3, n_redundant=0, 
                                    n_repeated=0, n_classes=2, random_state=0, shuffle=False)
enetloglike = lm.SGDClassifier(loss="log", penalty="elasticnet", alpha=0.0001, l1_ratio=0.15, 
                               class_weight='balanced', max_iter=1000, tol=1e-3)
enetloglike.fit(X, y)
enethinge = lm.SGDClassifier(loss="hinge", penalty="elasticnet", alpha=0.0001, l1_ratio=0.15, 
                             class_weight='balanced')
enethinge.fit(X, y)



SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

## Metrics for classification performance evaluation

In [13]:
from sklearn import metrics
y_pred = [0, 1, 0, 0]
y_true = [0, 1, 0, 1]
metrics.accuracy_score(y_true, y_pred)

# The overall precision an recall
metrics.precision_score(y_true, y_pred)
metrics.recall_score(y_true, y_pred) 

# Recalls on individual classes: SEN & SPC
recalls = metrics.recall_score(y_true, y_pred, average=None)
recalls[0] # is the recall of class 0: specificity
recalls[1] # is the recall of class 1: specificity

# Balanced accuracy
b_acc = recalls.mean()
# The overall precision an recall on each individual class
p, r, f, s = metrics.precision_recall_fscore_support(y_true, y_pred)

In [14]:
score_pred = np.array([.1 ,.2, .3, .4, .5, .6, .7, .8])
y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
thres = .9

y_pred = (score_pred > thres).astype(int)
print("Predictions:", y_pred)
metrics.accuracy_score(y_true, y_pred)

# The overall precision an recall on each individual class
p, r, f, s = metrics.precision_recall_fscore_support(y_true, y_pred)
print("Recalls:", r)

# 100% of specificity, 0% of sensitivity
# However AUC=1 indicating a perfect separation of the two classes
auc = metrics.roc_auc_score(y_true, score_pred)
print("AUC:", auc)

Predictions: [0 0 0 0 0 0 0 0]
Recalls: [1. 0.]
AUC: 1.0


  'precision', 'predicted', average, warn_for)


## Imbalenced classes

In [15]:
# dataset
X, y = datasets.make_classification(n_samples=500, n_features=5, n_informative=2, n_redundant=0, 
                                    n_repeated=0, n_classes=2, random_state=1, shuffle=False)

print(*["#samples of class %i = %i;" % (lev, np.sum(y == lev)) for lev in np.unique(y)])

print('# No Reweighting balanced dataset')
lr_inter = linear_model.LogisticRegression(C=1, solver='lbfgs')
lr_inter.fit(X, y)
p, r, f, s = metrics.precision_recall_fscore_support(y, lr_inter.predict(X))

print("SPC: %.3f; SEN: %.3f" % tuple(r))
print('# => The predictions are balanced in sensitivity and specificity\n')

# Create imbalanced dataset, by subsampling sample of class 0: keep only 10% of
# class 0's samples and all class 1's samples.
n0 = int(np.rint(np.sum(y == 0) / 20))
subsample_idx = np.concatenate((np.where(y == 0)[0][:n0], np.where(y == 1)[0]))

Ximb = X[subsample_idx, :]
yimb = y[subsample_idx]
print(*["#samples of class %i = %i;" % (lev, np.sum(yimb == lev)) for lev in np.unique(yimb)])

print('# No Reweighting on imbalanced dataset')
lr_inter = linear_model.LogisticRegression(C=1, solver='lbfgs')
lr_inter.fit(Ximb, yimb)
p, r, f, s = metrics.precision_recall_fscore_support(yimb, lr_inter.predict(Ximb))

print("SPC: %.3f; SEN: %.3f" % tuple(r))
print('# => Sensitivity >> specificity\n')
print('# Reweighting on imbalanced dataset')

lr_inter_reweight = linear_model.LogisticRegression(C=1,  class_weight="balanced", solver='lbfgs') 
lr_inter_reweight.fit(Ximb, yimb)
p, r, f, s = metrics.precision_recall_fscore_support(yimb, lr_inter_reweight.predict(Ximb))

print("SPC: %.3f; SEN: %.3f" % tuple(r))
print('# => The predictions are balanced in sensitivity and specificity\n')

#samples of class 0 = 250; #samples of class 1 = 250;
# No Reweighting balanced dataset
SPC: 0.940; SEN: 0.928
# => The predictions are balanced in sensitivity and specificity

#samples of class 0 = 12; #samples of class 1 = 250;
# No Reweighting on imbalanced dataset
SPC: 0.750; SEN: 0.996
# => Sensitivity >> specificity

# Reweighting on imbalanced dataset
SPC: 1.000; SEN: 0.980
# => The predictions are balanced in sensitivity and specificity

