# Logistic Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt

### 1. tanh
$$
y = \tanh(x) = \frac{\sinh(x)}{\cosh(x)} = \frac{e^{2x} - 1}{e^{2x} + 1}
$$

$x \in (-\infty, +\infty)$, $y \in (-1, 1)$ (the bounds $\pm 1$ are approached asymptotically but never reached)

In [None]:
# Sample tanh at 12 evenly spaced points covering [-pi, pi].
in_array = np.linspace(-np.pi, np.pi, num=12)
out_array = np.tanh(in_array)

print("in_array : ", in_array)
print("\nout_array : ", out_array)

# Red line with a circular marker on every sampled point.
plt.plot(in_array, out_array, marker="o", color="red")
plt.gca().set(title="numpy.tanh()", xlabel="X", ylabel="Y")
plt.show()

### 2. sigmoid

$$
y = \sigma(x) = \frac{1}{1 + e^{-x}}
$$

$x \in (-\infty, +\infty)$, $y \in (0, 1)$ (the bounds $0$ and $1$ are approached asymptotically but never reached)

In [None]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid with the same shape as ``x``; a NumPy scalar
        when ``x`` is a scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) is never larger than 1, so it cannot overflow; the naive
    # exp(-x) overflows (RuntimeWarning) for large negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is.
    return result[()]

In [None]:
# Evaluate the sigmoid on 25 evenly spaced points in [-5, 5].
in_array = np.linspace(-5, 5, num=25)
out_array = sigmoid(in_array)

print("in_array : ", in_array)
print("\nout_array : ", out_array)

# Red line with a circular marker on every sampled point.
plt.plot(in_array, out_array, marker="o", color="red")
plt.gca().set(title="sigmoid", xlabel="X", ylabel="Y")
plt.show()

### 3. Log of odds

In [None]:
# 50 probabilities strictly inside (0, 1): the end points are excluded because
# the odds p / (1 - p) divide by zero at p = 1 and log(0) diverges at p = 0.
p = np.linspace(0.00001, 0.999999, num=50)
odds_ratio = p / (1 - p)  # odds of the event: p against (1 - p)
log_odds = np.log(odds_ratio)

print("in_array : ", p)
print("\nout_array : ", log_odds)

# Red line with a circular marker on every sampled probability.
plt.plot(p, log_odds, marker="o", color="red")
plt.gca().set(title="Log Odds", xlabel="Probability", ylabel="Log of odds")
plt.show()

$$
\log \biggl(\frac{p}{1-p} \biggr) = w_1x_1 + w_2x_2 + \dots + w_nx_n + b = w^Tx + b
$$
Exponentiating both sides and solving for $p$, we get
$$
p = \frac{1}{1+e^{-(w^Tx+b)}} = \sigma(w^Tx+b)
$$

In [None]:
from sklearn.datasets import fetch_openml

# Download version 1 of the "titanic" dataset from OpenML. With
# return_X_y=True the result is a (features, target) pair, and
# as_frame=True requests pandas objects rather than NumPy arrays.
titanic = fetch_openml(
    "titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [None]:
# `titanic` is the (features, target) pair returned by fetch_openml.
# assign() builds a NEW frame with the target appended as the last column,
# instead of adding the column to titanic[0] itself. In-place edits made to
# df_titanic therefore never alter the fetched data, and this cell can be
# re-run at any time to get the pristine frame back.
df_titanic = titanic[0].assign(survived=titanic[1])
df_titanic.head()

In [None]:
# Column dtypes and non-null counts -- shows which features have missing values.
df_titanic.info()

In [None]:
# Report how many distinct values each of these columns holds.
for feature in ("cabin", "body", "boat"):
    print(f"Unique values for feature {feature} {df_titanic[feature].nunique()}")

In [None]:
# Columns removed before modelling; the frame is modified in place.
unused_columns = ["name", "ticket", "cabin", "body", "home.dest", "boat"]
df_titanic.drop(columns=unused_columns, inplace=True)
df_titanic.head()

In [None]:
# Remove every passenger whose port of embarkation or fare is missing.
# dropna() selects the rows itself, so there is no mix-up between row
# positions and index labels: np.where() yields positions, whereas
# DataFrame.drop() expects labels, and the two stop agreeing as soon as
# one row has been removed from the frame.
df_titanic.dropna(subset=["embarked", "fare"], inplace=True)
df_titanic.shape

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X = df_titanic.iloc[:, :-1]
y = df_titanic.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Each categorical column gets its own encoder, fitted on the training split
# only and then reused unchanged on the test split. Keeping the encoders
# separate means each one still holds its own classes_ mapping afterwards.
lbl_encoder = LabelEncoder()
sex_train_encoded = lbl_encoder.fit_transform(X_train["sex"])
sex_test_encoded = lbl_encoder.transform(X_test["sex"])

lbl_encoder2 = LabelEncoder()
embark_train_encoded = lbl_encoder2.fit_transform(X_train["embarked"])
embark_test_encoded = lbl_encoder2.transform(X_test["embarked"])

# Integer-coded copies of the target labels.
tgt_encoder = LabelEncoder()
y_train_encoded = tgt_encoder.fit_transform(y_train)
y_test_encoded = tgt_encoder.transform(y_test)

In [None]:
def _stack_features(frame, sex_codes, embark_codes):
    """Build the model matrix from one split.

    Takes columns 0 and 2-5 of ``frame`` by position and places the encoded
    sex codes at position 1 and the encoded embarked codes last.
    Presumably the positional columns are pclass, age, sibsp, parch and
    fare -- verify against the column order of ``frame``.
    """
    return np.hstack((
        frame.iloc[:, 0:1].to_numpy(),
        sex_codes.reshape(-1, 1),
        frame.iloc[:, 2:6].to_numpy(),
        embark_codes.reshape(-1, 1),
    ))


X_train_new = _stack_features(X_train, sex_train_encoded, embark_train_encoded)
X_test_new = _stack_features(X_test, sex_test_encoded, embark_test_encoded)

In [None]:
from sklearn.impute import KNNImputer

# Fill the remaining missing values from the 7 nearest rows, weighted equally.
# The imputer is fitted on the training matrix only.
imputer = KNNImputer(
    n_neighbors=7,
    weights='uniform',
    metric='nan_euclidean',
)
X_train_imputed = imputer.fit_transform(X_train_new)
X_train_imputed[:5]

In [None]:
# Reuse the imputer fitted on the training matrix (transform only, no refit).
X_test_imputed = imputer.transform(X_test_new)
# Preview the first five imputed rows.
X_test_imputed[0:5]

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with every hyper-parameter left at its default.
# NOTE(review): the following cell refits with max_iter=1000, so presumably
# the default iteration budget is too small for these unscaled features --
# confirm from this cell's output.
clf = LogisticRegression()
clf.fit(X_train_imputed, y_train)

In [None]:
# Refit, allowing the solver up to 1000 iterations.
MAX_ITER = 1000
clf = LogisticRegression(max_iter=MAX_ITER)
clf.fit(X_train_imputed, y_train)

In [None]:
# score() returns the mean accuracy of the fitted classifier on the given split.
train_accuracy = clf.score(X_train_imputed, y_train)
test_accuracy = clf.score(X_test_imputed, y_test)

print(f'Train accuracy: {train_accuracy * 100:.3f}%')
print(f'Test accuracy: {test_accuracy * 100:.3f}%')

In [None]:
# Number of iterations the solver actually ran during the last fit.
print(clf.n_iter_)

In [None]:
# Learned weights (one per input feature) followed by the bias term --
# the w and b of sigma(w^T x + b).
print(clf.coef_)
print(clf.intercept_)

**Solvers**

https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-definitions/52388406#52388406

In [None]:
from sklearn.model_selection import GridSearchCV

import warnings
# Silences ALL warnings for the rest of the session, not only those
# raised while the grid search below is running.
warnings.filterwarnings("ignore")

# Compare the available solvers with 5-fold cross-validation on the training data.
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
params = {"solver": solver_list}
log_reg = LogisticRegression(C=1, n_jobs=-1, random_state=34)
clf = GridSearchCV(log_reg, params, cv=5)
clf.fit(X_train_imputed, y_train)
scores = clf.cv_results_['mean_test_score']

# One mean CV score per candidate, listed in the same order as solver_list.
for solver, score in zip(solver_list, scores):
    print(f"  {solver} {score:.3f}" )

**multiclass**

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine

In [None]:
# Load the wine dataset once and take features / labels from the same object.
rd = load_wine()
X, y = rd.data, rd.target

# Features as named columns, with the class label appended as the last column.
df = pd.DataFrame(X, columns=rd.feature_names).assign(target=y)
df.head()

In [None]:
# All columns but the last are features; the last column is the class label.
# test_size is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    random_state=34,
)

In [None]:

# Feature-to-feature correlations on the training split only.
train_corr = X_train.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# the lower triangle already carries every pairwise value.
mask = np.triu(np.ones(train_corr.shape, dtype=bool))

plt.figure(figsize=(10, 10))
plt.title("Wine Feature Correlation Matrix", fontsize=40)
x = sns.heatmap(
    train_corr,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    linewidths=.5,
)

In [None]:
# Multiclass logistic regression on the raw (unscaled) wine features.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
logistic_regression_model = LogisticRegression(
    C=1,
    solver='lbfgs',
    multi_class="auto",
    n_jobs=-1,
    random_state=34,
)
logistic_regression_model.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out wine test split.
# NOTE(review): this name would shadow sklearn.metrics.accuracy_score if that
# function were ever imported into the notebook -- consider renaming.
accuracy_score = logistic_regression_model.score(X_test, y_test)
accuracy_score

In [None]:
# Set up a 5-fold grid search over the solver only; nothing is fitted yet.
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
parameters = {"solver": solver_list}
lr = LogisticRegression(
    C=1,
    multi_class="auto",
    n_jobs=-1,
    random_state=34,
)
clf = GridSearchCV(lr, parameters, cv=5)

In [None]:
# Run the cross-validated search on the training split.
clf.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of each candidate in the grid.
clf.cv_results_['mean_test_score']

In [None]:
# Pair each solver with its mean CV score; the scores follow solver_list's order.
scores = clf.cv_results_['mean_test_score']
for solver, score in zip(solver_list, scores):
    print(f"{solver}: {score:.3f}")

In [None]:
# One bar per solver, bar height = mean cross-validated score.
sns.barplot(
    x=solver_list,
    y=scores,
).set_title("Wine Accuracy with Unscaled Features")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply the
# same scaling to both splits. X_train and X_test are rebound to the scaled arrays.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Same 5-fold solver search as before, now on whatever X_train currently holds.
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
parameters = {"solver": solver_list}
lr = LogisticRegression(
    C=1,
    multi_class="auto",
    n_jobs=-1,
    random_state=34,
)
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(X_train, y_train)

In [None]:
# Pair each solver with its mean CV score; the scores follow solver_list's order.
scores = clf.cv_results_['mean_test_score']
for solver, score in zip(solver_list, scores):
    print(f"{solver}: {score:.3f}")

In [None]:
# Keep the Axes returned by seaborn in `ax`. Chaining .set_title() onto the
# barplot call would bind `ax` to the title's Text object instead.
ax = sns.barplot(x=solver_list, y=scores)
# Numeric font size in points; the trailing ';' suppresses the Text repr.
ax.set_title("Wine Accuracy with Scaled Features", fontsize=20);

**One versus Rest and One Versus One**

https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/

When to use which
1. When data is imbalanced, prefer OVO

scikit-learn's SVM classifiers likewise support both the OVO and OVR strategies.