# Logistic regression 
In questo notebook addestriamo un modello di Logistic Regression con dati sintetici (dati generati a scopo dimostrativo) per un problema di classificazione binaria. Dato un dataset etichettato di dati sintetici, addestriamo un modello di Logistic Regression per trovare un confine decisionale (o ```decision boundary```) lineare tra due classi.

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Corso di Algoritmi di Machine Learning per la rilevazione di attacchi informatici
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Input

# One seed drives both the dataset generator and the two splits below,
# so re-running the cell yields the same data partition
SEED = 1

# Synthetic binary problem: 1000 points with 2 informative features and one
# cluster per class; class_sep=2.0 pushes the classes apart so that a straight
# line is meant to separate them
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2.0,
    random_state=SEED,
)

# Step 1: hold out 20% of all samples as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
# Step 2: take 20% of what is left as the validation set; the rest is the training set
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=SEED
)

# Visualizziamo i campioni del dataset nel piano

In [None]:
# Scatter plot of the training samples in the feature plane, coloured by class label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='viridis', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Dataset sintetico con due classi di punti linearmente separabili')
plt.show()

# Implementazione del modello
Nella prossima cella, implementiamo il modello di Logistic Regression. 
Il modello è creato con una procedura identica a quella usata per creare una rete neurale. L'unica differenza sta nella complessità del modello. Nel caso di Logistic Regression basta una riga di codice (l'output layer `Dense` con un solo neurone e attivazione sigmoide). Nel caso di una rete neurale profonda, no.

In [None]:
# Logistic Regression model
def create_model(input_shape):
    """Build and compile a logistic-regression model as a single-layer Keras network.

    Args:
        input_shape: Either the number of input features as a scalar integer,
            or an iterable shape such as ``(n_features,)``.

    Returns:
        The compiled ``Sequential`` model: an input layer followed by one
        ``Dense`` unit with sigmoid activation, using binary cross-entropy
        loss, the Adam optimizer and the accuracy metric.
    """
    # Callers in this notebook pass a bare integer (X.shape[1]); Keras 3 only
    # accepts an iterable shape, so wrap scalars into a one-element tuple.
    if not hasattr(input_shape, "__iter__"):
        input_shape = (input_shape,)

    model = Sequential(name="logistic_regression")
    # Input layer: one input per feature column of the data
    model.add(Input(shape=input_shape))

    # Output layer: a single sigmoid unit, i.e. sigmoid(w . x + b)
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit an extra "None" line)
    model.summary()
    return model

# Model training
Nella cella seguente addestriamo il modello di Logistic Regression. L'output del processo di training mostra come l'accuracy del modello tenda a crescere ad ogni iterazione (```epoch```). Questo vuol dire che il modello sta imparando a distinguere le due classi di punti del dataset (in un caso reale, ad esempio, i flussi di traffico benigni da quelli malevoli).
Lo addestriamo per 100 epoche.

In [None]:
# Train the model
# Pass the input shape as a tuple: Keras 3 rejects a bare integer as a shape
model = create_model((X_train.shape[1],))
# Fit for 100 epochs with mini-batches of 32, evaluating on the validation split
# each epoch; the result is bound to a name so the cell does not end by
# echoing the History object's repr
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))

# Visualizziamo il decision boundary

In [None]:
# Read the trained parameters from the model's first layer: the kernel is
# indexed as [feature][unit], the bias as [unit]
weights, bias = model.layers[0].get_weights()
w1, w2, b = weights[0][0], weights[1][0], bias[0]
coefficients = [w1, w2, b]

# Show the linear expression whose zero set is the decision boundary
print("Decision boundary: " + str(b) + " + " + str(w1) + "*X1" + " + " + str(w2) + "*X2")

# Solving b + w1*X1 + w2*X2 = 0 for X2 gives X2 = slope * X1 + intercept
slope = -w1 / w2
intercept = -b / w2

# Extent of the training data along each feature; only the X1 extent is used
# to place the endpoints of the boundary line
x_min, x_max = X_train[:, 0].min(), X_train[:, 0].max()
y_min, y_max = X_train[:, 1].min(), X_train[:, 1].max()
boundary_x = [x_min, x_max]
boundary_y = [slope * x_min + intercept, slope * x_max + intercept]

# Training points coloured by label, with the boundary drawn as a dashed red line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolors='k', cmap='viridis')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.plot(boundary_x, boundary_y, color='red', linestyle='--')
ax.set_title('Decision Boundary con una linea retta')
plt.show()

In [None]:
# Build a 50x50 grid spanning the observed range of both features
xx, yy = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 50),
                     np.linspace(X[:, 1].min(), X[:, 1].max(), 50))
# Height of the plane = the model's linear output before the sigmoid:
#   z = w1*x1 + w2*x2 + b   (coefficients = [w1, w2, b])
# The previous expression negated this value and divided it by the bias,
# which does not correspond to the quantity named on the z axis.
zz = coefficients[0] * xx + coefficients[1] * yy + coefficients[2]


# Plot the plane together with the samples; each sample is drawn at a height
# equal to its class label (0 or 1)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(xx, yy, zz, color='c', alpha=0.5)
ax.scatter(X[:, 0], X[:, 1], y, c=y, edgecolors='k', cmap=plt.cm.Set1)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Output (before the sigmoid function)')
ax.set_title('Hyperplane in 3D Space')

ax.view_init(elev=5, azim=180)  # Elev: Elevation (up/down), Azim: Azimuthal (rotation around the z axis)
plt.show()

# Usiamo il modello con dei dati mai visti (test set)

In [None]:
# Predicted probabilities for the held-out test samples
test_probabilities = model.predict(X_test, batch_size=32)
# Threshold at 0.5 to obtain boolean class predictions, dropping the singleton axis
y_pred = np.squeeze(test_probabilities > 0.5)

test_f1 = f1_score(y_test, y_pred)
print("F1 Score: ", test_f1)

In [None]:
# Re-read the trained parameters so this cell does not depend on the earlier plot cell
weights, bias = model.layers[0].get_weights()
kernel_x1 = weights[0][0]
kernel_x2 = weights[1][0]
offset = bias[0]
coefficients = [kernel_x1, kernel_x2, offset]

# Print the linear expression that is zero on the decision boundary
print("Decision boundary: " + str(offset) + " + " + str(kernel_x1) + "*X1" + " + " + str(kernel_x2) + "*X2")

# Rewrite offset + kernel_x1*X1 + kernel_x2*X2 = 0 as X2 = slope * X1 + intercept
slope = -kernel_x1 / kernel_x2
intercept = -offset / kernel_x2

# Range of the test samples on each axis; the line is drawn across the X1 range
x_min, x_max = X_test[:, 0].min(), X_test[:, 0].max()
y_min, y_max = X_test[:, 1].min(), X_test[:, 1].max()
line_x = [x_min, x_max]
line_y = [slope * x_min + intercept, slope * x_max + intercept]

# Test points coloured by their true label, plus the learned boundary
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap='viridis')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.plot(line_x, line_y, color='red', linestyle='--')
ax.set_title('Decision Boundary con una linea retta')
plt.show()

# Logistic regression con features polinomiali
La Logistic Regression può essere estesa per gestire caratteristiche polinomiali incorporando termini polinomiali nello spazio delle caratteristiche. Questa tecnica è nota come Logistic Regression Polinomiale. Permette al modello di Logistic Regression di catturare le relazioni non lineari tra le caratteristiche e la variabile target.

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from keras.models import Sequential
from keras.layers import Dense

# Seed for the dataset generator and the splits
SEED = 1

# Degree of the polynomial feature expansion
GRADO_DEL_POLINOMIO = 2

# Two concentric noisy circles: the classes cannot be separated by a straight line.
# NOTE: this rebinds X, y (and, below, y_train / y_val / y_test) used by the
# linear example earlier in the notebook.
X, y = make_circles(n_samples=1000, noise=0.1, factor=0.3, random_state=SEED)

# Expand the two raw features into polynomial terms up to the chosen degree;
# the fitted transformer is kept so the same expansion can be applied to other points
poly = PolynomialFeatures(degree=GRADO_DEL_POLINOMIO)
X_poly = poly.fit_transform(X)

# Hold out 20% of the expanded samples for testing ...
X_poly_trainval, X_poly_test, y_trainval, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=SEED
)
# ... then 20% of the remainder for validation
X_poly_train, X_poly_val, y_train, y_val = train_test_split(
    X_poly_trainval, y_trainval, test_size=0.2, random_state=SEED
)

In [None]:
# Scatter plot of the whole two-circle dataset in the raw feature plane, coloured by class
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], c=y, cmap='viridis', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Dataset sintetico con due classi di punti NON linearmente separabili')
plt.show()

In [None]:
# Train the model
# The input width is the number of polynomial feature columns; it is passed as
# a tuple because Keras 3 rejects a bare integer as a shape
model = create_model((X_poly_train.shape[1],))
# Fit for 100 epochs with mini-batches of 32, evaluating on the validation split
# each epoch; the result is bound to a name so the cell does not end by
# echoing the History object's repr
history = model.fit(X_poly_train, y_train, epochs=100, batch_size=32, validation_data=(X_poly_val, y_val))

In [None]:
# Lay a regular grid over the data range, padded by 1 unit on every side
h = .02  # Step size in the mesh
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
grid_x = np.arange(x_min, x_max, h)
grid_y = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(grid_x, grid_y)

# Apply the already-fitted polynomial expansion to every grid point,
# so the grid lives in the same feature space the model was trained on
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
xx_poly = poly.transform(grid_points)

# Model output for each grid point, folded back into the grid's 2-D layout
Z = model.predict(xx_poly).reshape(xx.shape)

fig, ax = plt.subplots(figsize=(8, 6))
# The 0.5 level set of the predicted probability is the decision boundary
ax.contour(xx, yy, Z, levels=[0.5], colors='black')
# Overlay the samples, coloured by class
ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap='viridis')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Decision Boundary con Features polinomiali')
plt.show()

# Usiamo il modello sul test set

In [None]:
# Predicted probabilities for the held-out, polynomially expanded test samples
poly_test_probabilities = model.predict(X_poly_test, batch_size=32)
# Threshold at 0.5 to obtain boolean class predictions, dropping the singleton axis
y_pred = np.squeeze(poly_test_probabilities > 0.5)

poly_test_f1 = f1_score(y_test, y_pred)
print("F1 Score: ", poly_test_f1)