### Exercise 1: Logistic regression in Scikit-learn

In [None]:
import numpy as np

# Toy 1-D data set: seven samples with a single feature and a binary label.
X = np.array([[0], [0.1], [0.2], [1], [1.1], [1.2], [1.3]])
y = np.array([0, 0, 0, 1, 1, 1, 0])

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  # not used in this cell

# Fit the logistic regression first: every answer printed below comes from
# this single model, as the exercise title (logistic regression) requires.
reg = LogisticRegression()
reg.fit(X, y)

# Predicted class and class probabilities for the new point x = 0.5.
x_new = [[0.5]]
x_pred = reg.predict(x_new)
x_prob = reg.predict_proba(x_new)

print(f"""
1. xpred = {x_pred}
2. xpred probabilities = {x_prob}
3. coef = {reg.coef_}; intercept = {reg.intercept_}; score = {reg.score(X,y)}
""")

### Exercise 2: Sigmoid

In [None]:
import matplotlib.pyplot as plt

# Evaluate three logistic curves 1 / (1 + exp(-(a*x + b))) on a shared grid.
x = np.linspace(-20, 15, 300)

sigmoid = 1 / (1 + np.exp(-x))                # a = 1,   b = 0
sigmoid1 = 1 / (1 + np.exp(-(0.5 * x + 3)))   # a = 0.5, b = 3
sigmoid2 = 1 / (1 + np.exp(-(5 * x + 11)))    # a = 5,   b = 11

# Draw the curves in the same order as before so the colour cycle is unchanged.
for curve in (sigmoid, sigmoid1, sigmoid2):
    plt.plot(x, curve)

# Horizontal reference line at probability 0.5.
plt.axhline(y=0.5, color='red')

plt.title("Sigmoids")
plt.xlabel("X")

plt.show()

### Exercise 3: Decision boundary

In [3]:
from sklearn.datasets import make_classification


# Settings of the synthetic one-feature, two-class data set, gathered in one
# place so they are easy to read and tweak.
dataset_1d_params = dict(
    n_samples=100,
    n_features=1,
    n_informative=1,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.5, 0.5],
    flip_y=0.15,
    class_sep=2.0,
    hypercube=True,
    shift=1.0,
    scale=1.0,
    shuffle=True,
    random_state=88,
)

X, y = make_classification(**dataset_1d_params)


##### 1. Plot the data using a scatter plot. The x-axis contains the feature and y-axis contains the target.

In [None]:
# Scatter of the single feature (x-axis) against the binary target (y-axis),
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.set_title("X (1 dimension) and y")
ax.set_xlabel("X")
ax.set_ylabel("Y")

ax.scatter(X, y)

##### 2. Fit a Logistic Regression on the generated data using scikit-learn. Print the coefficients and the intercept of the Logistic Regression.

In [None]:
# Fit a fresh logistic regression on the 1-D data generated above.
lreg = LogisticRegression()
lreg.fit(X, y)

# Both values are read from `lreg`, the model fitted in this cell.
print(f"""
coef = {lreg.coef_}
intercept = {lreg.intercept_}
""")

##### 3. Add to the previous plot the fitted sigmoid and the 0.5 probability line. The plot should look like this:

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    return 1 / (1 + np.exp(-x))


# Evaluate the fitted curve on a grid spanning the plotted x-range.
x = np.linspace(-3, 4)
slope = lreg.coef_[0][0]
offset = lreg.intercept_[0]
y_prob = sigmoid(x * slope + offset)

fig, ax = plt.subplots()
ax.set_title("X (1 dimension) and y")
ax.set_xlabel("X")
ax.set_ylabel("Y")

ax.scatter(X, y)                            # raw data
ax.plot(x, y_prob, color="orange")          # fitted sigmoid
ax.plot([-3, 4], [0.5, 0.5], color="red")   # 0.5 probability line

##### 4. Create a function predict_probability that takes as input the data point and the coefficients and that returns the predicted probability.

In [None]:
def predict_probability(coefs, X):
    '''
    Return the logistic-regression probability of the positive class.

    coefs is a list that contains a and b: [coef, intercept]
    X is the features set (a scalar or an array-like of feature values)

    Returns sigmoid(a*X + b), with the same shape as X.

    The logistic function is computed inline on purpose: the notebook-level
    name `sigmoid` is bound to an array in Exercise 2 and only later rebound
    to a callable, so relying on it would make this function depend on the
    order in which cells were executed.
    '''
    # Linear score a*x + b; np.asarray lets plain lists and scalars through.
    z = coefs[0] * np.asarray(X) + coefs[1]
    probabilities = 1 / (1 + np.exp(-z))

    return probabilities


# Refit on the current X, y so this cell does not rely on an earlier fit.
lreg.fit(X, y)

# predict_proba returns one column per class, shape (n_samples, 2);
# column 1 is the probability of class 1 (columns follow lreg.classes_).
x_prob = lreg.predict_proba(X)

coefs = [lreg.coef_[0][0], lreg.intercept_[0]]
# The manual version returns P(y = 1) only, shaped like X: (n_samples, 1).
x_prob1 = predict_probability(coefs, X)

# Compare like with like: the positive-class column against the manual values.
np.testing.assert_allclose(x_prob[:, 1], x_prob1.ravel())

x_prob[:3, 1], x_prob1[:3].ravel()

##### 5. Create a function predict_class that takes as input the data point and the coefficients and that returns the predicted class. Check you have the same results as the class method predict output on the same data.

In [None]:
def predict_class(coefs, X: np.ndarray):
    '''
    Return the class (0 or 1) predicted by a one-feature logistic regression.

    coefs is a list that contains a and b: [coef, intercept]
    X is the features set

    Returns class of X: 1 where sigmoid(a*X + b) >= 0.5, else 0, as an
    integer array with the same shape as X.

    The logistic function is computed inline so the result does not depend
    on what the notebook-level name `sigmoid` is bound to when this runs.
    '''
    z = coefs[0] * np.asarray(X) + coefs[1]
    prob = 1 / (1 + np.exp(-z))
    # Threshold the positive-class probability at 0.5.
    labels = np.where(prob >= 0.5, 1, 0)

    return labels

x_class = lreg.predict(X)             # shape (n_samples,)
x_class1 = predict_class(coefs, X)    # same shape as X: (n_samples, 1)

# Actually check the agreement instead of eyeballing the first few rows.
assert np.array_equal(x_class, x_class1.ravel()), \
    "predict_class disagrees with LogisticRegression.predict"

x_class[:5], x_class1[:5].ravel()

##### 6. On the plot add the predicted class

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    return 1 / (1 + np.exp(-x))


# Fitted probability curve on a grid spanning the plotted x-range.
x = np.linspace(-3, 4)
slope = lreg.coef_[0][0]
offset = lreg.intercept_[0]
y_prob = sigmoid(x * slope + offset)

fig, ax = plt.subplots()
ax.set_title("X (1 dimension) and y")
ax.set_xlabel("X")
ax.set_ylabel("Y")

ax.scatter(X, y)                      # true labels at y = 0 and y = 1
ax.plot(x, y_prob, color="orange")    # fitted sigmoid
# Predicted classes, clipped to 0.1 / 0.9 so they do not sit on top of the
# true labels.
ax.scatter(X, x_class.clip(0.1, 0.9), color="orange")

ax.plot([-3, 4], [0.5, 0.5], color="red")   # 0.5 probability line

Now, let us repeat this process on 2-dimensional data. The goal is to focus on the decision boundary and to understand how the Logistic Regression creates a line that separates the data. The code to plot the decision boundary is provided; however, it is important to understand how it works.

*  Generate 250 data points using:


In [10]:
# Settings of the synthetic two-feature, two-class data set.
dataset_2d_params = dict(
    n_samples=250,
    n_features=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    flip_y=0.05,
    class_sep=3,
    random_state=43,
)

X, y = make_classification(**dataset_2d_params)


##### 7. Fit the Logistic Regression on X and y and use the code below to plot the fitted sigmoid on the data set.

In [None]:
# Refit the classifier on the current (two-feature) X, y.
lreg.fit(X, y)

# Evaluation grid over the plotted square, sampled with a 0.01 step.
xx, yy = np.mgrid[-5:5:.01, -5:5:.01]
grid = np.column_stack((xx.ravel(), yy.ravel()))
#if needed change the line below
grid_proba = lreg.predict_proba(grid)
# Keep column 1 and fold it back onto the grid shape.
probs = grid_proba[:, 1].reshape(xx.shape)

f, ax = plt.subplots(figsize=(8, 6))

# Filled contours of the predicted probability, with a labelled colour bar.
contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu", vmin=0, vmax=1)
ax_c = f.colorbar(contour)
ax_c.set_label("$P(y = 1)$")
ax_c.set_ticks([0, .25, .5, .75, 1])

# Data points coloured by their true label on the same colour map.
point_style = dict(
    s=50,
    cmap="RdBu",
    vmin=-.2,
    vmax=1.2,
    edgecolor="white",
    linewidth=1,
)
ax.scatter(X[:, 0], X[:, 1], c=y, **point_style)

ax.set(
    aspect="equal",
    xlim=(-5, 5),
    ylim=(-5, 5),
    xlabel="$X_1$",
    ylabel="$X_2$",
)


### Exercise 4: Train test split

In [None]:
from sklearn.model_selection import train_test_split


# Ten samples with two features each; the last three samples are class 1.
X = np.arange(1, 21).reshape(10, -1)
y = np.concatenate([np.zeros(7), np.ones(3)])

# Ordered split: the last 20% of the rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print(f"""
xtrain: {X_train}

ytrain: {y_train}

x_test: {X_test}

y_test: {y_test}
      
""")

# Share of class 1 in each part of the split.
train_prop, test_prop = (np.mean(part == 1) for part in (y_train, y_test))

train_prop, test_prop

In [None]:
# One hundred samples with two features each; the last 30 are class 1.
X = np.arange(1, 201).reshape(100, -1)
y = np.zeros(100)
y[70:] = 1

# Stratified split keeps the class ratio in both parts. The split shuffles,
# so a fixed random_state is needed for the same rows on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=43
)

# Share of class 1 in each part of the split.
train_prop = np.mean(y_train == 1)
test_prop = np.mean(y_test == 1)

train_prop, test_prop


### Exercise 5: Breast Cancer prediction

The goal of this exercise is to use Logistic Regression to predict breast cancer. It is always important to understand the data before training any Machine Learning algorithm. The data is described in breast-cancer-wisconsin.names. I suggest manually adding the column names to the DataFrame.

In [None]:
import pandas  as pd

# Column names are added by hand because the raw file has no header row.
# NOTE(review): the abbreviations presumably follow the attribute list in
# breast-cancer-wisconsin.names -- confirm against that file.
cols = [
    "SCN",      # used as the index below
    "CT",
    "UCSI",
    "UCSH",
    "MA",
    "SECS",
    "BN",
    "BC",
    "NN",
    "Mitoses",
    "Class",    # target column
]

DATA_URL = "https://raw.githubusercontent.com/01-edu/public/refs/heads/master/subjects/ai/classification/data/breast-cancer-wisconsin.data"

data = pd.read_csv(DATA_URL, names=cols, index_col=["SCN"])

# Non-numeric entries become NaN, then each NaN is replaced by its column median.
data = data.apply(pd.to_numeric, errors="coerce")
data = data.fillna(data.median())

# Make the "no missing values left" check explicit instead of a discarded expression.
assert data.isnull().sum().sum() == 0, "missing values remain after imputation"

X = data.drop("Class", axis=1)
y = data["Class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=43
)

# Share of samples labelled 2 in each part of the split.
# NOTE(review): 2 is presumably the "benign" label -- confirm in the .names file.
train_prop = np.mean(y_train == 2)
test_prop = np.mean(y_test == 2)

train_prop, test_prop


In [None]:
# Fit the classifier on the training part of the breast-cancer data.
lreg = LogisticRegression()
lreg.fit(X_train, y_train)

# Predicted labels and class probabilities for both parts of the split.
train_pred = lreg.predict(X_train)
train_prob = lreg.predict_proba(X_train)
test_pred = lreg.predict(X_test)
test_prob = lreg.predict_proba(X_test)

# Mean accuracy on each part.
train_score = lreg.score(X_train, y_train)
test_score = lreg.score(X_test, y_test)

# Only the first five rows are shown; probabilities are taken from column 1.
print(f"""
train pred :
{train_pred[:5]}

train probabilities :
{train_prob[:5,1]}

test pred :
{test_pred[:5]}

test probabilities :
{test_prob[:5,1]}

train score: {train_score}

test score: {test_score}

""")

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrices of the fitted model on both parts of the split.
train_cm = confusion_matrix(y_true=y_train, y_pred=train_pred)
test_cm = confusion_matrix(y_true=y_test, y_pred=test_pred)

print(f"""
train confusion_matrix: 
{train_cm}

test confusion_matrix: 
{test_cm}
""")