<a href="https://colab.research.google.com/github/francesca-leonardi/Learning-Machine-Learning/blob/main/drug_classification_with_polynomial_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
### Simple logistic regression ###

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

df = pd.read_csv('/content/drug200.csv')

# Conversion of non numerical data in the dataframe to numerical data

df.Sex = df.Sex.apply(lambda p: 0 if p == 'M' else 1)
df.BP = df.BP.apply(lambda p: 0 if p == 'NORMAL' else (-1 if p == 'LOW' else 1))
df.Cholesterol = df.Cholesterol.apply(lambda p: 0 if p == 'NORMAL' else 1)

def convertor(row,values):
  i = 0
  while row != values[i]:
    i += 1
  return i

drug_values = df.Drug.unique()
df.Drug = df.Drug.apply(lambda x: convertor(x,drug_values))

# Preparation of the training and testing set

train_and_cv, test = train_test_split(df, train_size=0.8, random_state=42)
train, cv = train_test_split(train_and_cv, train_size = 0.75)

X_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1].tolist()

X_cv = cv.iloc[:,:-1]
y_cv = cv.iloc[:,-1].tolist()

X_test = test.iloc[:,:-1]
y_test = test.iloc[:,-1].tolist()

In [80]:
# Training of the model and prediction

model = LogisticRegression(multi_class='multinomial', max_iter = 5000)

model.fit(X_train,y_train)

y_hat = model.predict(X_cv)

# Performance of the model

accuracy = accuracy_score(y_true = y_cv, y_pred=y_hat)

cm = confusion_matrix(y_true = y_cv, y_pred = y_hat)

print("Accuracy score:",accuracy)
print("Confusion matrix:\n",cm)
print("Drugs:",drug_values)



Accuracy score: 0.975
Confusion matrix:
 [[23  0  0  0  0]
 [ 0  0  1  0  0]
 [ 0  0  9  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0  4]]
Drugs: ['DrugY' 'drugC' 'drugX' 'drugA' 'drugB']


In [81]:
accuracy_per_deg = {1: accuracy}
cm_per_deg = {1: cm}

In [82]:
# Enhancement of the dataset with polynomial features

for deg in range(2,6):

    X_train_deg = X_train.copy()
    X_cv_deg    = X_cv.copy()
    X_test_deg  = X_test.copy()

    for i_BP in [0,1,2]:
        for i_Age in range(deg-i_BP+1):
            for i_Na_to_K in range(deg-(i_BP+i_Age)+1):
                col_name = f"deg{deg}_BP{i_BP}_Age{i_Age}_NaK{i_Na_to_K}"
                X_train_deg[col_name] = (X_train['BP']**i_BP)*(X_train['Age']**i_Age)*(X_train['Na_to_K']**i_Na_to_K)
                X_cv_deg[col_name]    = (X_cv['BP']**i_BP)*(X_cv['Age']**i_Age)*(X_cv['Na_to_K']**i_Na_to_K)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_deg, y_train)
    y_hat = model.predict(X_cv_deg)

    accuracy = accuracy_score(y_cv, y_hat)
    cm = confusion_matrix(y_cv, y_hat)

    accuracy_per_deg[deg] = accuracy
    cm_per_deg[deg] = cm

print("Accuracies:", accuracy_per_deg)
print("Confusion matrices:", cm_per_deg)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracies: {1: 0.975, 2: 1.0, 3: 0.95, 4: 0.85, 5: 0.9}
Confusion matrices: {1: array([[23,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0],
       [ 0,  0,  9,  0,  0],
       [ 0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  4]]), 2: array([[23,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0],
       [ 0,  0,  9,  0,  0],
       [ 0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  4]]), 3: array([[23,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0],
       [ 0,  1,  8,  0,  0],
       [ 0,  0,  0,  3,  0],
       [ 0,  0,  1,  0,  3]]), 4: array([[19,  0,  1,  3,  0],
       [ 0,  1,  0,  0,  0],
       [ 0,  1,  8,  0,  0],
       [ 0,  0,  0,  3,  0],
       [ 1,  0,  0,  0,  3]]), 5: array([[23,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0],
       [ 0,  3,  6,  0,  0],
       [ 0,  0,  0,  3,  0],
       [ 1,  0,  0,  0,  3]])}


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [83]:
# Selection of the model with highest accuracy score on the cross validation dataset

best_deg = max(accuracy_per_deg, key=accuracy_per_deg.get)

print('The best degree for the polynomial features is', best_deg)

X_train_deg = X_train.copy()
X_test_deg  = X_test.copy()

for i_BP in range(best_deg):
    for i_Age in range(best_deg-i_BP+1):
        for i_Na_to_K in range(best_deg-(i_BP+i_Age)+1):
            col_name = f"deg{deg}_BP{i_BP}_Age{i_Age}_NaK{i_Na_to_K}"
            X_train_deg[col_name] = (X_train['BP']**i_BP)*(X_train['Age']**i_Age)*(X_train['Na_to_K']**i_Na_to_K)
            X_test_deg[col_name]  = (X_test['BP']**i_BP)*(X_test['Age']**i_Age)*(X_test['Na_to_K']**i_Na_to_K)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_deg, y_train)

# Testing

y_hat = model.predict(X_test_deg)

accuracy = accuracy_score(y_test, y_hat)
cm = confusion_matrix(y_test, y_hat)

print('Accuracy score on the test:', accuracy)
print('Confusion matrix on the test:\n', cm)

The best degree for the polynomial features is 2
Accuracy score on the test: 0.9
Confusion matrix on the test:
 [[15  0  0  0  0]
 [ 0  5  0  0  0]
 [ 0  3  8  0  0]
 [ 0  0  1  5  0]
 [ 0  0  0  0  3]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
