# Review 1: Supervised Learning

- Using the drugs dataset, use classification machine learning models.
- Supervised learning

## Part 1: Import, load, clean the dataset 

In [159]:
# Preprocessing and model selection imports
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [160]:
# Load the dataset into a dataframe, and check the first 5 rows.
df = pd.read_csv('drug200.csv')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [161]:
# Check the datasets info and for any missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


*Based on the results above, there are no null values. There is 1 integer, 4 strings, and 1 float datatypes.*


In [162]:
# A good practice is to make a copy of the dataframe to avoid any changes to the original data.
df_copy = df.copy()

In [163]:
# Transform categorical values to numerical values.
df['BP'].value_counts()


BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64

In [164]:
df['Cholesterol'].value_counts()


Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64

In [165]:
# Apply mapping to convert the Cholesterol and BP columns into ordinal numerical 
# Apply the get_dummies function to convert Sex column into one-hot encoded columns.
# -------------------------------------------------------------------------------------
# The mapping is used for categorical values with ordinal values, such as low, normal, and high.
# However, the one-hot encoding for categorical values are used when there is no ordinal relationship.
# For example, the Sex column with Male and Female string values.

df_copy['Cholesterol'] = df_copy['Cholesterol'].map({'NORMAL':0, 'HIGH':1})
df_copy['BP'] = df_copy['BP'].map({'LOW':0, 'NORMAL':1, 'HIGH':2})
df_copy = pd.get_dummies(df_copy, columns=['Sex'], dtype=int)

In [166]:
# Split the features and the target variable by putting them into X and y.
X = df_copy.drop(columns=['Drug'])
y = df_copy['Drug']

In [167]:
# Check the results by displaying the first 5 rows
X.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Sex_F,Sex_M
0,23,2,1,25.355,1,0
1,47,0,1,13.093,0,1
2,47,0,1,10.114,0,1
3,28,1,1,7.798,1,0
4,61,0,1,18.043,1,0


In [168]:
y.head()

0    DrugY
1    drugC
2    drugC
3    drugX
4    DrugY
Name: Drug, dtype: object

In [169]:
# Apply LabelEncoder to the target variable.
LE = LabelEncoder()
y = LE.fit_transform(y)
y

array([0, 3, 3, 4, 0, 4, 0, 3, 0, 0, 3, 0, 0, 0, 4, 0, 4, 1, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 4, 0, 0, 4, 2, 4, 0, 4, 4, 1, 4, 4, 4, 0, 2, 0, 4,
       4, 4, 1, 3, 0, 0, 0, 4, 0, 0, 2, 3, 2, 0, 4, 0, 0, 1, 0, 4, 2, 0,
       1, 4, 0, 0, 2, 0, 4, 0, 0, 0, 1, 0, 1, 4, 2, 4, 3, 1, 3, 2, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 1, 3, 4, 0, 4, 4, 0, 2, 0,
       1, 4, 4, 4, 4, 0, 4, 4, 1, 0, 0, 0, 0, 0, 2, 0, 0, 4, 0, 4, 0, 0,
       4, 0, 0, 4, 2, 1, 2, 4, 1, 0, 2, 0, 1, 4, 4, 1, 4, 3, 1, 2, 4, 4,
       0, 3, 1, 0, 3, 4, 4, 2, 4, 0, 0, 0, 0, 4, 0, 1, 4, 4, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 4, 4, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 3, 0, 3, 3, 4,
       4, 4])

In [170]:
# Split the dataset using train_test_split into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [171]:
# Check the results of the split.
X_train

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Sex_F,Sex_M
169,20,2,1,11.262,1,0
97,56,2,1,25.395,1,0
31,74,2,1,9.567,0,1
12,43,0,1,15.376,0,1
35,46,1,0,7.285,0,1
...,...,...,...,...,...,...
106,22,1,1,11.953,0,1
14,50,1,1,12.703,1,0
92,29,2,1,29.450,1,0
179,67,1,1,15.891,1,0


In [172]:
y_train

array([1, 0, 2, 0, 4, 0, 0, 0, 0, 2, 4, 4, 4, 2, 1, 4, 0, 4, 1, 4, 3, 0,
       1, 1, 1, 2, 2, 1, 0, 1, 0, 0, 1, 0, 0, 0, 4, 0, 4, 0, 0, 4, 2, 4,
       0, 1, 3, 0, 4, 4, 0, 0, 4, 4, 2, 0, 2, 4, 0, 0, 0, 2, 3, 1, 0, 0,
       0, 1, 4, 4, 0, 3, 0, 0, 4, 2, 4, 4, 0, 0, 0, 0, 0, 0, 2, 4, 3, 1,
       0, 1, 1, 4, 0, 0, 0, 0, 4, 0, 1, 4, 0, 0, 4, 0, 4, 2, 0, 0, 3, 4,
       0, 0, 0, 0, 4, 0, 4, 4, 0, 3, 3, 0, 3, 0, 2, 4, 0, 4, 0, 0, 0, 4,
       0, 0, 0, 4, 4, 0, 0, 3])

In [173]:
X_test 

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Sex_F,Sex_M
95,36,0,0,11.424,0,1
15,16,2,0,15.516,1,0
30,18,1,0,8.75,1,0
158,59,0,1,10.444,1,0
128,47,0,0,33.542,0,1
115,51,2,1,18.295,0,1
69,18,2,0,24.276,1,0
170,28,1,1,12.879,1,0
174,42,2,0,12.766,0,1
45,66,1,0,8.107,1,0


In [174]:
y_test 

array([4, 0, 4, 3, 0, 0, 0, 4, 1, 4, 1, 4, 0, 1, 2, 0, 2, 4, 3, 0, 2, 4,
       4, 0, 0, 0, 3, 4, 0, 4, 0, 3, 3, 0, 1, 0, 4, 1, 0, 1, 4, 4, 4, 0,
       0, 3, 0, 0, 0, 4, 4, 0, 4, 0, 4, 0, 1, 0, 0, 0])

In [175]:
# Use StandardScaler to scale the feature values.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Part 2: Fitting data into machine learning classification models

### Logistic Regression

In [176]:
# Logistic regression uses the sigmoid function to model the probability of a certain class or event. 
# It uses binary which is 0s and 1s to determine the outcome.
LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
y_pred_proba = LR.predict_proba(X_test)


In [177]:
# Use the evaluation metrics to determine the base model's performance.
# The classification report shows the precision, recall, f1-score, and accuracy of each class.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        26
           1       0.88      1.00      0.93         7
           2       0.75      1.00      0.86         3
           3       1.00      0.33      0.50         6
           4       0.82      1.00      0.90        18

    accuracy                           0.90        60
   macro avg       0.89      0.85      0.83        60
weighted avg       0.92      0.90      0.89        60



In [178]:
# The confusion matrix shows the correct and incorrect predictions for each class.
print(confusion_matrix(y_test, y_pred))

[[24  1  1  0  0]
 [ 0  7  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  2  4]
 [ 0  0  0  0 18]]


In [179]:
# The roc_auc_score shows the area under the curve score. It shows the model's ability to distinguish between classes.
# A score of 1.0 indicates perfect classification,
# A score of 0.5 indicates no discrimination.
# A score of 0.0 indicates perfect misclassification.
print(roc_auc_score(y_test, y_pred_proba, multi_class='ovr'))

0.993373333682347


In [180]:
# The accuracy score shows the overall accuracy of the model.
print(accuracy_score(y_test, y_pred))

0.9


## Conclusion

The dataset used in this review is perfectly fitted for the logistic regression classification model. The model was able to accurately predict which drugs are supposed to be used based on the given features in the dataset. The model was 90% accurate, with a `roc_auc_score` of 99.34%, which indicates that it almost perfectly distinguishes classes. The observations of the dataset is quite low, with only 200 observations. It is suggested to gather more data for the observations to give the model a higher accuracy.