In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [20]:
# Load dataset from csc file

data = pd.read_csv('C:/Users/PMLS/Desktop/ai/ML lab/archive (1)/drug200.csv')


print(data.head())


   Age Sex      BP Cholesterol  Na_to_K   Drug
0   23   F    HIGH        HIGH   25.355  DrugY
1   47   M     LOW        HIGH   13.093  drugC
2   47   M     LOW        HIGH   10.114  drugC
3   28   F  NORMAL        HIGH    7.798  drugX
4   61   F     LOW        HIGH   18.043  DrugY


In [21]:
# Encoding categorical features
label_encoders = {}
categorical_columns = ['Sex', 'BP', 'Cholesterol', 'Drug']

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


print(data.head())



   Age  Sex  BP  Cholesterol  Na_to_K  Drug
0   23    0   0            0   25.355     0
1   47    1   1            0   13.093     3
2   47    1   1            0   10.114     3
3   28    0   2            0    7.798     4
4   61    0   1            0   18.043     0


In [22]:
# Separating features and labels
X = data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
y = data['Drug'].values


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
 # Class distribution in the dataset
class_distribution = data['Drug'].value_counts()
print(class_distribution)


Drug
0    91
4    54
1    23
3    16
2    16
Name: count, dtype: int64


In [24]:
# Initialize and train KNN
'''knn = KNeighborsClassifier(n_neighbors=7)  # Use 5 neighbors
knn.fit(X_train, y_train)'''

from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1,10)}  # Testing k from 1 to 20
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['n_neighbors']

best_score = grid_search.best_score_

print(f"Best k: {best_k}")
print(f"Best Cross-Validation Accuracy: {best_score:.2f}")


best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
'''
# Predict on the test dataset
y_pred = knn.predict(X_test)

# Evaluate the model
print("Accuracy for unbalanced :", accuracy_score(y_test, y_pred))
print("\nClassification Report for unbalaced:\n", classification_report(y_test, y_pred, target_names=label_encoders['Drug'].classes_))
'''

Best k: 7
Best Cross-Validation Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        15
           1       0.60      0.50      0.55         6
           2       0.20      0.33      0.25         3
           3       1.00      0.20      0.33         5
           4       0.50      0.64      0.56        11

    accuracy                           0.65        40
   macro avg       0.65      0.52      0.52        40
weighted avg       0.72      0.65      0.65        40



'\n# Predict on the test dataset\ny_pred = knn.predict(X_test)\n\n# Evaluate the model\nprint("Accuracy for unbalanced :", accuracy_score(y_test, y_pred))\nprint("\nClassification Report for unbalaced:\n", classification_report(y_test, y_pred, target_names=label_encoders[\'Drug\'].classes_))\n'

Result after feature scalling is

In [25]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


knn after feature scaling

In [26]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1,10)}  # Testing k from 1 to 20
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_k = grid_search.best_params_['n_neighbors']

best_score = grid_search.best_score_

print(f"Best k: {best_k}")
print(f"Best Cross-Validation Accuracy: {best_score:.2f}")


best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Best k: 1
Best Cross-Validation Accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      0.83      0.91         6
           2       0.75      1.00      0.86         3
           3       1.00      1.00      1.00         5
           4       1.00      0.91      0.95        11

    accuracy                           0.95        40
   macro avg       0.94      0.95      0.94        40
weighted avg       0.96      0.95      0.95        40



            NOW BALANCING THE DATA

In [27]:
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

print("Resampled Class Distribution:", Counter(y_train_under))

clf_under = KNeighborsClassifier(n_neighbors=5) 
clf_under.fit(X_train_under, y_train_under)

y_pred_under = clf_under.predict(X_test)






print("Accuracy (after undersampling):", (accuracy_score(y_test, y_pred_under))*100)
#print("Confusion Matrix (after undersampling):\n", confusion_matrix(y_test, y_pred_under))
print("Classification Report (after undersampling):\n", classification_report(y_test,y_pred_under))



Resampled Class Distribution: Counter({np.int64(0): 11, np.int64(1): 11, np.int64(2): 11, np.int64(3): 11, np.int64(4): 11})
Accuracy (after undersampling): 60.0
Classification Report (after undersampling):
               precision    recall  f1-score   support

           0       0.89      0.53      0.67        15
           1       0.43      0.50      0.46         6
           2       0.29      0.67      0.40         3
           3       0.50      0.80      0.62         5
           4       0.78      0.64      0.70        11

    accuracy                           0.60        40
   macro avg       0.58      0.63      0.57        40
weighted avg       0.70      0.60      0.62        40

