In [None]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the breast-cancer dataset that ships with scikit-learn.
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()

# Feature matrix and label vector, unpacked in a single step.
X, y = dataset.data, dataset.target


In [None]:
# Build one frame holding every feature column plus the label as a trailing
# `target` column.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

# Preview the first five rows.
df.head(5)

In [None]:
# Separate the label from the features WITHOUT mutating `df`.
# `df.pop("target")` removes the column in place, so running this cell a second
# time raises KeyError; selecting/dropping keeps the cell idempotent.
y = df["target"]
X = df.drop(columns="target")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test partition (default split size); the fixed seed makes the
# partition repeatable.
split_parts = train_test_split(X, y, random_state=3)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: 5-nearest-neighbours. `fit` returns the estimator itself,
# so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score on the held-out split (shown as the cell output).
knn.score(X_test, y_test)

In [None]:
from sklearn.ensemble import BaggingClassifier

# TODO: Read the scikit-learn documentation on the out-of-bag (OOB) score:
# what is it, and why would you use it?

# Bagging ensemble of ten 5-NN classifiers, each trained on a bootstrap sample
# half the size of the training set; OOB scoring is switched on.
bag_knn = BaggingClassifier(
    KNeighborsClassifier(n_neighbors=5),
    n_estimators=10,
    max_samples=0.5,
    bootstrap=True,
    oob_score=True,
    random_state=3,
)

bag_knn.fit(X_train, y_train)
bag_knn.score(X_test, y_test)

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with the default split rule (min_samples_split=2).
# random_state is pinned so the fitted tree -- and therefore the score shown
# below -- is the same on every run; an unseeded tree can break ties between
# equally good splits differently from run to run.
model_dt = DecisionTreeClassifier(min_samples_split=2, random_state=3)
model_dt.fit(X_train, y_train)

# Score on the held-out split (cell output).
model_dt.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same tree model, now splitting on entropy with an explicit depth cap.
# random_state is pinned so this score can be compared fairly (and repeatably)
# against the previous tree; without a seed the result can vary between runs.
model_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=24,
    min_samples_leaf=1,
    random_state=3,
)
model_dt.fit(X_train, y_train)

# Score on the held-out split (cell output).
model_dt.score(X_test, y_test)

##### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed; `fit` returns
# the estimator, so it is trained in the same statement.
model_rf = RandomForestClassifier(random_state=6).fit(X_train, y_train)

# Score on the held-out split (cell output).
model_rf.score(X_test, y_test)

In [None]:
# Search space for the random-forest grid search.
# NOTE: 'auto' is no longer an accepted value for `max_features` in current
# scikit-learn releases (it was deprecated and then removed) and makes the
# search fail with an invalid-parameter error. For classifiers 'auto' meant
# 'sqrt', so 'sqrt' is used here to keep the same search space.
grid_param = {
    "n_estimators": [90, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(5, 10, 1),
    'min_samples_leaf': range(1, 4, 1),
    'min_samples_split': range(2, 5, 1),
    'max_features': ['sqrt', 'log2'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `grid_param` with 3-fold cross-validation, using all
# available cores and verbose progress output.
grid_search = GridSearchCV(
    estimator=model_rf,
    param_grid=grid_param,
    cv=3,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyper-parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Re-create a forest from the winning hyper-parameters (same seed as before)
# and train it on the full training split.
best_params = grid_search.best_params_
model_best_rf = RandomForestClassifier(**best_params, random_state=6)
model_best_rf.fit(X_train, y_train)

# Score of the tuned forest on the held-out split (cell output).
model_best_rf.score(X_test, y_test)

In [None]:
# Number of trees the tuned forest was configured with.
model_best_rf.n_estimators

In [None]:
# Count the fitted trees held by the forest.
len(model_best_rf.estimators_)

In [None]:
# Inspect one individual fitted tree (index 2) of the forest.
model_best_rf.estimators_[2]

In [None]:
from sklearn.tree import plot_tree

# Very large canvas so the drawn tree stays legible.
plt.figure(figsize=(80, 40))

# Draw the forest's tree at index 2 with coloured nodes; the trailing `;`
# suppresses the list-of-artists repr in the cell output.
plot_tree(
    model_best_rf.estimators_[2],
    feature_names=dataset.feature_names,
    class_names=['Cancer', "No Cancer"],
    filled=True,
);

##### Feature Importance

The Random Forest algorithm has built-in feature importance which can be computed in two ways:
1. Gini importance (or mean decrease impurity)
2. Mean Decrease Accuracy - is a method of computing the feature importance on permuted out-of-bag (OOB) samples based on mean decrease in the accuracy. This method is not implemented in the scikit-learn package

Gini importance is computed from the Random Forest structure. While the forest is being built, we can measure how much each feature decreases the impurity at the splits where it is used, and average that decrease for each feature. The average over all trees in the forest is the measure of feature importance. The drawback of this method is its tendency to prefer numerical features and categorical features with high cardinality. For correlated features, it may select one of the features and neglect the importance of the other, leading to wrong conclusions.

In [None]:
# Print the feature names, then the forest's impurity-based importances,
# one array after the other.
for values in (dataset.feature_names, model_best_rf.feature_importances_):
    print(values)

In [None]:
# Horizontal bar chart: one bar per feature, bar length = its importance.
plt.barh(y=dataset.feature_names, width=model_best_rf.feature_importances_)

In [None]:
# Same chart, with the bars ordered by importance (ascending index order from
# argsort, so the largest bar is drawn last).
importances = model_best_rf.feature_importances_
sorted_idx = importances.argsort()

plt.barh(dataset.feature_names[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

##### Permutation based feature importance

Permutation-based importance can be used to overcome the drawbacks of the default feature importance computed from mean impurity decrease. This method randomly shuffles each feature in turn and computes the change in the model's performance. The features whose shuffling degrades performance the most are the most important ones.

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance on the held-out split. The shuffling is random, so
# random_state is pinned; otherwise the ranking in the chart changes from one
# run of the notebook to the next.
perm_importance = permutation_importance(
    model_best_rf, X_test, y_test, random_state=6
)

# Order features by their mean importance across the shuffling repeats and
# plot them as horizontal bars.
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(dataset.feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

##### SHAP based feature importance

Shapley values are calculated using cooperative game theory. They also give a feature-importance breakdown per class (label).

In [None]:
# SHAP is a third-party package and must be installed separately.
import shap # Requires SHAP install in your environment

# Tree-specific explainer wrapped around the tuned random forest.
explainer = shap.TreeExplainer(model_best_rf)
# SHAP values for every row of the held-out split.
# NOTE(review): the container type/shape of this result depends on the
# installed SHAP release -- confirm before indexing into it.
shap_values = explainer.shap_values(X_test)

In [None]:
# Bar-style summary plot of the SHAP values computed for the test split.
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# `shap.plots.bar` only accepts Explanation objects; the raw array returned by
# `explainer.shap_values(...)` is rejected with a TypeError. Calling the
# explainer directly yields an Explanation, and the last axis selects the
# model output, so `[:, :, 0]` keeps the explanation for class 0.
shap_explanation = explainer(X_test)
shap.plots.bar(shap_explanation[:, :, 0])

In [None]:
# Explain a single test row; the double brackets keep it a one-row DataFrame.
choosen_instance = X_test.iloc[[0]]
shap_values_inst = explainer.shap_values(choosen_instance)

# Older SHAP releases return a list with one (rows, features) array per class;
# newer releases return a single (rows, features, classes) array. Indexing the
# latter with `[1]` would address row 1 of a one-row result and raise
# IndexError, so pick the class-1 values in a way that works for both.
if isinstance(shap_values_inst, list):
    class1_shap = shap_values_inst[1]
else:
    class1_shap = shap_values_inst[:, :, 1]

# Load the JS needed to render the interactive plot, then draw the force plot
# for class 1 relative to the explainer's class-1 baseline.
shap.initjs()
shap.force_plot(explainer.expected_value[1], class1_shap, choosen_instance)

Above is a force plot of the feature Shapley values. Features shown in pink push the prediction higher; features shown in blue push it lower. The size of each bar shows the magnitude of that feature's effect. The sum of all feature SHAP values explains why the model's prediction differs from the baseline.

#### Evaluation metrics

In [None]:
from sklearn.metrics import accuracy_score

# Hard class predictions of the tuned forest on the held-out split.
y_pred = model_best_rf.predict(X_test)

# Fraction of test rows predicted correctly (cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Flatten the matrix and unpack its four cells into named counts.
tn, fp, fn, tp = cm.ravel()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Recompute the confusion matrix and render it as a heat-map; `plot()` hands
# back the display object, so it can be chained onto the constructor.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm).plot()
plt.show()

In [None]:
from sklearn.metrics import (
    f1_score,
    fbeta_score,
    precision_score,
    recall_score,
)

# Report the four summary scores for the forest's predictions, one per line.
print(f"Precision = {precision_score(y_test, y_pred)}")
print(f"Recall = {recall_score(y_test, y_pred)}")
print(f"F-1 score = {f1_score(y_test, y_pred)}")
# beta=2 gives the F-2 variant of the F-beta score.
print(f"F-2 score = {fbeta_score(y_test, y_pred, beta=2)}")

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Precision, recall, F-score and support in one call (cell output).
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.metrics import classification_report

# The macro average is the arithmetic mean of the per-class scores.
# Background reading:
# https://towardsdatascience.com/micro-macro-weighted-averages-of-f1-score-clearly-explained-b603420b292f
report_text = classification_report(y_test, y_pred)
print(report_text)

##### Evaluating Random Forest model performance and robustness with ROC

In [None]:
# Ask the tuned forest for class probabilities instead of hard 1/0 labels;
# the result is displayed as the cell output.
y_pred_prob = model_best_rf.predict_proba(X_test)
y_pred_prob

In [None]:
# Boolean mask: True where the column-1 probability exceeds 0.5.
y_pred_prob[:,1] > 0.5

In [None]:
# Same comparison with a stricter 0.7 cut-off, converted from booleans to 0/1 integers.
(y_pred_prob[:,1] > 0.7).astype(int)

In [None]:
# Confusion matrix when a row is labelled 1 only if its column-1 probability
# is above 0.5.
y_pred_at_half = (y_pred_prob[:, 1] > 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred_at_half)

disp = ConfusionMatrixDisplay(cm).plot()
plt.show()

In [None]:
# Interpretation of this result after increasing the threshold:
# raising the threshold makes the model predict the positive class less often,
# so false positives can only go down (or stay the same) while false negatives
# can only go up (or stay the same). The two do NOT trade off one-for-one:
# the total number of errors (FP + FN) generally changes with the threshold.
threshold = 0.7
cm = confusion_matrix(y_test, (y_pred_prob[:, 1] > threshold).astype(int))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
# Label the figure so it can be told apart from the 0.5-threshold matrix.
disp.ax_.set_title(f"Decision threshold = {threshold}")
plt.show()

![TPFP](https://onedrive.live.com/embed?resid=A5A4158EF1352FCB%211958&authkey=%21AAtvfe4lv5jfOGg&width=600)

In [None]:
from sklearn.metrics import auc, roc_curve

# ROC curve from the column-1 probabilities on the held-out split.
# The third return value is an ARRAY of cut-offs, so it is bound to
# `thresholds`; unpacking it into `threshold` would silently replace the
# scalar cut-off defined in the previous cell with an array.
fpr, tpr, thresholds = roc_curve(y_test, model_best_rf.predict_proba(X_test)[:, 1])

# Area under the ROC curve (cell output).
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
import matplotlib.pyplot as plt

# Draw the ROC curve on an explicit Axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')

# Model curve in blue, with the AUC in the legend.
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')

# Dashed red diagonal as a reference line.
ax.plot([0, 1], [0, 1], 'r--')

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
from sklearn.metrics import RocCurveDisplay

# Let scikit-learn draw the ROC curve directly from the fitted forest.
# (The variable keeps its original name even though the model is a random forest.)
svc_disp = RocCurveDisplay.from_estimator(
    estimator=model_best_rf,
    X=X_test,
    y=y_test,
)

##### Exercise: Draw a PR curve and understand it

Don't blindly copy code from ChatGPT.

### Imbalanced Dataset classification

This is not part of the syllabus and is an optional exercise. Refer to this Kaggle credit card fraud dataset and look at the provided notebooks:
1. https://www.kaggle.com/code/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
2. https://www.kaggle.com/code/marcinrutecki/best-techniques-and-metrics-for-imbalanced-dataset

Run `pip install imbalanced-learn` (the package is imported as `imblearn`) for a SMOTE implementation.

### Multi class classification

1. Read this article to understand the classification report especially in the case of multi class classification: https://towardsdatascience.com/micro-macro-weighted-averages-of-f1-score-clearly-explained-b603420b292f
2. Download a kaggle dataset for multi class classification. 
3. Your goal is to run all the evaluation metrics, ROC AUC curves and classification reports and see the impact of changing threshold on False Positive and False negatives
4. Don't use the iris dataset, because its class distributions are very cleanly separated and there won't be any real false positives or false negatives.

References
1. https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
2. Multiclass ROC plots https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

Read about One Versus One (OvO) and One versus Rest (OvR)
https://towardsdatascience.com/comprehensive-guide-to-multiclass-classification-with-sklearn-127cc500f362

### Stacking (aka Stacked Generalization)

Read up on Stacking Generalization and meta models

In [None]:
# Switch to the wine dataset that ships with scikit-learn.
# Note: this rebinds `dataset`, `X` and `y` from the earlier sections.
from sklearn.datasets import load_wine

dataset = load_wine()

# Feature matrix and label vector, unpacked in a single step.
X, y = dataset.data, dataset.target

In [None]:
# Build one frame holding every feature column plus the label as a trailing
# `target` column.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

# Preview the first five rows.
df.head(5)

In [None]:
# Separate the label from the features WITHOUT mutating `df`.
# `df.pop("target")` removes the column in place, so running this cell a second
# time raises KeyError; selecting/dropping keeps the cell idempotent.
y = df["target"]
X = df.drop(columns="target")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test partition (default split size); the fixed seed makes the
# partition repeatable.
split_parts = train_test_split(X, y, random_state=3)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# First base learner on its own: 7-nearest-neighbours, trained in the same
# statement because `fit` returns the estimator.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Score on the held-out split (cell output).
knn.score(X_test, y_test)

In [None]:
from sklearn.svm import SVC

# Second base learner on its own: support-vector classifier with default
# settings, trained in the same statement.
svm = SVC().fit(X_train, y_train)

# Score on the held-out split (cell output).
svm.score(X_test, y_test)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Fresh, unfitted base learners and a logistic-regression meta-model.
knn = KNeighborsClassifier(n_neighbors=7)
svm = SVC()
lr = LogisticRegression()

# Stack the two base learners under the meta-model and train the ensemble.
base_learners = [('knn', knn), ("svc", svm)]
stacking = StackingClassifier(estimators=base_learners, final_estimator=lr)
stacking.fit(X_train, y_train)

In [None]:
# Score of the stacked ensemble on the held-out split.
accuracy = stacking.score(X_test, y_test)

# Keep the hard predictions around for further inspection.
y_pred = stacking.predict(X_test)

print("Accuracy:", accuracy)