# ROC Curve for Multiclass Classification
**Classifier**: Random Forest

The dataset used in this tutorial is from this paper:<br>
Morang’a, C.M., Amenga–Etego, L., Bah, S.Y. et al. Machine learning approaches classify clinical malaria outcomes based on haematological parameters. BMC Med 18, 375 (2020). https://doi.org/10.1186/s12916-020-01823-3

### Import libraries for the analysis 

In [None]:
import pandas as pd        
import numpy as np         
import matplotlib.pyplot as plt   
import seaborn as sns             

### Read the data

In [None]:
# Location of the malaria clinical dataset (a CSV file hosted on GitHub).
# It is passed straight to pandas.read_csv, which accepts a URL.
data_file='https://raw.githubusercontent.com/vappiah/Machine-Learning-Tutorials/main/datasets/malaria_clin_data.csv'

In [None]:
# Load the CSV at `data_file` into a pandas DataFrame.
dataframe=pd.read_csv(data_file)

### Explore and clean

In [None]:
# Dimensions of the dataframe as a (n_rows, n_columns) tuple.
dataframe.shape

In [None]:
# Preview the first 5 rows of the dataframe.
dataframe.head(n=5)

In [None]:
# List the column names (useful for locating where the columns of interest start).
dataframe.columns

In [None]:
# Print a summary of the dataframe:
# column names, data types, non-null counts (i.e. missing values), memory usage.
dataframe.info()

In [None]:
# We are interested in the columns 'Clinical_Diagnosis' up to 'RBC_dist_width_Percent',
# i.e. everything from positional index 16 (0-based) through the last column.
# .copy() gives `subset` its own data, so cleaning it later does not operate on a
# slice of `dataframe` (which would trigger pandas' SettingWithCopyWarning).
subset=dataframe.iloc[:,16:].copy()

In [None]:
# Dimensions of the subset as a (n_rows, n_columns) tuple.
subset.shape

In [None]:
# Column data types and non-null counts for the subset.
subset.info()

In [None]:
# Check for missing data: count the missing (NaN) values in each column.
subset.isnull().sum()

In [None]:
# Handle missing values: drop every row that contains at least one missing value.
# The result is rebound to `subset` instead of using inplace=True, because an
# in-place drop on a slice of `dataframe` triggers pandas' SettingWithCopyWarning.
subset=subset.dropna()

In [None]:
# Dimensions of the subset again, to see how many rows remain after the drop.
subset.shape

In [None]:
# Column names of the subset.
subset.columns

In [None]:
#Let us get the different malaria outcomes. 
#The outcomes will be our labels/classes in the data

In [None]:
# Distinct values found in the 'Clinical_Diagnosis' column.
subset['Clinical_Diagnosis'].unique()

In [None]:
# View the outcomes as a pandas Categorical; displaying it also lists the categories.
labels=pd.Categorical(subset['Clinical_Diagnosis'])
labels

In [None]:
# Preview the first rows of the subset.
subset.head()

In [None]:
# Class distribution: number of rows for each value of 'Clinical_Diagnosis'.
subset['Clinical_Diagnosis'].value_counts()

In [None]:
# Plot a bar chart to display the class distribution.
subset['Clinical_Diagnosis'].value_counts().plot.bar()

In [None]:
# Descriptive statistics for every column after the first (column 0 is skipped),
# transposed so that each described column becomes a row.
subset.iloc[:,1:].describe().transpose()

In [None]:
# Check the pairwise correlation between the features.
# Only numeric columns are kept: the non-numeric 'Clinical_Diagnosis' label cannot be
# correlated, and pandas >= 2.0 raises an error instead of silently dropping it.
subset.select_dtypes(include='number').corr()

In [None]:
# Visualize the feature correlation matrix as a seaborn heatmap.
# Only numeric columns are kept: the non-numeric 'Clinical_Diagnosis' label cannot be
# correlated, and pandas >= 2.0 raises an error instead of silently dropping it.
sns.heatmap(subset.select_dtypes(include='number').corr(),cmap='coolwarm')

### Data Preprocessing

In [None]:
# Split the table into the feature matrix X (every column after the first)
# and the target vector y (the first column, which holds the class labels).
X, y = subset.iloc[:, 1:], subset.iloc[:, 0]

\
**Encode labels** \
This is required by scikit-learn when dealing with categorical data.

In [None]:
# Encode the target labels (y) as integers between 0 and n_classes-1 using
# scikit-learn's LabelEncoder, and keep the original label names in `classes`
# so they can be shown later (e.g. in plot legends).
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # learn the label set and encode y in one call
classes = label_encoder.classes_

### Split data into train and test sets

In [None]:
# train test ratio 80:20

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing (80:20 train/test ratio).
# random_state fixes the shuffle so the split, and every result derived from it,
# is reproducible between runs.
# stratify=y keeps the class proportions the same in both splits, which helps
# ensure each class is present in the test set used for the per-class ROC curves.
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42,stratify=y)

### Normalize the data

In [None]:
# scale data between 0 and 1

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler=MinMaxScaler()
# Learn each feature's min and max from the training set only, then scale it to [0, 1].
X_train_norm=min_max_scaler.fit_transform(X_train)
# Apply the SAME training min/max to the test set. Re-fitting on X_test would scale
# the two sets differently and leak test-set information into preprocessing.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
X_test_norm=min_max_scaler.transform(X_test)

### Classification

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc

#### Training Phase
This will be done by passing the training set to a classifier or classifiers.
Because we are dealing with 3 classes, this becomes a multiclass classification problem.
We therefore use the One-vs-the-rest strategy.\
This strategy involves fitting one classifier per class. For each classifier, the class is fitted against all the other classes. 
Here, we use the Random Forest Classifier.

In [None]:
# Random Forest wrapped in a one-vs-rest scheme: because the data is multiclass,
# one forest is trained per class to separate that class from all the others.
base_forest = RandomForestClassifier(max_features=0.2)
RF = OneVsRestClassifier(base_forest)
RF.fit(X_train_norm, y_train)

# Predicted class and per-class probability estimates for the test samples.
y_pred = RF.predict(X_test_norm)
pred_prob = RF.predict_proba(X_test_norm)

#### Plot the ROC Curve

In [None]:
from sklearn.preprocessing import label_binarize

# Binarize the true test labels: one column per class, 1 where the sample belongs
# to that class and 0 otherwise ("this class vs rest").
# The columns must line up with the columns of pred_prob, which follow RF.classes_.
# Using np.unique(y_test) instead would silently shift the columns whenever a class
# happens to be absent from the test split.
# NOTE: label_binarize returns a single column when there are only two classes;
# this cell assumes three or more classes.
model_classes = RF.classes_
y_test_binarized = label_binarize(y_test, classes=model_classes)

# Per-class ROC data, keyed by column index.
fpr = {}
tpr = {}
thresh = {}
roc_auc = {}

n_class = len(model_classes)

for i, class_id in enumerate(model_classes):
    # ROC curve of class i against all other classes, and the area under it.
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test_binarized[:, i], pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    # class_id is the integer code from the label encoder; classes[class_id]
    # is the original label name, used in the legend.
    plt.plot(fpr[i], tpr[i], linestyle='--',
             label='%s vs Rest (AUC=%0.2f)' % (classes[class_id], roc_auc[i]))

# Diagonal reference line: the expected ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'b--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='lower right')
plt.show()