In [1]:
import pandas as pd 
import numpy as np

import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the cleaned mouse dataset from its CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
csv_path = r"E:\Data Science by SRK\Machine_learning\Classification\mouse_data\cleaned_dataset_mouse.csv"
df = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame as the cell output (the rendered table elides the middle rows).
df

Unnamed: 0,Med_1_mL,Med_2_mL,Virus_Present
0,6.508231,8.582531,0
1,4.126116,3.073459,1
2,6.427870,6.369758,0
3,3.672953,4.905215,1
4,1.580321,2.440562,1
...,...,...,...
395,2.884122,3.271748,1
396,7.290855,9.488672,0
397,7.895325,8.272529,0
398,2.690592,2.674979,1


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Med_1_mL       400 non-null    float64
 1   Med_2_mL       400 non-null    float64
 2   Virus_Present  400 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 9.5 KB


In [5]:
# Count the missing values in each column.
df.isna().sum()

Med_1_mL         0
Med_2_mL         0
Virus_Present    0
dtype: int64

# X and y

In [6]:
# Features: every column except the target, passed through get_dummies so any
# categorical column would be one-hot encoded (first level dropped).
feature_frame = df.drop(columns='Virus_Present')
X = pd.get_dummies(feature_frame, drop_first=True)

# Target: the Virus_Present label column.
y = df['Virus_Present']

# Train/Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is spelled as the integer 1
# so it is obviously a seed; a bare `True` is the same value to Python but
# reads like an on/off switch.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Modelling and Evaluation

In [8]:
# Random Forest classifier with default parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Integer seed so the fitted forest is reproducible from run to run.
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

# Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation: scikit-learn metrics take (y_true, y_pred), in that order.
print("Train accuracy : ", accuracy_score(y_train, ypred_train))
print("Test accuracy : ", accuracy_score(y_test, ypred_test))

# Mean accuracy over 5 cross-validation folds of the full dataset.
print("cross_val_score : ", cross_val_score(model, X, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  1.0
cross_val_score :  1.0


# Hyperparameter Tuning

In [9]:
from sklearn.model_selection import GridSearchCV

# Base model; integer seed so every candidate forest is reproducible.
estimator = RandomForestClassifier(random_state=1)

# Candidate values to search over: forests of 1 through 9 trees.
param_grid = {"n_estimators": list(range(1, 10))}

# Score every candidate by 5-fold cross-validated accuracy on the training split.
grid = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

# Parameter combination with the best mean cross-validated score.
grid.best_params_

{'n_estimators': 5}

# Importance of each feature

In [10]:
# Feature importances of the best estimator found by the grid search,
# one value per feature column it was fitted on.
grid.best_estimator_.feature_importances_


array([0.6054084, 0.3945916])

In [11]:
# Tabulate the best estimator's importances, labelled by feature name.
feats_imp = pd.DataFrame(
    {'Feature Importance': grid.best_estimator_.feature_importances_},
    index=X.columns,
)

# Keep only the features that contributed at all (importance strictly above zero).
has_importance = feats_imp['Feature Importance'] > 0
important_feats = feats_imp.loc[has_importance]

important_features_list = important_feats.index.tolist()
important_features_list

['Med_1_mL', 'Med_2_mL']

# Final Random Forest model
**with the best hyperparameters (from tuning) AND the important features**

In [12]:
# Restrict the feature matrix to the features selected as important.
X_imp = X[important_features_list]

# Train_Test_Split on the selected features (not the full X), same seed and
# test fraction as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X_imp, y, test_size=0.2, random_state=1)

# Model with the best hyperparameters found by the grid search AND the
# important features; seeded so the result is reproducible.
final_rf_model = RandomForestClassifier(random_state=1, **grid.best_params_)
final_rf_model.fit(X_train, y_train)

# Prediction
ypred_train = final_rf_model.predict(X_train)
ypred_test = final_rf_model.predict(X_test)

# Evaluation: scikit-learn metrics take (y_true, y_pred), in that order.
# accuracy_score and cross_val_score were imported in the baseline-model cell.
print("Train accuracy : ", accuracy_score(y_train, ypred_train))
print("Test accuracy : ", accuracy_score(y_test, ypred_test))

# Cross-validate the final model itself on the selected features.
print("cross_val_score : ", cross_val_score(final_rf_model, X_imp, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  1.0
cross_val_score :  1.0
