Here we have checked for datasets which contains data for two microbes, thereby diseases. We have collected the data for E-Coli and HIV. The active inhibitors are labelled as 1 and 2 respectively. The inactive ones are labelled as -1.


In [2]:
# Importing different libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# sklearn library
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [4]:
# Matrix reading from the files
def read_file(txt_file, mat_a):
    file = open(txt_file, "r")
    for line in file:
        ns = line.split()
        no = [float(n) for n in ns]
        mat_a.append(no)
    file.close()
X = []
Y = []
read_file("E-coli_hiv_inhibitor.txt", X)
read_file("Activity on E-coli_hiv.txt", Y)


In [5]:
# Creating training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Standardizing the data
ss=StandardScaler()
X_train_st=ss.fit_transform(X_train)
X_test_st=ss.fit_transform(X_test)

In [6]:
#Decision Tree Clasifier

# Training Decision tree model
DTClassifier=DecisionTreeClassifier(criterion='entropy', random_state=42)
DTClassifier.fit(X_train,Y_train)

# Testing
Y_pred = DTClassifier.predict(X_test)

# Accuracy of the model
#
print('The accuracy of decision tree is: ', metrics.accuracy_score(Y_pred,Y_test))

# Different f1 scores
print('F1 Score with macro average: ',f1_score(Y_test, Y_pred, average='macro'))
print('F1 Score with micro average: ',f1_score(Y_test, Y_pred, average='micro'))
print('F1 Score with weighted average: ',f1_score(Y_test, Y_pred, average='weighted'))

The accuracy of decision tree is:  0.8764607679465777
F1 Score with macro average:  0.5608796296296296
F1 Score with micro average:  0.8764607679465777
F1 Score with weighted average:  0.8718736474370864


In [7]:
Y_train_np = np.array(Y_train)

#Naive Bayes Classifier
# Training the model
NBClassifier=GaussianNB()
NBClassifier.fit(X_train_st, Y_train_np.ravel())

# Testing
Y_pred1 = NBClassifier.predict(X_test)

# Accuracy of the model
print('The accuracy of Naive baise is: ', metrics.accuracy_score(Y_pred1, Y_test))

# Different f1 scores
print('F1 Score with macro average: ',f1_score(Y_test, Y_pred1, average='macro'))
print('F1 Score with micro average: ',f1_score(Y_test, Y_pred1, average='micro'))
print('F1 Score with weighted average: ',f1_score(Y_test, Y_pred1, average='weighted'))

The accuracy of Naive baise is:  0.8931552587646077
F1 Score with macro average:  0.31452087007642565
F1 Score with micro average:  0.8931552587646077
F1 Score with weighted average:  0.8427479072999384


In [8]:
#Random Forest Classifier Algorithm

# Training the model
model = RandomForestClassifier(n_estimators=10, random_state=25)
model.fit(X_train, Y_train_np.ravel())

# Testing
Y_pred2 = model.predict(X_test)

# Accuracy of the model
print('The accuracy of Random Forest Classifier is: ', metrics.accuracy_score(Y_pred2,Y_test))

# Different f1 scores
print('F1 Score with macro average: ',f1_score(Y_test, Y_pred2, average='macro'))
print('F1 Score with micro average: ',f1_score(Y_test, Y_pred2, average='micro'))
print('F1 Score with weighted average: ',f1_score(Y_test, Y_pred2, average='weighted'))

The accuracy of Random Forest Classifier is:  0.9248747913188647
F1 Score with macro average:  0.6893328805238291
F1 Score with micro average:  0.9248747913188647
F1 Score with weighted average:  0.9158002859431743


In [13]:
#SVM 

# Training the model
classifier_object = SVC(kernel ='linear', gamma='auto', C=2)
classifier_object = SVC(probability=True)
classifier_object.fit(X_train, Y_train_np.ravel())

# Testing 
Y_pred3 = classifier_object.predict(X_test)

# Accuracy of the model
print('The accuracy of Support Vector Classifier is: ', metrics.accuracy_score(Y_pred3,Y_test))

# Different f1 scores
print('F1 Score with macro average: ',f1_score(Y_test, Y_pred3, average='macro'))
print('F1 Score with micro average: ',f1_score(Y_test, Y_pred3, average='micro'))
print('F1 Score with weighted average: ',f1_score(Y_test, Y_pred3, average='weighted'))

The accuracy of Support Vector Classifier is:  0.9081803005008348
F1 Score with macro average:  0.488725185964723
F1 Score with micro average:  0.9081803005008348
F1 Score with weighted average:  0.8774386338150225


In [11]:
# SGD model

#Training the model
classifier_Sgd = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3))
classifier_Sgd.fit(X_train, Y_train_np.ravel())

#Testing the model
Y_pred4 = classifier_Sgd.predict(X_test)

# Accuracy of the model
print('The accuracy of Stochastic Gradient Descent Classifier is: ', metrics.accuracy_score(Y_pred4,Y_test))

# Different f1 scores
print('F1 Score with macro average: ',f1_score(Y_test, Y_pred4, average='macro'))
print('F1 Score with micro average: ',f1_score(Y_test, Y_pred4, average='micro'))
print('F1 Score with weighted average: ',f1_score(Y_test, Y_pred4, average='weighted'))

The accuracy of Stochastic Gradient Descent Classifier is:  0.8747913188647746
F1 Score with macro average:  0.5268510563207118
F1 Score with micro average:  0.8747913188647746
F1 Score with weighted average:  0.8684764846766214
