# Import Libraries

In [3]:
import pandas as pd
import numpy as np

# Data Visualization Library 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn theme first, then the matplotlib overrides, in the same order as before.
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})

# Data Preparation 


In [7]:
!pip install dataprep

Collecting dataprep
  Using cached dataprep-0.4.3-py3-none-any.whl (9.5 MB)
Collecting jsonpath-ng<2.0,>=1.5
  Using cached jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)
Collecting pydantic<2.0,>=1.6
  Using cached pydantic-1.9.1-cp39-cp39-win_amd64.whl (2.0 MB)
Collecting python-Levenshtein<0.13.0,>=0.12.2
  Using cached python-Levenshtein-0.12.2.tar.gz (50 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting aiohttp<4.0,>=3.6
  Using cached aiohttp-3.8.1-cp39-cp39-win_amd64.whl (554 kB)
Collecting sqlalchemy<2.0.0,>=1.4.32
  Using cached SQLAlchemy-1.4.37-cp39-cp39-win_amd64.whl (1.6 MB)
Collecting varname<0.9.0,>=0.8.1
  Using cached varname-0.8.3-py3-none-any.whl (21 kB)
Collecting flask<3,>=2
  Using cached Flask-2.1.2-py3-none-any.whl (95 kB)
Collecting numpy<2.0,>=1.21
  Using cached numpy-1.22.4-cp39-cp39-win_amd64.whl (14.7 MB)
Collecting wordcloud<2.0,>=1.8
  Using cached wordcloud-1.8.1.tar.gz (220 kB)
  Preparing 

  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [27 lines of output]
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-3.9
      creating build\lib.win-amd64-3.9\Levenshtein
      copying Levenshtein\StringMatcher.py -> build\lib.win-amd64-3.9\Levenshtein
      copying Levenshtein\__init__.py -> build\lib.win-amd64-3.9\Levenshtein
      running egg_info
      writing python_Levenshtein.egg-info\PKG-INFO
      writing dependency_links to python_Levenshtein.egg-info\dependency_links.txt
      writing entry points to python_Levenshtein.egg-info\entry_points.txt
      writing namespace_packages to python_Levenshtein.egg-info\namespace_packages.txt
      writing requirements to python_Levenshtein.egg-info\requires.txt
      writing top-level names to python_Levenshtein.egg-info\top_level.txt
      reading manifest file 'python_Levenshtein.egg-i

In [5]:
# Data Preparation 
from dataprep.eda import *
from dataprep.eda.missing import plot_missing
from dataprep.eda import plot_correlation

ModuleNotFoundError: No module named 'dataprep'

# Data Analysis

In [None]:
# Load the Lassa dataset; the bare name on the last line renders the frame as the cell output.
lassa_data = pd.read_csv(filepath_or_buffer='/Lassa Dataset.csv')
lassa_data

In [None]:
# Column dtypes and non-null counts.
lassa_data.info()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
lassa_data.describe(include='all')

In [None]:
# Exact column names; later cells index by these strings verbatim.
lassa_data.columns

# Finding Missing Value

In [None]:
# Missing-value report from dataprep.
# NOTE(review): plot_missing comes from the dataprep import cell, which raised
# ModuleNotFoundError in the recorded run; this cell fails until that import works.
plot_missing(lassa_data)

In [None]:
# Per-column count and percentage of missing values, shown side by side.
missing_values = lassa_data.isnull().sum()
percent_missing = missing_values / len(lassa_data) * 100

# Keys become the column headers of the summary table.
value = dict([
    ('missing_values ', missing_values),
    ('percent_missing %', percent_missing),
])
frame = pd.DataFrame(value)
frame

The dataset contains no missing values.

In [None]:
def msv_1(df, thresh = 20, color = 'black', edgecolor = 'black', height = 3, width = 15):
    """Draw a bar chart of the percentage of missing values per column.

    Bars are sorted from most to least missing. A red horizontal line marks
    ``thresh`` percent, with one caption just above it and one just below.

    Parameters
    ----------
    df : DataFrame whose columns are inspected.
    thresh : threshold, in percent, where the reference line is drawn.
    color, edgecolor : forwarded to the bar plot.
    height, width : figure size in inches.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    plt.figure(figsize=(width, height))

    missing_pct = df.isnull().mean() * 100
    missing_pct.sort_values(ascending=False).plot.bar(color=color, edgecolor=edgecolor)
    plt.axhline(y=thresh, color='r', linestyle='-')

    plt.title('Missing values percentage per column', fontsize=20, weight='bold' )

    # Both captions share one x position derived from the number of columns.
    caption_x = len(df.columns) / 1.7
    captions = (
        (thresh + 2.5, f'Columns with more than {thresh}% missing values', 'crimson'),
        (thresh - 0.5, f'Columns with less than {thresh}% missing values', 'green'),
    )
    for caption_y, caption_text, caption_color in captions:
        plt.text(caption_x, caption_y, caption_text, fontsize=12, color=caption_color,
                 ha='left', va='top')

    plt.xlabel('Columns', size=15, weight='bold')
    plt.ylabel('Missing values percentage')
    plt.yticks(weight='bold')

    return plt.show()
msv_1(lassa_data, 20, color=sns.color_palette('Reds',15))

# Data Visualization

LASSA FEVER (target)

In [None]:
# Class balance of the target column.
sns.countplot(data=lassa_data, x='Lassa Fever')

In [None]:
# Share of each target value as an exploded pie chart.
target_counts = lassa_data["Lassa Fever"].value_counts()
target_counts.plot.pie(explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True)
plt.title('number of cases');

Breathing Problem

In [None]:
# Distribution of the 'Breathing Problem' answers.
sns.countplot(data=lassa_data, x='Breathing Problem')

In [None]:
# 'Breathing Problem' answers split by the target value.
sns.countplot(data=lassa_data, x='Breathing Problem', hue='Lassa Fever')

Fever

In [None]:
# 'Fever' answers split by the target value.
sns.countplot(data=lassa_data, x='Fever', hue='Lassa Fever');

Dry Cough

In [None]:
# 'Dry Cough' answers split by the target value.
sns.countplot(data=lassa_data, x='Dry Cough', hue='Lassa Fever')

Sore Throat

In [None]:
# 'Sore throat' answers split by the target value (the column name uses a lower-case 't').
sns.countplot(data=lassa_data, x='Sore throat', hue='Lassa Fever')

Vomiting

In [None]:
# 'Vomitting' answers split by the target value (spelling matches the column name).
sns.countplot(data=lassa_data, x='Vomitting', hue='Lassa Fever')

# Feature Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder
# A single encoder instance; the next cell refits it on each column in turn.
label_encoder=LabelEncoder()

In [None]:
# Columns to integer-encode, in processing order; the target 'Lassa Fever' is last.
# Names are used verbatim, including the trailing space in 'Fatigue ' and 'Shock '.
columns_to_encode = [
    'Breathing Problem', 'Fever', 'Dry Cough', 'Sore throat', 'Running Nose',
    'Chest Pain', 'Chronic Lung Disease', 'Headache', 'Heart Disease', 'Diabetes',
    'Hyper Tension', 'Fatigue ', 'Shock ', 'Diarrhoea', 'Vomitting',
    'Hearing Loss', 'Organ Failure', 'Hepatitis', 'Seizures', 'Blue Lips',
    'Lassa Fever',
]

# fit_transform refits the shared encoder on every column, so after the loop
# label_encoder only holds the classes of the last column processed.
for column_name in columns_to_encode:
    lassa_data[column_name] = label_encoder.fit_transform(lassa_data[column_name])

In [None]:
# First rows after encoding.
lassa_data.head()

In [None]:
# How many columns there are of each dtype after encoding.
lassa_data.dtypes.value_counts()

# Info about the data after transformation 

In [None]:
# Summary statistics of the frame after the encoding step above.
lassa_data.describe(include='all')

In [None]:
# One histogram per column on a 20x15 inch figure; the trailing ';' hides the axes repr.
lassa_data.hist(figsize=(20,15));

# Correlation Between Features

In [None]:
# Correlation report from dataprep.
# NOTE(review): plot_correlation comes from the dataprep import cell, which raised
# ModuleNotFoundError in the recorded run; this cell fails until that import works.
plot_correlation(lassa_data)

In [None]:
# Pairwise correlation matrix (DataFrame.corr defaults), rendered as a styled table;
# axis=None applies one colour scale across the whole table instead of per row/column.
corr=lassa_data.corr()
corr.style.background_gradient(cmap='coolwarm',axis=None)

# Feature Selection

**Features that we will drop:**
Seizures / Blue Lips / Hepatitis / Organ Failure / Shock / Heart Disease / Chronic Lung Disease

In [None]:
# Feature selection: drop the seven columns chosen for removal in one call
# (a single new frame instead of seven intermediate copies).
# 'Shock ' keeps its trailing space to match the column name.
lassa_data = lassa_data.drop(columns=[
    'Chronic Lung Disease',
    'Heart Disease',
    'Seizures',
    'Blue Lips',
    'Organ Failure',
    'Hepatitis',
    'Shock ',
])

In [None]:
# Correlation matrix recomputed on the remaining columns; axis=None shades the
# whole table on one colour scale.
corr=lassa_data.corr()
corr.style.background_gradient(cmap='coolwarm',axis=None)

# Machine Learning Algorithm

In [None]:
# Importing the libraries, Libtune to tune model, get different metric scores
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [None]:
# Feature matrix is every column except the target; the target is taken on its own.
y = lassa_data['Lassa Fever']
x = lassa_data.drop(columns='Lassa Fever')

In [None]:
# Splitting the data set into train and test sets (20% held out for testing).
# random_state pins the shuffle, so the split and every metric computed from it
# in the cells below are the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

# Preview the held-out features: first rows with rich display, not a printed dump of the whole frame.
x_test.head()

# HyperParameter Tuning

In [None]:
# (estimator, param_grid) pairs consumed by the grid-search loop in the next cell.
# Each param_grid is a one-element list holding a dict of candidate values.
grid_models = [
    (
        LogisticRegression(),
        [{'C': [0.25, 0.5, 0.75, 1], 'random_state': [0]}],
    ),
    (
        GaussianNB(),
        [{'var_smoothing': [1e-09]}],
    ),
    (
        DecisionTreeClassifier(),
        [{'criterion': ['gini', 'entropy'], 'random_state': [0]}],
    ),
    (
        RandomForestClassifier(),
        [{'n_estimators': [100, 150, 200], 'criterion': ['gini', 'entropy'], 'random_state': [0]}],
    ),
    (
        AdaBoostClassifier(),
        [{'n_estimators': [100, 150, 200], 'learning_rate': [0.1, 0.5, 0.8, 1],
          'algorithm': ['SAMME', 'SAMME.R'], 'random_state': [0]}],
    ),
    (
        GradientBoostingClassifier(),
        [{'n_estimators': [100, 150, 200], 'criterion': ['friedman_mse', 'squared_error'],
          'loss': ['deviance', 'exponential'], 'learning_rate': [0.1, 0.5, 0.8, 1],
          'random_state': [0]}],
    ),
    (
        XGBClassifier(),
        [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}],
    ),
]

In [None]:
# Grid-search each (estimator, grid) pair with 2-fold cross-validation on the
# training split, then print the best accuracy and the parameters behind it.
for estimator, param_grid in grid_models:
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='accuracy', cv=2)
    grid.fit(x_train, y_train)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print(f'{estimator}:\nBest Accuracy : {best_accuracy * 100:.2f}%')
    print('Best Parameters : ', best_param)
    # Blank line, separator, blank line.
    print('\n----------------\n')

# Logistic Regression

In [None]:
# Test accuracy (%) per model name; each model cell adds its entry and the
# comparison bar chart at the end reads it.
accuracies = {}

# Logistic regression with fixed hyperparameters, fitted on the training split.
model = LogisticRegression(random_state=0, C=0.25)
model.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = model.predict(x_test)
y_prob = model.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_logreg = acc
accuracies['Logistic Regression'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_logreg)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(model, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# Random Forest Classifier

In [None]:
# Random forest with fixed hyperparameters, fitted on the training split.
# This fitted model is the one later used for single predictions and pickled to disk.
random_forest_classifier = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=0)
random_forest_classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = random_forest_classifier.predict(x_test)
y_prob = random_forest_classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_ranforclass = acc
accuracies['Random Forest'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_ranforclass)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(random_forest_classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# AdaBoost Classifier Algorithms

In [None]:
# AdaBoost with fixed hyperparameters, fitted on the training split.
# random_state is pinned like the other seeded models so the fit is repeatable.
classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0)
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_adaboost = acc
accuracies['AdaBoost'] = acc

# Confusion matrix of THIS model's predictions. It must be bound to `cm`, the
# name the heatmap below reads; otherwise the heatmap shows the matrix left
# over from whichever model cell ran before.
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_adaboost)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# Gradient Boosting Classifier

In [None]:
# Gradient boosting classifier with fixed hyperparameters, fitted on the training split.
# NOTE: loss='deviance' is the name accepted by the scikit-learn versions this
# notebook targets; newer releases rename it to 'log_loss'.
classifier = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.8, loss='deviance', n_estimators=200, random_state=0)
classifier.fit(x_train, y_train)

# Predict with THIS model. Without this line every metric below that reads
# y_pred (report, confusion matrix, stored accuracy) would describe the
# predictions of whichever model cell ran before.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_gbc = acc
accuracies['Gradient Boosting'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_gbc)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# KNeighborsClassifier

In [None]:
# K-nearest neighbours (k=5, Minkowski metric with p=2), fitted on the training split.
classifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_knn = acc
accuracies['KNeighbors'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_knn)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# DecisionTreeClassifier

In [None]:
# Decision tree (entropy criterion, seeded), fitted on the training split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_decisiontree = acc
accuracies['Decision Tree'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_decisiontree)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# Naive Bayes Algorithm (Gaussian Naive Bayes)

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training split.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_gaussian = acc
accuracies['Naive Bayes'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_gaussian)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()

# Support Vector Machine

In [None]:
# Linear-kernel SVM fitted on the training split; probability=True enables
# predict_proba, which the ROC and precision-recall plots below need.
classifier = SVC(kernel='linear', random_state=0, probability=True)
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_svc = acc
accuracies['Support Vector'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_svc)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()


# XGB Classifier

In [None]:
# XGBoost classifier with fixed hyperparameters, fitted on the training split.
classifier = XGBClassifier(eval_metric='error', learning_rate=0.1)
classifier.fit(x_train, y_train)

# Class predictions and the probability of the class encoded as 1, on the test split.
y_pred = classifier.predict(x_test)
y_prob = classifier.predict_proba(x_test)[:, 1]

# Accuracy is computed once from y_pred and reused for the report, the
# percentage and the comparison dict.
test_accuracy = accuracy_score(y_test, y_pred)
acc = test_accuracy * 100
accuracy_xgb = acc
accuracies['XGB Classifier'] = acc
cm = confusion_matrix(y_test, y_pred)

# Text metrics
print(classification_report(y_test, y_pred))
print('----------------')
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('----------------')
print('Accuracy Score: ', test_accuracy)
print('----------------')
print('Accuracy in Percentage: ', accuracy_xgb)

# Confusion-matrix heatmap
plt.figure(figsize=(6, 6))
print('----------------')
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Negative', 'Positive'],
            xticklabels=['Predicted Negative', 'Predicted Positive'])
plt.yticks(rotation=0)
print(' ')
plt.show()

# ROC curve with the chance diagonal
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(6, 6))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717', label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC AUC Curve')
# One legend call after both lines are drawn, pinned to the lower right.
plt.legend(loc='lower right')
print(' ')
plt.show()

# Precision-recall curve. The current axes are passed explicitly so the curve is
# drawn on this 6x6 figure; without ax= the helper opens its own figure and this
# one renders blank.
# NOTE: plot_precision_recall_curve is removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator replaces it on newer versions.
plt.figure(figsize=(6, 6))
plot_precision_recall_curve(classifier, x_test, y_test, ax=plt.gca())
plt.title('Precision-Recall Curve')
print(' ')
plt.show()


# Sorting the Models

In [None]:
# One (display name, test accuracy %) row per model, shown best first.
models = pd.DataFrame(
    [
        ('Support Vector Machines', accuracy_svc),
        ('KNN', accuracy_knn),
        ('Logistic Regression', accuracy_logreg),
        ('Naive Bayes', accuracy_gaussian),
        ('Decision Tree', accuracy_decisiontree),
        ('Random Forest Classifier', accuracy_ranforclass),
        ('Gradient Boosting Classifier', accuracy_gbc),
        ('XGB Classifier', accuracy_xgb),
        ('AdaBoost Classifier', accuracy_adaboost),
    ],
    columns=['Model', 'Score'],
)
models.sort_values(by='Score', ascending=False)

In [None]:
# One bar colour per entry in `accuracies`.
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE",'#417D7A','#066163','#4D4C7D']

sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
# Ticks every 10 points from 0 up to and including 100; np.arange excludes its
# stop value, so the stop is 101 to keep the 100% tick.
plt.yticks(np.arange(0, 101, 10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
# Bars follow the insertion order of the accuracies dict (one entry per model cell).
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
plt.show()

# Making a Predictive System

In [None]:
# Encoded answers for one person.
# NOTE(review): the values are assumed to follow the column order of x_train; confirm.
input_data = (0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0)

# Wrap the single instance in a one-row DataFrame that carries the training
# column names, so the model receives named features as it did during fit.
# The constructor raises if the number of values differs from the number of
# feature columns.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)

prediction = random_forest_classifier.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('You are not diagnosed with Lassa Fever')
else:
  print('You are diagnosed with Lassa Fever')

## Saving the trained model

In [None]:
import pickle

In [None]:
filename = 'trained_model.sav'
# The context manager closes the file handle even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest_classifier, model_file)

In [None]:
# Loading the saved model; the context manager closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Encoded answers for one person.
# NOTE(review): the values are assumed to follow the column order of x_train; confirm.
input_data = (0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0)

# Wrap the single instance in a one-row DataFrame that carries the training
# column names, so the reloaded model receives named features as it did during
# fit. The constructor raises if the number of values differs from the number
# of feature columns.
input_frame = pd.DataFrame([input_data], columns=x_train.columns)

prediction = loaded_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
  print('You are not diagnosed with Lassa Fever')
else:
  print('You are diagnosed with Lassa Fever')