In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import joblib

# Load the raw table. Column 1 supplies the class label and every column from
# index 2 onward is used as a feature; column 0 is left out of both.
dataset = pd.read_csv('data.csv')
Y = dataset.iloc[:, 1].values
X = dataset.iloc[:, 2:].values

# Replace the class labels with integer codes. The fitted encoder stays
# available as labelencoder_Y so the mapping can be reused later.
labelencoder_Y = LabelEncoder().fit(Y)
Y = labelencoder_Y.transform(Y)

# Hold out 25% of the rows as the test set; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# Rebalance the training split only; the test split is never resampled.
# Two independent variants are derived from the same X_train / Y_train.

# Variant 1: random undersampling.
# NOTE(review): the undersampled arrays are neither scaled nor used for
# training in the code visible here -- confirm whether anything consumes them.
undersampler = RandomUnderSampler(random_state=0)
X_train_undersampled, Y_train_undersampled = undersampler.fit_resample(
    X_train, Y_train
)

# Variant 2: SMOTE oversampling. This is the variant the models are trained on.
smote = SMOTE(random_state=0)
X_train_oversampled, Y_train_oversampled = smote.fit_resample(X_train, Y_train)

# Standardise the features. The scaler is fitted on the oversampled training
# data and the same fitted scaler is then applied to the test set.
# NOTE(review): SMOTE above runs on unscaled features, and the scaler's
# statistics include the synthetic rows. Consider scaling before resampling;
# doing so would change the saved scaler and every reported score.
sc = StandardScaler().fit(X_train_oversampled)
X_train_oversampled = sc.transform(X_train_oversampled)
# X_test is overwritten with its scaled version, so re-running this step
# without redoing the split would scale it a second time.
X_test = sc.transform(X_test)

# Fit seven classifiers on the same scaled, SMOTE-oversampled training data.
# Each estimator is built and fitted in a single expression; fit() returns the
# estimator itself, so every name below ends up bound to a trained model.

# Logistic regression
logistic_regression = LogisticRegression(random_state=0).fit(
    X_train_oversampled, Y_train_oversampled
)

# k-nearest neighbours: 5 neighbours, Minkowski metric with p=2 (Euclidean)
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(
    X_train_oversampled, Y_train_oversampled
)

# Support vector machine, linear kernel
svm_linear = SVC(kernel='linear', random_state=0).fit(
    X_train_oversampled, Y_train_oversampled
)

# Support vector machine, RBF kernel
svm_rbf = SVC(kernel='rbf', random_state=0).fit(
    X_train_oversampled, Y_train_oversampled
)

# Gaussian naive Bayes
naive_bayes = GaussianNB().fit(X_train_oversampled, Y_train_oversampled)

# Decision tree split on entropy
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(
    X_train_oversampled, Y_train_oversampled
)

# Random forest: 10 entropy-split trees
random_forest = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0
).fit(X_train_oversampled, Y_train_oversampled)

# Evaluate every trained model on the held-out test set. For each model, in
# turn: predict the test labels, build the confusion matrix against Y_test,
# and compute the accuracy.
# NOTE(review): the cm_* matrices are computed but not displayed or saved in
# the code visible here -- confirm whether they are used elsewhere.

# Logistic regression
Y_pred_logistic_regression = logistic_regression.predict(X_test)
cm_logistic_regression = confusion_matrix(Y_test, Y_pred_logistic_regression)
accuracy_logistic_regression = accuracy_score(Y_test, Y_pred_logistic_regression)

# k-nearest neighbours
Y_pred_knn = knn.predict(X_test)
cm_knn = confusion_matrix(Y_test, Y_pred_knn)
accuracy_knn = accuracy_score(Y_test, Y_pred_knn)

# SVM, linear kernel
Y_pred_svm_linear = svm_linear.predict(X_test)
cm_svm_linear = confusion_matrix(Y_test, Y_pred_svm_linear)
accuracy_svm_linear = accuracy_score(Y_test, Y_pred_svm_linear)

# SVM, RBF kernel
Y_pred_svm_rbf = svm_rbf.predict(X_test)
cm_svm_rbf = confusion_matrix(Y_test, Y_pred_svm_rbf)
accuracy_svm_rbf = accuracy_score(Y_test, Y_pred_svm_rbf)

# Gaussian naive Bayes
Y_pred_naive_bayes = naive_bayes.predict(X_test)
cm_naive_bayes = confusion_matrix(Y_test, Y_pred_naive_bayes)
accuracy_naive_bayes = accuracy_score(Y_test, Y_pred_naive_bayes)

# Decision tree
Y_pred_decision_tree = decision_tree.predict(X_test)
cm_decision_tree = confusion_matrix(Y_test, Y_pred_decision_tree)
accuracy_decision_tree = accuracy_score(Y_test, Y_pred_decision_tree)

# Random forest
Y_pred_random_forest = random_forest.predict(X_test)
cm_random_forest = confusion_matrix(Y_test, Y_pred_random_forest)
accuracy_random_forest = accuracy_score(Y_test, Y_pred_random_forest)

# Persist each trained model, plus the fitted scaler, to its own .joblib file
# in the current working directory. The scaler is saved alongside the models
# because new inputs must be transformed with it before being passed to them.
for artifact, artifact_path in (
    (logistic_regression, 'logistic_regression_model.joblib'),
    (knn, 'knn_model.joblib'),
    (svm_linear, 'svm_linear_model.joblib'),
    (svm_rbf, 'svm_rbf_model.joblib'),
    (naive_bayes, 'naive_bayes_model.joblib'),
    (decision_tree, 'decision_tree_model.joblib'),
    (random_forest, 'random_forest_model.joblib'),
    (sc, 'scaler.joblib'),
):
    joblib.dump(artifact, artifact_path)

# Report the test-set accuracy of every model as a percentage with two
# decimals, one model per line (a single print call joined by newlines).
print(
    "Logistic Regression Accuracy: {:.2f}%".format(accuracy_logistic_regression * 100),
    "K-NN Accuracy: {:.2f}%".format(accuracy_knn * 100),
    "SVM (Linear) Accuracy: {:.2f}%".format(accuracy_svm_linear * 100),
    "SVM (RBF) Accuracy: {:.2f}%".format(accuracy_svm_rbf * 100),
    "Naive Bayes Accuracy: {:.2f}%".format(accuracy_naive_bayes * 100),
    "Decision Tree Accuracy: {:.2f}%".format(accuracy_decision_tree * 100),
    "Random Forest Accuracy: {:.2f}%".format(accuracy_random_forest * 100),
    sep="\n",
)

Logistic Regression Accuracy: 94.41%
K-NN Accuracy: 95.10%
SVM (Linear) Accuracy: 93.71%
SVM (RBF) Accuracy: 97.90%
Naive Bayes Accuracy: 92.31%
Decision Tree Accuracy: 89.51%
Random Forest Accuracy: 95.80%
