In [5]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from sklearn import svm
from sklearn.metrics import roc_auc_score , accuracy_score , precision_score, recall_score ,confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pickle
import h5py
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline

In [None]:
# Current working directory of the kernel; the figure, model and CSV output
# paths in this notebook are built by joining onto it.
BASE_DIR = os.getcwd()

In [2]:
# Keep cell output readable: hide FutureWarning explicitly, then every other
# warning category as well.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [None]:
# Load the raw recordings table (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='data/Epileptic Seizure Recognition.csv')

In [None]:
# Preview the first rows of the raw table
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
data.describe()

In [None]:
# (rows, columns) of the raw table
data.shape

### Data Inspection and Visualization

In [None]:
# Summary of the object-typed (non-numeric) columns only
data.describe(include=object)

In [None]:
# Number of missing entries in each column, displayed as a plain array
null_values = data.isna().sum(axis=0)
null_values.to_numpy()

In [None]:
# Independent (deep) copy so later edits never touch the raw table
data_1 = data.copy(deep=True)

In [None]:
# Feature-only view used for the correlation heat map: everything except the
# 'Unnamed' identifier column and the 'y' label.
# Rebuilt from `data` rather than dropped in place, so re-running this cell is
# safe (an in-place drop raises KeyError the second time it runs).
data_1 = data.drop(columns=['Unnamed', 'y'])

In [None]:
# One stacked panel per selected column; the panels share both axes so the
# traces can be compared directly.
labels = ["X20", "X40", "X60", "X80", "X100"]
colors = ["b", "g", "k", "r", "y"]

fig, axs = plt.subplots(len(labels), sharex=True, sharey=True, figsize=(18, 24))
fig.suptitle('Visual representation of different channels when stacked independently', fontsize=20)

for ax, channel, color in zip(axs, labels, colors):
    # x values come from the first column of the raw table
    ax.plot(data.iloc[:, 0], data[channel], color=color, label=channel)
    ax.legend(loc="upper right")

# pyplot calls below act on the current (last created) axes; sharex propagates the ticks
plt.xlabel('total number of observation', fontsize=20)
x_ticks = list(range(0, 13000, 1000))
x_ticklabels = [str(tick) for tick in x_ticks]
plt.xticks(x_ticks, x_ticklabels)

plt.savefig(os.path.join(BASE_DIR, 'static/assets/img', 'independent_channel.png'))
plt.show()

In [None]:
# Overlay every 25th column on a single axes.
# Note: the rcParams change is global and also sizes the figures created later.
plt.rcParams["figure.figsize"] = (20, 10)

stacked_ax = data.iloc[:, ::25].plot()
stacked_ax.set_title("Visual representation different channels when stacked against each other")
stacked_ax.set_xlabel("total number of values of x")
stacked_ax.set_ylabel("range of values of y")

plt.savefig(os.path.join(BASE_DIR, 'static/assets/img', 'stacked_channels.png'))
plt.show()

In [None]:
# Pairwise correlation of the feature columns, drawn on a diverging colour
# scale fixed to [-1, 1] and centred on zero.
corr = data_1.corr()
ax = sns.heatmap(corr, cmap='coolwarm', center=0, vmin=-1, vmax=1)
ax.set_title("Heat Map")
plt.savefig(os.path.join(BASE_DIR, 'static/assets/img', 'heat_map.png'))
plt.show()

### Solve Class Imbalance

In [None]:
# Working copy for modelling: all columns except the 'Unnamed' identifier
data_2 = data.drop(columns=["Unnamed"]).copy()

In [None]:
# Label distribution before the classes are merged
data_2['y'].value_counts()

In [None]:
# Collapse labels 2-5 into a single class 0; label 1 is left unchanged
data_2['y'] = data_2['y'].replace({2: 0, 3: 0, 4: 0, 5: 0})

In [None]:
# Label distribution after merging classes 2-5 into 0
data_2['y'].value_counts()

In [None]:
# Preview the modelling table after the label merge
data_2.head()

In [None]:
# Scatter of the first two feature columns, coloured by the binary label
plt.figure(figsize=(10, 6), dpi=300)
sns.despine(left=True)

scatter_ax = sns.scatterplot(
    data=data_2,
    x='X1',
    y='X2',
    hue='y',
    palette=['red', 'blue'],
)
scatter_ax.set_title('Distribution of Labels')
scatter_ax.legend(loc='upper left', title="labels")

plt.savefig(os.path.join(BASE_DIR, 'static/assets/img', 'scatter_plot.png'))
plt.show()

In [None]:
# Class counts going into resampling
data_2['y'].value_counts()

In [None]:
# Split the table into the feature matrix and the label vector
y = data_2['y']
X = data_2.drop(columns=['y'])

In [None]:
# Rebalance the classes with SMOTE over-sampling followed by Edited Nearest
# Neighbours cleaning, printing the class counts before and after.
counter = Counter(y)
print('Before', counter)

# Fixed seed so the resampled data (and every score computed from it) is
# reproducible across kernel restarts.
# NOTE(review): resampling happens on the full dataset, before the
# train/validation/test split, so synthetic neighbours of held-out samples can
# end up in the training set and the reported scores are likely optimistic.
# Consider resampling the training split only.
smenn = SMOTEENN(random_state=42)
X_train1, y_train1 = smenn.fit_resample(X, y)

counter = Counter(y_train1)
print('After', counter)

In [None]:
# Persist the resampled features and labels as CSV files for the Django app
for frame, csv_name in ((X_train1, 'data.csv'), (y_train1, 'labels.csv')):
    frame.to_csv(os.path.join(BASE_DIR, 'data', csv_name), index=False)

### Train/Test/Validation Dataset Splitting


In [None]:
# Hold out 20% of the resampled data as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    X_train1,
    y_train1,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Carve a validation set out of the remaining data: 25% of the 80% left after
# the test split, i.e. 20% of the total.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
)

### Feature Scaling

In [None]:
# Standardise the features. The scaler statistics are learned from the
# training split only and then applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report the size of each split
for split_name, split in (("training", X_train), ("testing", X_test), ("validation", X_val)):
    print(f"The shape of the {split_name} set is :{split.shape}")

### Logistic Regression

In [None]:
# Fit a logistic-regression baseline with default hyper-parameters
logreg = LogisticRegression().fit(X_train, y_train)
logreg

In [None]:
# Pickle the fitted logistic-regression model into the model directory
lg_filename = 'LogisticRegressionModel.pickle'
lg_path = os.path.join(BASE_DIR, "model", lg_filename)
with open(lg_path, 'wb') as model_file:
    pickle.dump(logreg, model_file)

In [None]:

# Validation-set predictions and accuracy for logistic regression
y_pred = logreg.predict(X_val)
val_accuracy_pct = metrics.accuracy_score(y_val, y_pred) * 100
print(f"The accuracy score of the model on the validation data is:{val_accuracy_pct:.2f}.")

In [None]:
# Start the model-comparison table with the logistic-regression row.
# All metrics are validation-set percentages formatted to two decimals.
models_dataframe = pd.DataFrame([
    {
        'Model': 'Logistic Regression',
        'Score': f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}',
        'Precision': f'{metrics.precision_score(y_val, y_pred)*100:.2f}',
        'F1_Score': f'{metrics.f1_score(y_val, y_pred)*100:.2f}',
        'Recall': f'{metrics.recall_score(y_val, y_pred)*100:.2f}',
        'View': 'LogisticView',
        'SavedModelName': f'{lg_filename}',
    }
])

In [None]:
# ROC curve and AUC for logistic regression on the validation set.
# Uses the predicted probability of class 1 rather than hard 0/1 labels: hard
# labels collapse the curve to a single operating point and understate AUC.
# This also matches how the SVM curve is computed.
logit_probs = logreg.predict_proba(X_val)[:, 1]
logit_fpr, logit_tpr, thresholds = metrics.roc_curve(y_val, logit_probs)
logit_auc = metrics.roc_auc_score(y_val, logit_probs)

### KNN

In [None]:
# Tune the number of neighbours of a KNN classifier with 5-fold cross-validation
pipe = Pipeline(steps=[('knn', KNeighborsClassifier())])
param_grid = {'knn__n_neighbors': [9, 10, 100]}
knn = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
knn.fit(X_train, y_train)
print('Best hyperparameters:', knn.best_params_)

# Pickle the whole fitted search object (it predicts with the best estimator)
knn_filename = 'KNNModel.pickle'
knn_path = os.path.join(BASE_DIR, "model", knn_filename)
with open(knn_path, 'wb') as model_file:
    pickle.dump(knn, model_file)

In [None]:
# Validation-set evaluation of the tuned KNN model
y_pred = knn.predict(X_val)
y_valid_preds = knn.predict_proba(X_val)

# ROC/AUC are computed from the class-1 probability, not from hard labels,
# so the curve has more than one operating point.
knn_fpr, knn_tpr, thresholds = metrics.roc_curve(y_val, y_valid_preds[:, 1])
knn_auc = metrics.roc_auc_score(y_val, y_valid_preds[:, 1])

# NOTE: despite its name this variable holds the accuracy in percent;
# the name is kept so existing references keep working.
precision = metrics.accuracy_score(y_pred=y_pred, y_true=y_val) * 100
print(f"Accuracy with K-NN: {precision:.2f}%")

In [None]:
# Validation metrics for KNN, formatted like the other rows of the table
new_row = pd.DataFrame({
    'Model': ['KNN'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred)*100:.2f}'],
    'View': ['KNNView'],
    'SavedModelName': [f'{knn_filename}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

### Support Vector Machine

In [None]:
# Polynomial-kernel SVM; probability=True enables predict_proba for the ROC curve
clf = svm.SVC(
    C=100,
    kernel='poly',
    gamma=0.01,
    probability=True,
)
clf.fit(X_train, y_train)


In [None]:
# Pickle the fitted SVM into the model directory
svm_filename = 'SVMModel.pickle'
svm_path = os.path.join(BASE_DIR, "model", svm_filename)
with open(svm_path, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Validation-set evaluation of the SVM
y_pred = clf.predict(X_val)
# (variable name is historical: the value is the accuracy in percent)
precision = 100 * metrics.accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Accuracy with SVM: {precision:.2f}%")

# ROC/AUC from the predicted probability of class 1
probs = clf.predict_proba(X_val)[:, 1]
svm_fpr, svm_tpr, thresholds = metrics.roc_curve(y_val, probs)
svm_auc = metrics.roc_auc_score(y_val, probs)

In [None]:
# Validation metrics for the SVM, formatted like the other rows of the table
new_row = pd.DataFrame({
    'Model': ['SVM'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred)*100:.2f}'],
    'View': ['SVMView'],
    'SavedModelName': [f'{svm_filename}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

### Naive Bayes Classifier

In [None]:
# Fit a Gaussian Naive Bayes classifier and pickle it into the model directory
naive = GaussianNB().fit(X_train, y_train)

nb_filename = 'NaiveBayesModel.pickle'
nb_path = os.path.join(BASE_DIR, "model", nb_filename)
with open(nb_path, 'wb') as model_file:
    pickle.dump(naive, model_file)


In [None]:
# Validation-set evaluation of Naive Bayes
y_pred = naive.predict(X_val)

# ROC/AUC are computed from the class-1 probability, not from hard labels,
# so the curve has more than one operating point (as for the SVM).
naive_probs = naive.predict_proba(X_val)[:, 1]
naive_fpr, naive_tpr, thresholds = metrics.roc_curve(y_val, naive_probs)
naive_auc = metrics.roc_auc_score(y_val, naive_probs)

print(f'Accuracy with naive is:{metrics.accuracy_score(y_pred=y_pred, y_true=y_val) * 100:.2f}%.')

In [None]:
# Validation metrics for Naive Bayes, formatted like the other rows of the table
new_row = pd.DataFrame({
    'Model': ['Naive Bayes'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred)*100:.2f}'],
    'View': ['NaiveBayesView'],
    # Must point at the Naive Bayes pickle; this row previously recorded the
    # SVM file name, so consumers would have loaded the wrong model.
    'SavedModelName': [f'{nb_filename}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

### Random Forest Classifier

In [None]:
# Fit a depth-limited random forest (seeded) and pickle it into the model directory
random = RandomForestClassifier(random_state=69, max_depth=10).fit(X_train, y_train)

rf_filename = 'RandomForestModel.pickle'
rf_path = os.path.join(BASE_DIR, "model", rf_filename)
with open(rf_path, 'wb') as model_file:
    pickle.dump(random, model_file)


In [None]:
# Validation-set evaluation of the random forest
y_pred = random.predict(X_val)

# ROC/AUC are computed from the class-1 probability, not from hard labels,
# so the curve has more than one operating point (as for the SVM).
random_probs = random.predict_proba(X_val)[:, 1]
random_fpr, random_tpr, thresholds = metrics.roc_curve(y_val, random_probs)
random_auc = metrics.roc_auc_score(y_val, random_probs)

# NOTE: despite its name this variable holds the accuracy in percent;
# the name is kept so existing references keep working.
precision = metrics.accuracy_score(y_pred=y_pred, y_true=y_val) * 100
print(f"Accuracy of the model by using the random forest algorithm : {precision:.2f}%")

In [None]:
# Validation metrics for the random forest, formatted like the other rows
new_row = pd.DataFrame({
    'Model': ['Random Forest'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred)*100:.2f}'],
    'View': ['RandomForestView'],
    'SavedModelName': [f'{rf_filename}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

### XGBoost

In [None]:
# Fit an XGBoost classifier with default hyper-parameters
xgbc = XGBClassifier().fit(X_train, y_train)

# Pickle the fitted XGBoost model into the model directory
xg_filename = 'XgBoostModel.pickle'
xg_path = os.path.join(BASE_DIR, "model", xg_filename)
with open(xg_path, 'wb') as model_file:
    pickle.dump(xgbc, model_file)

In [None]:
# Validation-set evaluation of XGBoost
y_pred = xgbc.predict(X_val)

# ROC/AUC are computed from the class-1 probability, not from hard labels,
# so the curve has more than one operating point (as for the SVM).
xgbc_probs = xgbc.predict_proba(X_val)[:, 1]
xgbc_fpr, xgbc_tpr, thresholds = metrics.roc_curve(y_val, xgbc_probs)
xgbc_auc = metrics.roc_auc_score(y_val, xgbc_probs)

# NOTE: despite its name this variable holds the accuracy in percent;
# the name is kept so existing references keep working.
precision = metrics.accuracy_score(y_pred=y_pred, y_true=y_val) * 100
print(f"Accuracy of the model by using the xgbc algorithm : {precision:.2f}%")

In [None]:
# Validation metrics for XGBoost, formatted like the other rows of the table
new_row = pd.DataFrame({
    'Model': ['XgBoost'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred)*100:.2f}'],
    'View': ['XgBoostView'],
    'SavedModelName': [f'{xg_filename}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

### ROC Curve

In [None]:
# Overlay the validation ROC curves of all classical models; each legend
# entry carries the model's AUC.
plt.figure(figsize=(10, 6), dpi=300)
plt.title('ROC Curve')
plt.plot([0, 1], [0, 1], linestyle='--')  # chance diagonal

# (false-positive rates, true-positive rates, line colour, legend text)
roc_curves = [
    (logit_fpr, logit_tpr, 'c', 'logit = %0.3f' % logit_auc),
    (svm_fpr, svm_tpr, 'b', 'SVM = %0.3f' % svm_auc),
    (knn_fpr, knn_tpr, 'g', 'K-NN = %0.3f' % knn_auc),
    (naive_fpr, naive_tpr, 'm', 'naive = %0.3f' % naive_auc),
    (random_fpr, random_tpr, 'k', 'Random Forest = %.3f' % random_auc),
    (xgbc_fpr, xgbc_tpr, 'y', 'XGBoost = %.3f' % xgbc_auc),
]
for fpr, tpr, line_color, legend_text in roc_curves:
    plt.plot(fpr, tpr, line_color, marker='.', label=legend_text)

plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.savefig(os.path.join(BASE_DIR, 'static/assets/img', 'roc_curve.png'), dpi=100)
plt.show()

### Deep Learning

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision, Recall

In [None]:
# Stop training once validation loss has not improved by at least 0.001 for
# 20 consecutive epochs, then roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # Keras default, spelled out for readability
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

In [None]:
# Small CNN over the 178-value input vector: the vector is reshaped to a
# (178, 1, 1) "image", passed through two conv/pool stages that slide along
# the first axis only, then a dense head with a sigmoid output for the binary label.
model1 = Sequential([
    layers.Reshape((178, 1, 1), input_shape=(178,)),
    layers.Conv2D(filters=32, kernel_size=(10, 1), activation='relu', input_shape=(178, 1, 1)),
    layers.MaxPooling2D(pool_size=(3, 1)),
    layers.Conv2D(filters=64, kernel_size=(10, 1), activation='relu'),
    layers.MaxPooling2D(pool_size=(3, 1)),
    layers.Flatten(),
    layers.Dense(units=64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(units=1, activation='sigmoid'),
])

# Layer-by-layer overview with output shapes and parameter counts
model1.summary()

In [None]:
# Binary cross-entropy with Adam; accuracy, precision and recall are tracked
# in that order (the evaluation cell unpacks them in the same order).
model1.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall()],
)

# Train for up to 500 epochs; early stopping on the validation split normally
# ends the run sooner.
model1.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=500,
    callbacks=[early_stopping],
)

In [None]:
# Save the trained network in HDF5 format next to the pickled models
cnn_model_name = 'DeepLearning.h5'
cnn_model_path = os.path.join(BASE_DIR, 'model', cnn_model_name)
model1.save(cnn_model_path)

In [None]:
# Score the network on the held-out test split; values come back in the order
# loss, then the metrics passed to compile().
test_loss, test_acc, test_precision, test_recall = model1.evaluate(X_test, y_test)

for metric_name, metric_value in (('loss', test_loss), ('accuracy', test_acc)):
    print(f'Test {metric_name}:', metric_value)

In [None]:
# Sigmoid outputs of the network for every validation sample
predictions = model1.predict(x=X_val)
print("Predictions", predictions)

In [None]:
# Turn sigmoid outputs into hard labels: strictly above 0.5 -> 1, otherwise 0.
# The result stays a plain list of Python ints.
y_pred_labels = (predictions.ravel() > 0.5).astype(int).tolist()

In [None]:
# Validation metrics for the CNN. Every other row in the table is computed on
# the validation split, so the CNN row is too (it previously mixed test-set
# accuracy/precision/recall with a validation-set F1, which made the row
# internally inconsistent and not comparable with the others).
new_row = pd.DataFrame({
    'Model': ['CNN'],
    'Score': [f'{metrics.accuracy_score(y_val, y_pred_labels)*100:.2f}'],
    'Precision': [f'{metrics.precision_score(y_val, y_pred_labels)*100:.2f}'],
    'F1_Score': [f'{metrics.f1_score(y_val, y_pred_labels)*100:.2f}'],
    'Recall': [f'{metrics.recall_score(y_val, y_pred_labels)*100:.2f}'],
    'View': ['CNNView'],
    'SavedModelName': [f'{cnn_model_name}']
})
# Replace any earlier row for this model so re-running the cell does not
# create duplicates. pd.concat is used because DataFrame.append was removed in
# pandas 2.0; the previous isin()-based guard aligned on the index and
# therefore never detected an existing row.
models_dataframe = pd.concat(
    [models_dataframe[~models_dataframe['Model'].isin(new_row['Model'])], new_row],
    ignore_index=True,
)

In [None]:
# Rank models by accuracy. 'Score' holds formatted strings, so it is compared
# as floats; a plain string sort would rank '100.00' below '99.50'.
print(models_dataframe.sort_values('Score', key=lambda col: col.astype(float), ascending=False))

### Saving the model comparison table later used by Django

In [None]:
# Write the comparison table, best model first. 'Score' holds formatted
# strings, so it is sorted by numeric value rather than lexicographically.
models_dataframe.sort_values(
    'Score', key=lambda col: col.astype(float), ascending=False
).to_csv(os.path.join(BASE_DIR, 'data', 'model_acc_dataframe.csv'), index=False)

### Testing the model

In [None]:
# Select the rows whose 'Unnamed' identifier has '941' as its third
# dot-separated field; the label column is removed first.
temp_data = data.drop(columns=['y']).copy()
third_field = temp_data['Unnamed'].str.split('.').str[2]
temp_data1 = temp_data.loc[third_field == '941'].copy()
temp_data1

In [None]:
# Scale the selected rows with the scaler fitted on the training split, then
# get the network's sigmoid output for each row.
data_x = scaler.transform(temp_data1.drop(columns=['Unnamed']))
pre = model1.predict(data_x)
pre

In [None]:
# Share of positive rows needed to call the whole selection positive
threshold = 0.5

# Per-row decision: a sigmoid output strictly above 0.5 counts as a positive row
binary_predictions = [1 if pr > 0.5 else 0 for pr in pre]
print(binary_predictions)

if not binary_predictions:
    # No rows were selected: the mean of an empty list is NaN, which would
    # silently fall through to a "no epilepsy" verdict. Report it instead.
    predicted_class = None
    output_string = "No recordings were found, so no prediction could be made."
else:
    # Majority-style vote: positive when at least `threshold` of the rows are positive
    predicted_class = 1 if np.mean(binary_predictions) >= threshold else 0
    if predicted_class == 1:
        output_string = "The patient is predicted to have epilepsy."
    else:
        output_string = "The patient is predicted to not have epilepsy."

print(predicted_class)
print(output_string)