Eun Ju Jong

In [12]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import os
import pandas as pd
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
np.random.seed(42)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier

### Data Set: Epileptic Seizure Recognition
Attribute Information:

The original dataset from the reference consists of 5 different folders, each with 100 files, with each file representing a single subject/person. Each file is a recording of brain activity for 23.6 seconds. The corresponding time-series is sampled into 4097 data points. Each data point is the value of the EEG recording at a different point in time. So we have total 500 individuals with each has 4097 data points for 23.5 seconds.

We divided and shuffled every 4097 data points into 23 chunks, each chunk contains 178 data points for 1 second, and each data point is the value of the EEG recording at a different point in time. So now we have 23 x 500 = 11500 pieces of information(row), each information contains 178 data points for 1 second(column), the last column represents the label y {1,2,3,4,5}.

The response variable is y in column 179, the Explanatory variables X1, X2, ..., X178

y contains the category of the 178-dimensional input vector. Specifically y in {1, 2, 3, 4, 5}:

5 - eyes open, means when they were recording the EEG signal of the brain the patient had their eyes open

4 - eyes closed, means when they were recording the EEG signal the patient had their eyes closed

3 - Yes they identify where the region of the tumor was in the brain and recording the EEG activity from the healthy brain area

2 - They recorder the EEG from the area where the tumor was located

1 - Recording of seizure activity

All subjects falling in classes 2, 3, 4, and 5 are subjects who did not have epileptic seizure. Only subjects in class 1 have epileptic seizure. Our motivation for creating this version of the data was to simplify access to the data via the creation of a .csv version of it. Although there are 5 classes most authors have done binary classification, namely class 1 (Epileptic seizure) against the rest.

In [3]:
epilepsy = pd.read_csv("epilepsy.csv",)
epilepsy.drop(epilepsy.columns[[0]], axis = 1, inplace = True)
print(epilepsy.shape)
print(epilepsy.info())
epilepsy.head()

(11500, 179)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11500 entries, 0 to 11499
Columns: 179 entries, X1 to y
dtypes: int64(179)
memory usage: 15.7 MB
None


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,4
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,1
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,5
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,5
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,5


In [4]:
epilepsy["y"].value_counts()

5    2300
4    2300
3    2300
2    2300
1    2300
Name: y, dtype: int64

In [5]:
epilepsy.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
count,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,...,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0
mean,-11.581391,-10.911565,-10.18713,-9.143043,-8.009739,-7.003478,-6.502087,-6.68713,-6.558,-6.168435,...,-10.145739,-11.630348,-12.943478,-13.66887,-13.363304,-13.045043,-12.70513,-12.426,-12.195652,3.0
std,165.626284,166.059609,163.524317,161.269041,160.998007,161.328725,161.467837,162.11912,162.03336,160.436352,...,164.652883,166.14979,168.554058,168.556486,167.25729,164.241019,162.895832,162.886311,164.852015,1.414275
min,-1839.0,-1838.0,-1835.0,-1845.0,-1791.0,-1757.0,-1832.0,-1778.0,-1840.0,-1867.0,...,-1867.0,-1865.0,-1642.0,-1723.0,-1866.0,-1863.0,-1781.0,-1727.0,-1829.0,1.0
25%,-54.0,-55.0,-54.0,-54.0,-54.0,-54.0,-54.0,-55.0,-55.0,-54.0,...,-55.0,-56.0,-56.0,-56.0,-55.0,-56.0,-55.0,-55.0,-55.0,2.0
50%,-8.0,-8.0,-7.0,-8.0,-8.0,-8.0,-8.0,-8.0,-7.0,-7.0,...,-9.0,-10.0,-10.0,-10.0,-10.0,-9.0,-9.0,-9.0,-9.0,3.0
75%,34.0,35.0,36.0,36.0,35.0,36.0,35.0,36.0,36.0,35.25,...,34.0,34.0,33.0,33.0,34.0,34.0,34.0,34.0,34.0,4.0
max,1726.0,1713.0,1697.0,1612.0,1518.0,1816.0,2047.0,2047.0,2047.0,2047.0,...,1777.0,1472.0,1319.0,1436.0,1733.0,1958.0,2047.0,2047.0,1915.0,5.0


In [6]:
# Classes 2, 3, 4, and 5 are subjects who did not have epileptic seizure. 
# Only subjects in class 1 have epileptic seizure.
# The variable indicating whether subject to epileptic seizure could be an alternative
# output data?
epilepsy_input = epilepsy
#epilepsy_input["epileptic seizure"] = (epilepsy["y"] == 1) * 1
epilepsy_input.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,4
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,1
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,5
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,5
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,5


In [7]:
epilepsy_input[epilepsy_input.isnull().any(axis=1)]

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y


In [8]:
# output data = y
X_train, X_test, y_train, y_test = train_test_split(epilepsy_input, epilepsy["y"], random_state=42)

print(len(X_train), "train +", len(X_test), "test")

8625 train + 2875 test


In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Training set score: {:.3f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lr.score(X_test, y_test)))

Training set score: 1.000
Test set score: 1.000


In [10]:
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))

Accuracy on training set: 0.875
Accuracy on test set: 0.668


"output = epileptic seizure" had the exact same results as "output = y" ---> expected. 

In [18]:
for i in np.linspace(1, 150, 15): 
    mlp = MLPClassifier(hidden_layer_sizes=[int(i)], random_state=42)
    mlp.fit(X_train, y_train)
    print(i)
    print("Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))

1.0
Accuracy on training set: 0.323
Accuracy on test set: 0.301
11.642857142857142
Accuracy on training set: 0.472
Accuracy on test set: 0.414




22.285714285714285
Accuracy on training set: 0.882
Accuracy on test set: 0.818




32.92857142857143
Accuracy on training set: 0.910
Accuracy on test set: 0.797
43.57142857142857
Accuracy on training set: 0.818
Accuracy on test set: 0.714
54.21428571428571
Accuracy on training set: 0.799
Accuracy on test set: 0.639
64.85714285714286
Accuracy on training set: 0.865
Accuracy on test set: 0.715
75.5
Accuracy on training set: 0.817
Accuracy on test set: 0.646
86.14285714285714
Accuracy on training set: 0.864
Accuracy on test set: 0.675
96.78571428571428
Accuracy on training set: 0.857
Accuracy on test set: 0.679
107.42857142857142
Accuracy on training set: 0.831
Accuracy on test set: 0.649
118.07142857142857
Accuracy on training set: 0.896
Accuracy on test set: 0.663
128.71428571428572
Accuracy on training set: 0.884
Accuracy on test set: 0.649
139.35714285714286
Accuracy on training set: 0.914
Accuracy on test set: 0.672
150.0
Accuracy on training set: 0.882
Accuracy on test set: 0.646


In [13]:
mlp = MLPClassifier(hidden_layer_sizes=[100, 100], random_state=42)
mlp.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))

Accuracy on training set: 0.884
Accuracy on test set: 0.593


In [15]:
mlp = MLPRegressor(random_state=42)
mlp.fit(X_train, y_train)
print(" Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
print(" Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))  

 Accuracy on training set: -72.456
 Accuracy on test set: -473.179


In [17]:
mlp = MLPRegressor(hidden_layer_sizes=[150, 150], random_state=42)
mlp.fit(X_train, y_train)
print(" Accuracy on training set: {:.3f}".format(mlp.score(X_train, y_train)))
print(" Accuracy on test set: {:.3f}".format(mlp.score(X_test, y_test)))  

 Accuracy on training set: -13.056
 Accuracy on test set: -66.070
