In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

In [152]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
1,50,3,5,1,1,3,1,1,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
3,45,3,5,1,1,3,1,0,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine


In [153]:
df["Type"].value_counts()

Type
Typical aura with migraine       247
Migraine without aura             60
Familial hemiplegic migraine      24
Typical aura without migraine     20
Basilar-type aura                 18
Other                             17
Sporadic hemiplegic migraine      14
Name: count, dtype: int64

In [154]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [155]:
smote = SMOTE(random_state=2)
X_resampled, y_resampled = smote.fit_resample(X, y)
resampled_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['Type'])], axis=1)

In [156]:
resampled_data.to_csv('balanced_migraine_data.csv', index=False)

In [157]:
df = pd.read_csv("balanced_migraine_data.csv")

In [158]:
df.columns

Index(['Age', 'Duration', 'Frequency', 'Location', 'Character', 'Intensity',
       'Nausea', 'Vomit', 'Phonophobia', 'Photophobia', 'Visual', 'Sensory',
       'Dysphasia', 'Dysarthria', 'Vertigo', 'Tinnitus', 'Hypoacusis',
       'Diplopia', 'Defect', 'Ataxia', 'Conscience', 'Paresthesia', 'DPF',
       'Type'],
      dtype='object')

In [159]:
df = df.drop("Ataxia", axis=1)

In [160]:
df = df.drop("Dysarthria", axis=1)

In [161]:
df.shape

(1729, 22)

In [162]:
df.duplicated().sum()

776

In [163]:
df.shape

(1729, 22)

In [164]:
df = df.drop_duplicates()

In [165]:

# prof = ProfileReport(df)
# prof.to_file(output_file='DataAnalysis(AugmentedData).html')

In [166]:
df.isnull().sum()

Age            0
Duration       0
Frequency      0
Location       0
Character      0
Intensity      0
Nausea         0
Vomit          0
Phonophobia    0
Photophobia    0
Visual         0
Sensory        0
Dysphasia      0
Vertigo        0
Tinnitus       0
Hypoacusis     0
Diplopia       0
Defect         0
Conscience     0
Paresthesia    0
DPF            0
Type           0
dtype: int64

In [167]:
df

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Dysphasia,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
1,50,3,5,1,1,3,1,1,1,1,...,0,1,0,0,0,0,0,0,0,Typical aura with migraine
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
3,45,3,5,1,1,3,1,0,1,1,...,0,1,0,0,0,0,0,0,0,Typical aura with migraine
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,43,1,4,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura without migraine
1720,26,3,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura without migraine
1723,20,2,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura without migraine
1725,33,1,4,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura without migraine


In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 953 entries, 0 to 1728
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Age          953 non-null    int64 
 1   Duration     953 non-null    int64 
 2   Frequency    953 non-null    int64 
 3   Location     953 non-null    int64 
 4   Character    953 non-null    int64 
 5   Intensity    953 non-null    int64 
 6   Nausea       953 non-null    int64 
 7   Vomit        953 non-null    int64 
 8   Phonophobia  953 non-null    int64 
 9   Photophobia  953 non-null    int64 
 10  Visual       953 non-null    int64 
 11  Sensory      953 non-null    int64 
 12  Dysphasia    953 non-null    int64 
 13  Vertigo      953 non-null    int64 
 14  Tinnitus     953 non-null    int64 
 15  Hypoacusis   953 non-null    int64 
 16  Diplopia     953 non-null    int64 
 17  Defect       953 non-null    int64 
 18  Conscience   953 non-null    int64 
 19  Paresthesia  953 non-null    int6

In [169]:
df.describe()

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Sensory,Dysphasia,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Conscience,Paresthesia,DPF
count,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,...,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0,953.0
mean,30.934942,1.535152,2.12277,0.942288,0.955929,2.316894,0.937041,0.242392,0.865687,0.887723,...,0.229801,0.040923,0.173137,0.059811,0.019937,0.005247,0.01574,0.009444,0.004197,0.398741
std,11.8342,0.704929,1.555904,0.417357,0.435008,1.003302,0.243017,0.428755,0.341167,0.315873,...,0.511089,0.198217,0.378565,0.237261,0.139857,0.072281,0.124532,0.09677,0.064684,0.489896
min,15.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,2.0,3.0,1.0,1.0,3.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,77.0,3.0,8.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [170]:
df.head()

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Dysphasia,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
1,50,3,5,1,1,3,1,1,1,1,...,0,1,0,0,0,0,0,0,0,Typical aura with migraine
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
3,45,3,5,1,1,3,1,0,1,1,...,0,1,0,0,0,0,0,0,0,Typical aura with migraine
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine


In [171]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [172]:
label_encoder = LabelEncoder()
df["Type"] = label_encoder.fit_transform(df["Type"])

In [173]:
df.shape

(953, 22)

In [174]:
df.duplicated().sum()

0

In [175]:
print(X)

      Age  Duration  Frequency  Location  Character  Intensity  Nausea  Vomit  \
0      30         1          5         1          1          2       1      0   
1      50         3          5         1          1          3       1      1   
2      53         2          1         1          1          2       1      1   
3      45         3          5         1          1          3       1      0   
4      53         1          1         1          1          2       1      0   
...   ...       ...        ...       ...        ...        ...     ...    ...   
1717   43         1          4         0          0          0       1      0   
1720   26         3          1         0          0          0       1      0   
1723   20         2          1         0          0          0       1      0   
1725   33         1          4         0          0          0       1      0   
1728   32         1          1         0          0          0       1      0   

      Phonophobia  Photopho

In [176]:
# Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif

selector = SelectKBest(mutual_info_classif, k="all")
X = selector.fit_transform(X,y)
X

array([[30,  1,  5, ...,  0,  0,  0],
       [50,  3,  5, ...,  0,  0,  0],
       [53,  2,  1, ...,  0,  0,  0],
       ...,
       [20,  2,  1, ...,  0,  0,  0],
       [33,  1,  4, ...,  0,  0,  0],
       [32,  1,  1, ...,  0,  0,  0]], dtype=int64)

In [177]:
X.shape

(953, 21)

In [178]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[-0.07904491, -0.75955621,  1.85020496, ..., -0.09764168,
        -0.06492275, -0.81435635],
       [ 1.61185958,  2.07909896,  1.85020496, ..., -0.09764168,
        -0.06492275, -0.81435635],
       [ 1.86549525,  0.65977137, -0.72199829, ..., -0.09764168,
        -0.06492275, -0.81435635],
       ...,
       [-0.92449715,  0.65977137, -0.72199829, ..., -0.09764168,
        -0.06492275, -0.81435635],
       [ 0.17459077, -0.75955621,  1.20715415, ..., -0.09764168,
        -0.06492275, -0.81435635],
       [ 0.09004554, -0.75955621, -0.72199829, ..., -0.09764168,
        -0.06492275, -0.81435635]])

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [180]:
classifiers = {
    "Logistic Regression" : LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB(),
    "K Nearest Neighbors": KNeighborsClassifier(n_neighbors=1, weights="uniform", metric="euclidean")
}

In [181]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score, precision_score, f1_score

results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {name}: \n", cm)
    accuracy = accuracy_score(y_test, y_pred)
    results[name]=accuracy
    print(f"{name} Accuracy: {accuracy*100:.2f}%")
    print(f"{name} Precision: {precision_score(y_test, y_pred, average="macro")*100:.2f}%")
    print(f"{name} Recall: {recall_score(y_test, y_pred, average="macro")*100:.2f}%")
    print(f"{name} F1_score: {f1_score(y_test, y_pred, average="macro")*100:.2f}%")
    print(classification_report(y_test, y_pred))
    print("-"*30)
            

Confusion Matrix for Logistic Regression: 
 [[21  0  0  0  0  2  0]
 [ 1 13  0  0  0  3  0]
 [ 0  0 33  0  0  0  0]
 [ 1  0  0 25  0  2  0]
 [ 2  0  1  0 15  1  0]
 [ 3  0  0  0  3 44  0]
 [ 0  0  0  0  0  0 21]]
Logistic Regression Accuracy: 90.05%
Logistic Regression Precision: 91.43%
Logistic Regression Recall: 89.14%
Logistic Regression F1_score: 89.89%
                               precision    recall  f1-score   support

            Basilar-type aura       0.75      0.91      0.82        23
 Familial hemiplegic migraine       1.00      0.76      0.87        17
        Migraine without aura       0.97      1.00      0.99        33
                        Other       1.00      0.89      0.94        28
 Sporadic hemiplegic migraine       0.83      0.79      0.81        19
   Typical aura with migraine       0.85      0.88      0.86        50
Typical aura without migraine       1.00      1.00      1.00        21

                     accuracy                           0.90       191