In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
df = pd.read_csv("balanced_migraine_data.csv")

In [3]:
df=df.drop_duplicates()

In [4]:
df.head()

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
1,50,3,5,1,1,3,1,1,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,Typical aura with migraine
3,45,3,5,1,1,3,1,0,1,1,...,1,0,0,0,0,0,0,0,0,Typical aura with migraine
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,Typical aura with migraine


In [5]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [6]:
label_encoder = LabelEncoder()
df["Type"] = label_encoder.fit_transform(df["Type"])

In [7]:
df

Unnamed: 0,Age,Duration,Frequency,Location,Character,Intensity,Nausea,Vomit,Phonophobia,Photophobia,...,Vertigo,Tinnitus,Hypoacusis,Diplopia,Defect,Ataxia,Conscience,Paresthesia,DPF,Type
0,30,1,5,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,0,5
1,50,3,5,1,1,3,1,1,1,1,...,1,0,0,0,0,0,0,0,0,5
2,53,2,1,1,1,2,1,1,1,1,...,0,0,0,0,0,0,0,0,0,5
3,45,3,5,1,1,3,1,0,1,1,...,1,0,0,0,0,0,0,0,0,5
4,53,1,1,1,1,2,1,0,1,1,...,0,0,0,0,0,0,0,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1663,46,1,4,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,6
1671,21,3,2,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,1,6
1673,31,1,2,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,6
1675,25,1,4,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,1,6


In [8]:
df.shape

(953, 24)

In [9]:
df.duplicated().sum()

0

In [10]:
print(X)

      Age  Duration  Frequency  Location  Character  Intensity  Nausea  Vomit  \
0      30         1          5         1          1          2       1      0   
1      50         3          5         1          1          3       1      1   
2      53         2          1         1          1          2       1      1   
3      45         3          5         1          1          3       1      0   
4      53         1          1         1          1          2       1      0   
...   ...       ...        ...       ...        ...        ...     ...    ...   
1663   46         1          4         0          0          0       1      0   
1671   21         3          2         0          0          0       1      0   
1673   31         1          2         0          0          0       1      0   
1675   25         1          4         0          0          0       1      0   
1680   24         2          1         0          0          0       1      0   

      Phonophobia  Photopho

In [11]:
# Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif

selector = SelectKBest(mutual_info_classif, k="all")
X = selector.fit_transform(X,y)
X

array([[30,  1,  5, ...,  0,  0,  0],
       [50,  3,  5, ...,  0,  0,  0],
       [53,  2,  1, ...,  0,  0,  0],
       ...,
       [31,  1,  2, ...,  0,  0,  0],
       [25,  1,  4, ...,  0,  0,  1],
       [24,  2,  1, ...,  0,  0,  1]], dtype=int64)

In [12]:
X.shape

(953, 23)

In [13]:
# Feature Scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[-0.0599369 , -0.76909729,  1.82033159, ..., -0.10806147,
        -0.06492275, -0.80902033],
       [ 1.65280279,  2.08840648,  1.82033159, ..., -0.10806147,
        -0.06492275, -0.80902033],
       [ 1.90971374,  0.6596546 , -0.73550783, ..., -0.10806147,
        -0.06492275, -0.80902033],
       ...,
       [ 0.02570008, -0.76909729, -0.09654797, ..., -0.10806147,
        -0.06492275, -0.80902033],
       [-0.48812183, -0.76909729,  1.18137173, ..., -0.10806147,
        -0.06492275,  1.23606289],
       [-0.57375881,  0.6596546 , -0.73550783, ..., -0.10806147,
        -0.06492275,  1.23606289]])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [17]:
rf = RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average = "macro"))
print("Recall:", recall_score(y_test, y_pred, average = "macro"))
print("Classification Report:", classification_report(y_test, y_pred))
            

Accuracy: 0.9528795811518325
Precision: 0.9496394774125866
Recall: 0.9478571428571428
Classification Report:                                precision    recall  f1-score   support

            Basilar-type aura       0.95      0.80      0.87        25
 Familial hemiplegic migraine       0.88      0.88      0.88        16
        Migraine without aura       0.92      1.00      0.96        34
                        Other       1.00      1.00      1.00        26
 Sporadic hemiplegic migraine       0.94      1.00      0.97        16
   Typical aura with migraine       0.96      0.96      0.96        50
Typical aura without migraine       1.00      1.00      1.00        24

                     accuracy                           0.95       191
                    macro avg       0.95      0.95      0.95       191
                 weighted avg       0.95      0.95      0.95       191



In [22]:
with open('migraine_model.pkl', 'wb') as f:
    pickle.dump(rf, f)