#***Data Definition***
A comprehensive set of variables related to demographics, health history, lab results, and reported symptoms to assess the presence of autoimmune conditions.

# ***Data Collection***

In [None]:
# Import necessary libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('/content/synthetic_autoimmune_dataset.csv')
df

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
0,56,Female,0,Negative,41.354543,0.876386,0,Clinical presentation not indicative of autoim...
1,69,Female,1,Negative,47.914648,1.529219,1,Patient exhibits symptoms consistent with auto...
2,46,Male,1,Positive,2.751155,0.995265,0,Clinical presentation not indicative of autoim...
3,32,Male,0,Negative,11.668582,3.006671,0,Clinical presentation not indicative of autoim...
4,60,Male,0,Positive,9.478106,2.835407,0,Clinical presentation not indicative of autoim...
...,...,...,...,...,...,...,...,...
9995,38,Female,0,Positive,14.635030,4.500755,0,Clinical presentation not indicative of autoim...
9996,23,Male,1,Negative,6.649159,3.650763,1,Patient exhibits symptoms consistent with auto...
9997,18,Male,0,Positive,30.559541,2.728188,0,Clinical presentation not indicative of autoim...
9998,21,Male,0,Negative,34.176490,4.654111,1,Patient exhibits symptoms consistent with auto...


#***Data Exploration and Cleaning***

*a) Examine the data(understanding the structure and contents of the dataset)*

In [None]:
# examine first 5 rows of dataset
df.head()

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
0,56,Female,0,Negative,41.354543,0.876386,0,Clinical presentation not indicative of autoim...
1,69,Female,1,Negative,47.914648,1.529219,1,Patient exhibits symptoms consistent with auto...
2,46,Male,1,Positive,2.751155,0.995265,0,Clinical presentation not indicative of autoim...
3,32,Male,0,Negative,11.668582,3.006671,0,Clinical presentation not indicative of autoim...
4,60,Male,0,Positive,9.478106,2.835407,0,Clinical presentation not indicative of autoim...


In [None]:
# examine last 5 rows of dataset
df.tail()

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
9995,38,Female,0,Positive,14.63503,4.500755,0,Clinical presentation not indicative of autoim...
9996,23,Male,1,Negative,6.649159,3.650763,1,Patient exhibits symptoms consistent with auto...
9997,18,Male,0,Positive,30.559541,2.728188,0,Clinical presentation not indicative of autoim...
9998,21,Male,0,Negative,34.17649,4.654111,1,Patient exhibits symptoms consistent with auto...
9999,28,Male,1,Positive,34.898413,4.359205,1,Patient exhibits symptoms consistent with auto...


In [None]:
# information on overall dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   10000 non-null  int64  
 1   Gender                10000 non-null  object 
 2   Family_History        10000 non-null  int64  
 3   ANA_Test_Result       10000 non-null  object 
 4   CRP_Level             10000 non-null  float64
 5   Symptom_Duration      10000 non-null  float64
 6   Autoimmune_Indicator  10000 non-null  int64  
 7   Clinical_Notes        10000 non-null  object 
dtypes: float64(2), int64(3), object(3)
memory usage: 625.1+ KB


In [None]:
# describe the numerical features in dataset
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,48.7984,17.903223,18.0,34.0,49.0,64.0,79.0
Family_History,10000.0,0.2944,0.455795,0.0,0.0,0.0,1.0,1.0
CRP_Level,10000.0,24.802591,14.424425,0.005119,12.454843,24.743294,37.237761,49.994984
Symptom_Duration,10000.0,2.515348,1.441315,0.000466,1.268984,2.513802,3.770641,4.999045
Autoimmune_Indicator,10000.0,0.4518,0.497696,0.0,0.0,0.0,1.0,1.0


In [None]:
# describe the categorical feartures in dataset
df.describe(include='O').transpose()

Unnamed: 0,count,unique,top,freq
Gender,10000,2,Male,5014
ANA_Test_Result,10000,2,Negative,6029
Clinical_Notes,10000,2,Clinical presentation not indicative of autoim...,5482


In [None]:
# number of rows and columns in the dataset
df.shape

(10000, 8)

*b) Handle missing values*

In [None]:
# Check for missing values(impute them using methods like mean, median, or more advanced techniques.)
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Family_History,0
ANA_Test_Result,0
CRP_Level,0
Symptom_Duration,0
Autoimmune_Indicator,0
Clinical_Notes,0


In [None]:
# Find the duplicate value
df.duplicated().sum()

0

In [None]:
# calculating the correlation of Age with other features
df.corr(numeric_only=True)

Unnamed: 0,Age,Family_History,CRP_Level,Symptom_Duration,Autoimmune_Indicator
Age,1.0,0.008144,-0.000395,0.010035,-0.003077
Family_History,0.008144,1.0,0.004145,0.009256,0.310769
CRP_Level,-0.000395,0.004145,1.0,-0.010671,0.054927
Symptom_Duration,0.010035,0.009256,-0.010671,1.0,0.048071
Autoimmune_Indicator,-0.003077,0.310769,0.054927,0.048071,1.0


In [None]:
# NLP
symptoms=df['Clinical_Notes']

symptoms

Unnamed: 0,Clinical_Notes
0,Clinical presentation not indicative of autoim...
1,Patient exhibits symptoms consistent with auto...
2,Clinical presentation not indicative of autoim...
3,Clinical presentation not indicative of autoim...
4,Clinical presentation not indicative of autoim...
...,...
9995,Clinical presentation not indicative of autoim...
9996,Patient exhibits symptoms consistent with auto...
9997,Clinical presentation not indicative of autoim...
9998,Patient exhibits symptoms consistent with auto...


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Tokenization
from nltk.tokenize import word_tokenize
# Ensure all entries in 'symptoms' are strings and handle missing values
symptoms=symptoms.apply(lambda x:word_tokenize(x)).apply(lambda x:' '.join(x))
symptoms

Unnamed: 0,Clinical_Notes
0,Clinical presentation not indicative of autoim...
1,Patient exhibits symptoms consistent with auto...
2,Clinical presentation not indicative of autoim...
3,Clinical presentation not indicative of autoim...
4,Clinical presentation not indicative of autoim...
...,...
9995,Clinical presentation not indicative of autoim...
9996,Patient exhibits symptoms consistent with auto...
9997,Clinical presentation not indicative of autoim...
9998,Patient exhibits symptoms consistent with auto...


In [None]:
# remove special characters
import re
symptoms=symptoms.str.replace('[^A-Za-z0-9]',' ',regex=True)
symptoms

Unnamed: 0,Clinical_Notes
0,Clinical presentation not indicative of autoim...
1,Patient exhibits symptoms consistent with auto...
2,Clinical presentation not indicative of autoim...
3,Clinical presentation not indicative of autoim...
4,Clinical presentation not indicative of autoim...
...,...
9995,Clinical presentation not indicative of autoim...
9996,Patient exhibits symptoms consistent with auto...
9997,Clinical presentation not indicative of autoim...
9998,Patient exhibits symptoms consistent with auto...


In [None]:
# collect meaningful words
from nltk.tokenize import word_tokenize
symptoms=symptoms.apply(lambda x:[i for i in word_tokenize(x) if len(i)>=3]).apply(lambda x:' '.join(x))
symptoms

Unnamed: 0,Clinical_Notes
0,Clinical presentation not indicative autoimmun...
1,Patient exhibits symptoms consistent with auto...
2,Clinical presentation not indicative autoimmun...
3,Clinical presentation not indicative autoimmun...
4,Clinical presentation not indicative autoimmun...
...,...
9995,Clinical presentation not indicative autoimmun...
9996,Patient exhibits symptoms consistent with auto...
9997,Clinical presentation not indicative autoimmun...
9998,Patient exhibits symptoms consistent with auto...


In [None]:
# stemming
from nltk.stem import PorterStemmer
ps=PorterStemmer()
symptoms=symptoms.apply(lambda x :[ps.stem(i.lower()) for i in word_tokenize(x)]).apply(lambda x:" ".join(x))
symptoms

Unnamed: 0,Clinical_Notes
0,clinic present not indic autoimmun diseas
1,patient exhibit symptom consist with autoimmun...
2,clinic present not indic autoimmun diseas
3,clinic present not indic autoimmun diseas
4,clinic present not indic autoimmun diseas
...,...
9995,clinic present not indic autoimmun diseas
9996,patient exhibit symptom consist with autoimmun...
9997,clinic present not indic autoimmun diseas
9998,patient exhibit symptom consist with autoimmun...


In [None]:
# remove stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
words_stop=stopwords.words('english')
symptoms=symptoms.apply(lambda x:[i for i in word_tokenize(x) if i not in words_stop]).apply(lambda x:' '.join(x))

symptoms

Unnamed: 0,Clinical_Notes
0,clinic present indic autoimmun diseas
1,patient exhibit symptom consist autoimmun condit
2,clinic present indic autoimmun diseas
3,clinic present indic autoimmun diseas
4,clinic present indic autoimmun diseas
...,...
9995,clinic present indic autoimmun diseas
9996,patient exhibit symptom consist autoimmun condit
9997,clinic present indic autoimmun diseas
9998,patient exhibit symptom consist autoimmun condit


In [None]:
# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer()
symptoms_numerical=vectorizer.fit_transform(symptoms)
symptoms_numerical

<10000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 54518 stored elements in Compressed Sparse Row format>

In [None]:
df.columns

Index(['Age', 'Gender', 'Family_History', 'ANA_Test_Result', 'CRP_Level',
       'Symptom_Duration', 'Autoimmune_Indicator', 'Clinical_Notes'],
      dtype='object')

In [None]:
df.corr(numeric_only=True)

Unnamed: 0,Age,Family_History,CRP_Level,Symptom_Duration,Autoimmune_Indicator
Age,1.0,0.008144,-0.000395,0.010035,-0.003077
Family_History,0.008144,1.0,0.004145,0.009256,0.310769
CRP_Level,-0.000395,0.004145,1.0,-0.010671,0.054927
Symptom_Duration,0.010035,0.009256,-0.010671,1.0,0.048071
Autoimmune_Indicator,-0.003077,0.310769,0.054927,0.048071,1.0


In [None]:
df['Autoimmune_Indicator'].value_counts()

Unnamed: 0_level_0,count
Autoimmune_Indicator,Unnamed: 1_level_1
0,5482
1,4518


In [None]:
# Convert the sparse matrix to a DataFrame
symptoms_numerical_df = pd.DataFrame.sparse.from_spmatrix(symptoms_numerical)

# Optionally, assign column names to the new DataFrame
symptoms_numerical_df.columns = [f'symptom_{i}' for i in range(symptoms_numerical_df.shape[1])]

# Concatenate the DataFrames
df = pd.concat([df, symptoms_numerical_df], axis=1)
df

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes,symptom_0,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,symptom_6,symptom_7,symptom_8,symptom_9
0,56,Female,0,Negative,41.354543,0.876386,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
1,69,Female,1,Negative,47.914648,1.529219,1,Patient exhibits symptoms consistent with auto...,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394
2,46,Male,1,Positive,2.751155,0.995265,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
3,32,Male,0,Negative,11.668582,3.006671,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
4,60,Male,0,Positive,9.478106,2.835407,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,38,Female,0,Positive,14.635030,4.500755,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
9996,23,Male,1,Negative,6.649159,3.650763,1,Patient exhibits symptoms consistent with auto...,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394
9997,18,Male,0,Positive,30.559541,2.728188,0,Clinical presentation not indicative of autoim...,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
9998,21,Male,0,Negative,34.176490,4.654111,1,Patient exhibits symptoms consistent with auto...,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394


In [None]:
df.drop(['Clinical_Notes','Age'],axis=1,inplace=True)
df

Unnamed: 0,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,symptom_0,symptom_1,symptom_2,symptom_3,symptom_4,symptom_5,symptom_6,symptom_7,symptom_8,symptom_9
0,Female,0,Negative,41.354543,0.876386,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
1,Female,1,Negative,47.914648,1.529219,1,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394
2,Male,1,Positive,2.751155,0.995265,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
3,Male,0,Negative,11.668582,3.006671,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
4,Male,0,Positive,9.478106,2.835407,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Female,0,Positive,14.635030,4.500755,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
9996,Male,1,Negative,6.649159,3.650763,1,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394
9997,Male,0,Positive,30.559541,2.728188,0,0.2981,0.477267,0,0,0.477267,0,0.477267,0,0.477267,0
9998,Male,0,Negative,34.176490,4.654111,1,0.241831,0,0.43394,0.43394,0,0.43394,0,0.43394,0,0.43394


In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
for i in df.columns:
  if df[i].dtype=='object':
    df[i]=label_encoder.fit_transform(df[i])

df.dtypes

Unnamed: 0,0
Gender,int64
Family_History,int64
ANA_Test_Result,int64
CRP_Level,float64
Symptom_Duration,float64
Autoimmune_Indicator,int64
symptom_0,"Sparse[float64, 0]"
symptom_1,"Sparse[float64, 0]"
symptom_2,"Sparse[float64, 0]"
symptom_3,"Sparse[float64, 0]"


In [None]:
x=df.drop('Autoimmune_Indicator',axis=1).values
y=df['Autoimmune_Indicator'].values
y

array([0, 1, 0, ..., 0, 1, 1])

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_train

array([[1.        , 0.        , 1.        , ..., 0.43393961, 0.        ,
        0.43393961],
       [0.        , 0.        , 0.        , ..., 0.        , 0.47726737,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.43393961, 0.        ,
        0.43393961],
       ...,
       [1.        , 0.        , 0.        , ..., 0.43393961, 0.        ,
        0.43393961],
       [0.        , 1.        , 0.        , ..., 0.43393961, 0.        ,
        0.43393961],
       [0.        , 1.        , 1.        , ..., 0.        , 0.47726737,
        0.        ]])

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
x_train


array([[1., 0., 1., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.]])

In [None]:
# from imblearn.over_sampling import SMOTE
# # Apply SMOTE before model training
# smote=SMOTE(random_state=42)
# x_train,y_train=smote.fit_resample(x_train,y_train)

In [None]:
x_train.shape

(7000, 15)

In [None]:
# from sklearn.preprocessing import PolynomialFeatures

#      # Create a PolynomialFeatures object
# poly = PolynomialFeatures(degree=2)  # You can adjust the degree

# # Apply polynomial transformation to your features
# x_train = poly.fit_transform(x_train)
# x_test = poly.transform(x_test)

In [None]:
# # Principal Component Analysis

# from sklearn.decomposition import PCA
# pca = PCA(0.95)
# x_train = pca.fit_transform(x_train)
# x_test = pca.transform(x_test)

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression

# # Define your base models (estimators)
# estimators = [
#     ('rf', RandomForestClassifier()),
#     ('knn', KNeighborsClassifier()),
#     ('svc', SVC())
# ]

# # Define the meta-learner (final estimator)
# stacking_model = StackingClassifier(
#     estimators=estimators, final_estimator=LogisticRegression()
# )

# # Train the stacking model
# stacking_model.fit(x_train, y_train)

# # Make predictions using the stacking model
# y_pred = stacking_model.predict(x_test)

# # Evaluate the stacking model
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy}')

In [None]:
#  Random Search CV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
# Create and train the RandomizedSearchCV object:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
# Get the best parameters:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

In [None]:
# Evaluate the model with the best parameters:
best_random = rf_random.best_estimator_
y_pred_random = best_random.predict(x_test)
print(f'Accuracy is {accuracy_score(y_test,y_pred_random)}')
print(f'classification report is {classification_report(y_test,y_pred_random)}')

Accuracy is 1.0
classification report is               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1583
           1       1.00      1.00      1.00      1417

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [None]:
x_train.shape

(7000, 15)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
knn=KNeighborsClassifier()
svc=SVC()
rfc=RandomForestClassifier()
dtc=DecisionTreeClassifier()
gnb=BernoulliNB()
lst=[knn,svc,rfc,dtc,gnb]
for i in lst:
  print(f'Model is {i}')
  print("*************")
  i.fit(x_train,y_train)
  y_pred=i.predict(x_test)
  print(f'Accuracy is {accuracy_score(y_test,y_pred)}')
  print(f'classification report is {classification_report(y_test,y_pred)}')

Model is KNeighborsClassifier()
*************
Accuracy is 1.0
classification report is               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1583
           1       1.00      1.00      1.00      1417

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Model is SVC()
*************
Accuracy is 1.0
classification report is               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1583
           1       1.00      1.00      1.00      1417

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Model is RandomForestClassifier()
*************
Accuracy is 1.0
classification report is               precision    recall  f1-score   support

           0       1.00      1.00      1.00  