# ***Data Definition***
The dataset contains a comprehensive set of variables covering demographics, health history, lab results, and reported symptoms, used to assess the presence of autoimmune conditions.

In [None]:
# Import necessary libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')
# Raw CSV location, held in one named constant so it is easy to repoint.
DATA_PATH = '/content/autoimmune_dataset_with_clinical_notes.csv'
# The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
# Preview the loaded frame.
df

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
0,18.905238,Female,0,1.409555,7.483964,6,0,General health stable with no familial history...
1,36.646556,Male,1,1.678908,38.002400,17,1,"Patient reports chronic fatigue, joint pain, a..."
2,53.801979,Female,1,1.836874,20.893978,27,1,"Patient reports chronic fatigue, joint pain, a..."
3,51.400839,Female,1,1.287349,0.541344,2,0,Patient exhibits no significant inflammatory m...
4,41.236596,Male,1,1.041335,5.968569,7,0,General health stable with no familial history...
...,...,...,...,...,...,...,...,...
995,35.726868,Female,0,0.960685,7.564777,5,0,Minor joint pain reported but attributed to ph...
996,43.882309,Female,1,2.166170,27.476615,11,1,"Patient reports chronic fatigue, joint pain, a..."
997,36.377538,Female,0,0.594462,8.383378,5,0,Clinical evaluation shows no abnormalities in ...
998,45.706978,Female,1,2.520069,36.763360,13,1,Reports skin rashes and prolonged muscle weakn...


# ***Data Exploration and Cleaning***

*a) Examine the data (understanding the structure and contents of the dataset)*

In [None]:
# Peek at the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
0,18.905238,Female,0,1.409555,7.483964,6,0,General health stable with no familial history...
1,36.646556,Male,1,1.678908,38.0024,17,1,"Patient reports chronic fatigue, joint pain, a..."
2,53.801979,Female,1,1.836874,20.893978,27,1,"Patient reports chronic fatigue, joint pain, a..."
3,51.400839,Female,1,1.287349,0.541344,2,0,Patient exhibits no significant inflammatory m...
4,41.236596,Male,1,1.041335,5.968569,7,0,General health stable with no familial history...


In [None]:
# Peek at the last five rows of the dataset
df.tail(n=5)

Unnamed: 0,Age,Gender,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator,Clinical_Notes
995,35.726868,Female,0,0.960685,7.564777,5,0,Minor joint pain reported but attributed to ph...
996,43.882309,Female,1,2.16617,27.476615,11,1,"Patient reports chronic fatigue, joint pain, a..."
997,36.377538,Female,0,0.594462,8.383378,5,0,Clinical evaluation shows no abnormalities in ...
998,45.706978,Female,1,2.520069,36.76336,13,1,Reports skin rashes and prolonged muscle weakn...
999,47.191623,Female,1,1.449069,6.89969,7,0,Reports occasional fatigue but no signs of chr...


In [None]:
# Summarise the dataset: index range, column names, non-null counts,
# dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   1000 non-null   float64
 1   Gender                1000 non-null   object 
 2   Family_History        1000 non-null   int64  
 3   ANA_Test_Result       1000 non-null   float64
 4   CRP_Level             1000 non-null   float64
 5   Symptom_Duration      1000 non-null   int64  
 6   Autoimmune_Indicator  1000 non-null   int64  
 7   Clinical_Notes        1000 non-null   object 
dtypes: float64(3), int64(3), object(2)
memory usage: 62.6+ KB


In [None]:
# Summary statistics for the numeric columns, shown with one feature per row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1000.0,43.694209,13.460175,18.0,34.583884,44.384998,53.231014,80.0
Family_History,1000.0,0.46,0.498647,0.0,0.0,0.0,1.0,1.0
ANA_Test_Result,1000.0,1.484138,0.695487,0.500053,0.916776,1.316583,2.013052,2.995025
CRP_Level,1000.0,12.627695,11.078509,0.024546,4.273476,8.346264,20.229553,39.865493
Symptom_Duration,1000.0,14.012,15.388459,0.0,3.0,6.0,25.0,51.0
Autoimmune_Indicator,1000.0,0.387,0.487307,0.0,0.0,0.0,1.0,1.0


In [None]:
# Summary statistics (count / unique / top / freq) for the object-typed
# columns, shown with one feature per row
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
Gender,1000,2,Female,612
Clinical_Notes,1000,10,Reports occasional fatigue but no signs of chr...,139


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(1000, 8)

*b) Handle missing values*

In [None]:
# Count missing values per column. This cell only reports them; any non-zero
# count would still need handling (e.g. mean/median imputation or a more
# advanced technique).
df.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Family_History,0
ANA_Test_Result,0
CRP_Level,0
Symptom_Duration,0
Autoimmune_Indicator,0
Clinical_Notes,0


In [None]:
# Count rows that are complete duplicates of another row
df.duplicated().sum()

0

In [None]:
# Pairwise Pearson correlation between all numeric columns (not just Age)
df.corr(method='pearson', numeric_only=True)

Unnamed: 0,Age,Family_History,ANA_Test_Result,CRP_Level,Symptom_Duration,Autoimmune_Indicator
Age,1.0,0.147913,0.338547,0.330184,0.326874,0.38444
Family_History,0.147913,1.0,0.348055,0.340361,0.312362,0.378906
ANA_Test_Result,0.338547,0.348055,1.0,0.748836,0.727579,0.865989
CRP_Level,0.330184,0.340361,0.748836,1.0,0.7196,0.851903
Symptom_Duration,0.326874,0.312362,0.727579,0.7196,1.0,0.850755
Autoimmune_Indicator,0.38444,0.378906,0.865989,0.851903,0.850755,1.0


In [None]:
# NLP: take the free-text Clinical_Notes column as the input for the text
# preprocessing cells that follow.
symptoms=df['Clinical_Notes']

# Preview the raw notes.
symptoms

Unnamed: 0,Clinical_Notes
0,General health stable with no familial history...
1,"Patient reports chronic fatigue, joint pain, a..."
2,"Patient reports chronic fatigue, joint pain, a..."
3,Patient exhibits no significant inflammatory m...
4,General health stable with no familial history...
...,...
995,Minor joint pain reported but attributed to ph...
996,"Patient reports chronic fatigue, joint pain, a..."
997,Clinical evaluation shows no abnormalities in ...
998,Reports skin rashes and prolonged muscle weakn...


In [None]:
# Fetch the NLTK resources needed by the text-preprocessing cells.
import nltk
# English stopword corpus, read later via stopwords.words('english').
nltk.download('stopwords')
# Tokenizer data; presumably what word_tokenize relies on -- TODO confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Tokenization
from nltk.tokenize import word_tokenize
# Coerce every note to a string first (missing notes become ''), so that
# word_tokenize never receives NaN/None or a non-string value.
symptoms = symptoms.fillna('').astype(str)
# Split each note into tokens and re-join them with single spaces, which
# separates punctuation from the neighbouring words.
symptoms = symptoms.apply(lambda note: ' '.join(word_tokenize(note)))
symptoms

Unnamed: 0,Clinical_Notes
0,General health stable with no familial history...
1,"Patient reports chronic fatigue , joint pain ,..."
2,"Patient reports chronic fatigue , joint pain ,..."
3,Patient exhibits no significant inflammatory m...
4,General health stable with no familial history...
...,...
995,Minor joint pain reported but attributed to ph...
996,"Patient reports chronic fatigue , joint pain ,..."
997,Clinical evaluation shows no abnormalities in ...
998,Reports skin rashes and prolonged muscle weakn...


In [None]:
# remove special characters
import re
# Every character that is not an ASCII letter or digit is replaced by a space.
non_alnum_pattern = re.compile('[^A-Za-z0-9]')
symptoms = symptoms.str.replace(non_alnum_pattern, ' ', regex=True)
symptoms

Unnamed: 0,Clinical_Notes
0,General health stable with no familial history...
1,Patient reports chronic fatigue joint pain ...
2,Patient reports chronic fatigue joint pain ...
3,Patient exhibits no significant inflammatory m...
4,General health stable with no familial history...
...,...
995,Minor joint pain reported but attributed to ph...
996,Patient reports chronic fatigue joint pain ...
997,Clinical evaluation shows no abnormalities in ...
998,Reports skin rashes and prolonged muscle weakn...


In [None]:
# Keep only tokens that are at least three characters long.
# NOTE(review): this also discards the negation "no" (e.g. "with no familial
# history" becomes "with familial history") -- confirm that is intended.
from nltk.tokenize import word_tokenize
symptoms = symptoms.apply(
    lambda note: ' '.join(tok for tok in word_tokenize(note) if len(tok) > 2)
)
symptoms

Unnamed: 0,Clinical_Notes
0,General health stable with familial history au...
1,Patient reports chronic fatigue joint pain and...
2,Patient reports chronic fatigue joint pain and...
3,Patient exhibits significant inflammatory mark...
4,General health stable with familial history au...
...,...
995,Minor joint pain reported but attributed physi...
996,Patient reports chronic fatigue joint pain and...
997,Clinical evaluation shows abnormalities ANA CR...
998,Reports skin rashes and prolonged muscle weakn...


In [None]:
# stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Lower-case each token, reduce it to its Porter stem, and rebuild the note
# as a single space-separated string.
symptoms = symptoms.apply(
    lambda note: " ".join(ps.stem(tok.lower()) for tok in word_tokenize(note))
)
symptoms

Unnamed: 0,Clinical_Notes
0,gener health stabl with famili histori autoimm...
1,patient report chronic fatigu joint pain and p...
2,patient report chronic fatigu joint pain and p...
3,patient exhibit signif inflammatori marker aut...
4,gener health stabl with famili histori autoimm...
...,...
995,minor joint pain report but attribut physic st...
996,patient report chronic fatigu joint pain and p...
997,clinic evalu show abnorm ana crp level
998,report skin rash and prolong muscl weak indic ...


In [None]:
# remove stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
words_stop=stopwords.words('english')
# The notes have already been stemmed by this point, so stopwords whose stem
# differs from the dictionary form (e.g. "this" -> "thi", "because" ->
# "becaus") would never match the raw list. Match against both the raw and
# the stemmed forms, and use a set so each membership test is O(1).
stop_stemmer = PorterStemmer()
stop_lookup = set(words_stop) | {stop_stemmer.stem(word) for word in words_stop}
symptoms = symptoms.apply(
    lambda note: ' '.join(tok for tok in word_tokenize(note) if tok not in stop_lookup)
)

symptoms

Unnamed: 0,Clinical_Notes
0,gener health stabl famili histori autoimmun co...
1,patient report chronic fatigu joint pain persi...
2,patient report chronic fatigu joint pain persi...
3,patient exhibit signif inflammatori marker aut...
4,gener health stabl famili histori autoimmun co...
...,...
995,minor joint pain report attribut physic strain...
996,patient report chronic fatigu joint pain persi...
997,clinic evalu show abnorm ana crp level
998,report skin rash prolong muscl weak indic auto...


In [None]:
# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the cleaned notes ...
vectorizer.fit(symptoms)
# ... then encode the same notes as a sparse TF-IDF document-term matrix.
symptoms_numerical = vectorizer.transform(symptoms)
symptoms_numerical

<1000x57 sparse matrix of type '<class 'numpy.float64'>'
	with 7829 stored elements in Compressed Sparse Row format>