In [111]:
# Importing Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [112]:
# Load CSV for disease data. 
disease_df = pd.read_csv('./disease_and_symptoms_data/DiseaseAndSymptoms.csv')
disease_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [113]:
# Determine the number of null values in each symptom column
disease_df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [114]:
# Determine the number of unique values in each column.
unique_counts = disease_df.nunique()
unique_counts

Disease       41
Symptom_1     34
Symptom_2     48
Symptom_3     54
Symptom_4     50
Symptom_5     38
Symptom_6     32
Symptom_7     26
Symptom_8     21
Symptom_9     22
Symptom_10    21
Symptom_11    18
Symptom_12    11
Symptom_13     8
Symptom_14     4
Symptom_15     3
Symptom_16     3
Symptom_17     1
dtype: int64

In [115]:
# Delete unwanted columns
disease_cleansed_df = disease_df.drop(columns=["Symptom_11","Symptom_12","Symptom_13","Symptom_14","Symptom_15","Symptom_16","Symptom_17"])
disease_cleansed_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,


In [116]:
# Fill missing values with a specific value
disease_cleansed_df = disease_cleansed_df.fillna(value='Null')
disease_cleansed_df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,Null,Null,Null,Null,Null,Null
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,Null,Null,Null,Null,Null,Null,Null
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,Null,Null,Null,Null,Null,Null,Null
3,Fungal infection,itching,skin_rash,dischromic _patches,Null,Null,Null,Null,Null,Null,Null
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,Null,Null,Null,Null,Null,Null,Null
...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,Null,Null,Null,Null
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,Null,Null,Null,Null,Null,Null
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,Null,Null,Null,Null,Null,Null
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,Null,Null,Null,Null


In [117]:
# Separate features (X) and target labels (y)
X = disease_df.drop(columns=['Disease'])
y = disease_df['Disease']

In [118]:
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in X.columns:
    if X[column].dtype == 'object':  # Check if the column contains string values
        # Replace NaN values with a placeholder, for example, 'Unknown'
        X[column].fillna('Unknown', inplace=True)
        # Fit label encoder and transform the labels in the column
        X[column] = label_encoder.fit_transform(X[column])

y = label_encoder.fit_transform(y)

In [119]:
# Splitting into Training and Testing Sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
# Instantiate and train the RandomForestClassifier model
model = RandomForestClassifier()
model.fit(X_train, y_train_encoded)


In [121]:
# Make predictions on the testing data
y_pred = model.predict(X_test)

In [122]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [123]:
# Instantiate the logistic regression model
logistic_model = LogisticRegression()

In [124]:
# Train the model on the training data
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [125]:
# Make predictions on the testing data
y_pred = logistic_model.predict(X_test)

In [126]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9288617886178862
