In [19]:
# all necessary imports
import warnings
from decimal import Decimal
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB     
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
#from xgboost import XGBClassifier
import matplotlib.pyplot as plt
# ignore warnings generated due to usage of old version of tensorflow
warnings.simplefilter("ignore")

In [None]:
df_comb = pd.read_csv("dis_sym_dataset_comb.csv")

X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

In [None]:

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10)

In [None]:

accuracy_list = []
cross_accuracy_list = []
model_list = []

In [35]:
# RF Classifier
rf = RandomForestClassifier(n_estimators=10, criterion='entropy')
rf = rf.fit(X, Y)
# prediction of labels for the test data
rf_pred = rf.predict(x_test)
acc_rf = round(Decimal(accuracy_score(y_test, rf_pred) * 100), 2)
accuracy_list.append(acc_rf)
model_list.append("RF")
print(f"Accuracy (RF) : {acc_rf}%")

# Cross Validation Accuracy RF
# performing cross validation with 5 different splits
scores_rf = cross_val_score(rf, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_rf.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (RF): {score}%")

Accuracy (RF) : 90.50%
Cross Validation Accuracy (RF): 87.04%


In [36]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4)
knn = knn.fit(X, Y)
# prediction of labels for the test data
knn_pred = knn.predict(x_test)
acc_knn = round(Decimal(accuracy_score(y_test, knn_pred) * 100), 2)
accuracy_list.append(acc_knn)
model_list.append("KNN")
print(f"Accuracy (KNN) : {acc_knn}%")

# Cross Validation Accuracy KNN
# performing cross validation with 5 different splits
scores_knn = cross_val_score(knn, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_knn.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (KNN): {score}%")

Accuracy (KNN) : 91.06%
Cross Validation Accuracy (KNN): 85.25%


In [37]:
# DT Classifier
dt = DecisionTreeClassifier()
dt = dt.fit(X, Y)
# prediction of labels for the test data
dt_pred = dt.predict(x_test)
acc_dt = round(Decimal(accuracy_score(y_test, dt_pred) * 100), 2)
accuracy_list.append(acc_dt)
model_list.append("DT")
print(f"Accuracy (DT) : {acc_dt}%")

# Cross Validation Accuracy DT
# performing cross validation with 5 different splits
scores_dt = cross_val_score(dt, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_dt.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (DT): {score}%")

Accuracy (DT) : 90.95%
Cross Validation Accuracy (DT): 83.67%


In [38]:
# Load Dataset (assuming it's the same dataset)
df_comb = pd.read_csv("dis_sym_dataset_comb.csv")

# Separate features (X) and labels (Y)
X = df_comb.iloc[:, 1:]  # All columns except the first one
Y = df_comb.iloc[:, 0]   # First column (assumed as the label)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4)
knn.fit(x_train, y_train)

# Input symptoms dictionary provided by the user
input_symptoms = {
    'chest pain': 1,  # Shortness of breath (1 = present, 0 = absent)
    'shortness breath': 1   # Chest pain (1 = present, 0 = absent)
}

# Create a new input data point with all features (set default 0 for other symptoms)
input_data = pd.DataFrame([[0]*len(X.columns)], columns=X.columns)

# Update the input data with the values from input_symptoms
for symptom, value in input_symptoms.items():
    if symptom in input_data.columns:
        input_data[symptom] = value
    else:
        print(f"Warning: {symptom} not found in dataset columns.")

# Predict the result using the trained KNN model
prediction = knn.predict(input_data)

# Print the predicted result (replace this with actual disease prediction logic)
print(f"Predicted disease: {prediction[0]}")

# Calculate accuracy of the model on the test set
knn_pred = knn.predict(x_test)
acc_knn = round(Decimal(accuracy_score(y_test, knn_pred) * 100), 2)
print(f"Accuracy of KNN model: {acc_knn}%")


Predicted disease: Anthrax
Accuracy of KNN model: 84.73%


In [39]:
# Load Dataset (assuming it's the same dataset)
df_comb = pd.read_csv("dis_sym_dataset_comb.csv")

# Separate features (X) and labels (Y)
X = df_comb.iloc[:, 1:]  # All columns except the first one
Y = df_comb.iloc[:, 0]   # First column (assumed as the label)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Train the Decision Tree model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)

# Input symptoms dictionary provided by the user (Update these values based on column names)
input_symptoms = {
    'chest pain': 1,  # Shortness of breath
    'shortness breath': 1   # Chest pain
}

# Create a new input data point with all features (set default 0 for other symptoms)
input_data = pd.DataFrame([[0]*len(X.columns)], columns=X.columns)

# Update the input data with the values from input_symptoms
for symptom, value in input_symptoms.items():
    if symptom in input_data.columns:
        input_data[symptom] = value
    else:
        print(f"Warning: {symptom} not found in dataset columns.")

# Predict the result using the trained Decision Tree model
prediction = decision_tree.predict(input_data)

# Print the predicted result (replace this with actual disease prediction logic)
print(f"Predicted disease (Decision Tree): {prediction[0]}")

# Calculate accuracy of the Decision Tree model on the test set
dt_pred = decision_tree.predict(x_test)
acc_dt = round(Decimal(accuracy_score(y_test, dt_pred) * 100), 2)
print(f"Accuracy of Decision Tree model: {acc_dt}%")


Predicted disease (Decision Tree): Anthrax
Accuracy of Decision Tree model: 81.45%


In [40]:
# Load Dataset (assuming it's the same dataset)
df_comb = pd.read_csv("dis_sym_dataset_comb.csv")

# Separate features (X) and labels (Y)
X = df_comb.iloc[:, 1:]  # All columns except the first one
Y = df_comb.iloc[:, 0]   # First column (assumed as the label)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

# Train the Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(x_train, y_train)

# Input symptoms dictionary provided by the user (Update these values based on column names)
input_symptoms = {
    'chest pain': 1,  # Shortness of breath
    'shortness breath': 1   # Chest pain
}

# Create a new input data point with all features (set default 0 for other symptoms)
input_data = pd.DataFrame([[0]*len(X.columns)], columns=X.columns)

# Update the input data with the values from input_symptoms
for symptom, value in input_symptoms.items():
    if symptom in input_data.columns:
        input_data[symptom] = value
    else:
        print(f"Warning: {symptom} not found in dataset columns.")

# Predict the result using the trained Random Forest model
prediction = random_forest.predict(input_data)

# Print the predicted result (replace this with actual disease prediction logic)
print(f"Predicted disease (Random Forest): {prediction[0]}")

# Calculate accuracy of the Random Forest model on the test set
rf_pred = random_forest.predict(x_test)
acc_rf = round(Decimal(accuracy_score(y_test, rf_pred) * 100), 2)
print(f"Accuracy of Random Forest model: {acc_rf}%")


Predicted disease (Random Forest): Lung cancer
Accuracy of Random Forest model: 87.78%


In [None]:

df_comb = pd.read_csv("dis_sym_dataset_comb.csv")
X = df_comb.iloc[:, 1:]  # All columns except the first one
Y = df_comb.iloc[:, 0]  
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(x_train, y_train)

input_symptoms = {
    'shortness breath': 1,  # Shortness of breath
    
}

input_data = pd.DataFrame([[0]*len(X.columns)], columns=X.columns)
for symptom, value in input_symptoms.items():
    if symptom in input_data.columns:
        input_data[symptom] = value
    else:
        print(f"Warning: {symptom} not found in dataset columns.")

prediction = random_forest.predict(input_data)

print(f"Predicted disease (Random Forest): {prediction[0]}")

rf_pred = random_forest.predict(x_test)
acc_rf = round(Decimal(accuracy_score(y_test, rf_pred) * 100), 2)
print(f"Accuracy of Random Forest model: {acc_rf}%")


Predicted disease (Random Forest): Bronchitis
Accuracy of Random Forest model: 87.78%


In [None]:

df_comb = pd.read_csv("dis_sym_dataset_comb.csv")


X = df_comb.iloc[:, 1:]  # columns except the first one
Y = df_comb.iloc[:, 0]   # column 
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_train, y_train)

input_symptoms = {
    'fever': 1,  
    'headache': 1,
    'runny nose':1,
}
input_data = pd.DataFrame([[0]*len(X.columns)], columns=X.columns)
for symptom, value in input_symptoms.items():
    if symptom in input_data.columns:
        input_data[symptom] = value
    else:
        print(f"Warning: {symptom} not found in dataset columns.")

prediction = decision_tree.predict(input_data)


print(f"Predicted disease (Decision Tree): {prediction[0]}")

dt_pred = decision_tree.predict(x_test)
acc_dt = round(Decimal(accuracy_score(y_test, dt_pred) * 100), 2)
print(f"Accuracy of Decision Tree model: {acc_dt}%")

Predicted disease (Decision Tree): Influenza
Accuracy of Decision Tree model: 81.45%
