In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from google.colab import files

In [2]:
uploaded = files.upload()

Saving Cleaned-Data.csv to Cleaned-Data.csv


In [3]:
df = pd.read_csv("Cleaned-Data.csv")

In [4]:
print(df.head())

   Fever  Tiredness  Dry-Cough  Difficulty-in-Breathing  Sore-Throat  \
0      1          1          1                        1            1   
1      1          1          1                        1            1   
2      1          1          1                        1            1   
3      1          1          1                        1            1   
4      1          1          1                        1            1   

   None_Sympton  Pains  Nasal-Congestion  Runny-Nose  Diarrhea  ...  \
0             0      1                 1           1         1  ...   
1             0      1                 1           1         1  ...   
2             0      1                 1           1         1  ...   
3             0      1                 1           1         1  ...   
4             0      1                 1           1         1  ...   

   Gender_Male  Gender_Transgender  Severity_Mild  Severity_Moderate  \
0            1                   0              1                  0

In [5]:
print("\nDataset Shape:", df.shape)


Dataset Shape: (316800, 27)


In [6]:
print("\nDataset Info:")
print(df.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316800 entries, 0 to 316799
Data columns (total 27 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Fever                    316800 non-null  int64 
 1   Tiredness                316800 non-null  int64 
 2   Dry-Cough                316800 non-null  int64 
 3   Difficulty-in-Breathing  316800 non-null  int64 
 4   Sore-Throat              316800 non-null  int64 
 5   None_Sympton             316800 non-null  int64 
 6   Pains                    316800 non-null  int64 
 7   Nasal-Congestion         316800 non-null  int64 
 8   Runny-Nose               316800 non-null  int64 
 9   Diarrhea                 316800 non-null  int64 
 10  None_Experiencing        316800 non-null  int64 
 11  Age_0-9                  316800 non-null  int64 
 12  Age_10-19                316800 non-null  int64 
 13  Age_20-24                316800 non-null  int64 
 14  Age_2

In [7]:
print("\nMissing Values:\n", df.isnull().sum())


Missing Values:
 Fever                      0
Tiredness                  0
Dry-Cough                  0
Difficulty-in-Breathing    0
Sore-Throat                0
None_Sympton               0
Pains                      0
Nasal-Congestion           0
Runny-Nose                 0
Diarrhea                   0
None_Experiencing          0
Age_0-9                    0
Age_10-19                  0
Age_20-24                  0
Age_25-59                  0
Age_60+                    0
Gender_Female              0
Gender_Male                0
Gender_Transgender         0
Severity_Mild              0
Severity_Moderate          0
Severity_None              0
Severity_Severe            0
Contact_Dont-Know          0
Contact_No                 0
Contact_Yes                0
Country                    0
dtype: int64


In [8]:
print("\nSeverity Distribution:\n", df[['Severity_Mild', 'Severity_Moderate', 'Severity_Severe', 'Severity_None']].sum())



Severity Distribution:
 Severity_Mild        79200
Severity_Moderate    79200
Severity_Severe      79200
Severity_None        79200
dtype: int64


In [9]:
df["Has_Disease"] = (df["Severity_Mild"] == 1) | (df["Severity_Moderate"] == 1) | (df["Severity_Severe"] == 1)
df["Has_Disease"] = df["Has_Disease"].astype(int)


In [10]:
df = df.drop(columns=["Severity_Mild", "Severity_Moderate", "Severity_None", "Severity_Severe", "Country"])

In [12]:
X = df.drop(columns="Has_Disease", axis=1)
Y = df["Has_Disease"]
print("\nFeature Data (X):\n", X.head())
print("\nTarget Data (Y):\n", Y.head())


Feature Data (X):
    Fever  Tiredness  Dry-Cough  Difficulty-in-Breathing  Sore-Throat  \
0      1          1          1                        1            1   
1      1          1          1                        1            1   
2      1          1          1                        1            1   
3      1          1          1                        1            1   
4      1          1          1                        1            1   

   None_Sympton  Pains  Nasal-Congestion  Runny-Nose  Diarrhea  ...  \
0             0      1                 1           1         1  ...   
1             0      1                 1           1         1  ...   
2             0      1                 1           1         1  ...   
3             0      1                 1           1         1  ...   
4             0      1                 1           1         1  ...   

   Age_10-19  Age_20-24  Age_25-59  Age_60+  Gender_Female  Gender_Male  \
0          0          0          0        0  

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

print("\nData Split Shape:", X.shape, X_train.shape, X_test.shape)




Data Split Shape: (316800, 22) (253440, 22) (63360, 22)


In [14]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

In [15]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

print("\nAccuracy on Training Data:", training_data_accuracy)
print("Accuracy on Test Data:", test_data_accuracy)


Accuracy on Training Data: 0.75
Accuracy on Test Data: 0.75


In [16]:
def predict_disease(symptoms):
    input_data_as_numpy_array = np.asarray(symptoms).reshape(1, -1)
    prediction = model.predict(input_data_as_numpy_array)

    if prediction[0] == 0:
        return "The Person does NOT have the Disease"
    else:
        return "The Person HAS the Disease"

In [20]:
sample_symptoms = [
    1,  # Fever
    0,  # Tiredness
    1,  # Dry-Cough
    0,  # Difficulty-in-Breathing
    1,  # Sore-Throat
    0,  # None_Symptom
    0,  # Pains
    0,  # Nasal-Congestion
    0,  # Runny-Nose
    0,  # Diarrhea
    0,  # None_Experiencing
    0,  # Age_0-9
    0,  # Age_10-19
    0,  # Age_20-24
    1,  # Age_25-59
    0,  # Age_60+
    0,  # Gender_Female
    1,  # Gender_Male
    0,  # Gender_Transgender
    0,  # Contact_Dont-Know
    0,  # Contact_No
    1   # Contact_Yes
]

print("\nPrediction Example:", predict_disease(sample_symptoms))



Prediction Example: The Person HAS the Disease


