In [None]:
"The goal is to predict the urgency with which a COVID-19 patient will need to be admitted to the " \
"hospital from the time of onset of symptoms. The dataset contains some COVID-19 symptoms and " \
"demographic information. This dataset was collected in the peak of a COVID-19 wave and hence may " \
"have some errors and missing data. The aim of this exercise is to pre-process the given data and " \
"create a clean dataset to use for modelling and inference. Additionally, you are required to " \
"perform some EDA to understand the data better."

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("covid.csv")

# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

In [None]:
# Check missing values

# Count the missing entries per column. The summary stays readable, whereas
# the raw boolean mask from df.isnull() prints one row per record.
df.isnull().sum()

In [None]:
### edTest(test_na) ###

# Find the number of rows with missing values:
# flag each row that has at least one missing cell, then count the flags.
rows_with_missing = df.isna().any(axis="columns")
num_null = rows_with_missing.sum()
print(f"Number of rows with null values: {num_null}")

In [49]:
# kNN impute the missing data
# Use a k value of 5

# Separate the predictors from the response so that only the predictors are
# passed through the imputer; 'Urgency' is carried over unchanged.
x = df.drop('Urgency', axis=1)
y = df['Urgency']

imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare NumPy array. Restore both the column labels and
# the original row index: pd.concat(axis=1) aligns on the index, so without
# index=x.index the rows would only line up with y when df happens to carry
# the default 0..n-1 index.
x_imputed = pd.DataFrame(
    imputer.fit_transform(x),
    columns=x.columns,
    index=x.index,
)

df_imputed = pd.concat([x_imputed, y], axis=1)

In [50]:
### edTest(test_impute) ###
# Replace the original dataframe with the imputed data, continue to use df for the dataframe

# Bind df to an independent copy: a plain `df = df_imputed` would make both
# names refer to the same object, so any later in-place edit of df would
# silently change df_imputed as well.
df = df_imputed.copy()

In [None]:
# Age brackets for the chart. pd.cut builds right-closed intervals, so the
# first bracket is (20, 30]; ages of 20 or below, or above 90, match no
# bracket and are left out of the counts.
bins = [20, 30, 40, 50, 60, 70, 80, 90]
labels = ['20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90']

# Bucket the ages in a standalone Series instead of adding an 'AgeGroup'
# column to df. The column was only needed for this plot, but once stored on
# df it ended up in the train/test split further down and made the kNN
# imputation cell fail on re-run (the imputer cannot handle string categories).
high_urgency = df[df['Urgency'] == 1]
age_group = pd.cut(high_urgency['age'], bins=bins, labels=labels).rename('AgeGroup')

# observed=True keeps only the brackets that actually occur in the data.
urgency_count = high_urgency.groupby(age_group, observed=True).size()

fig, ax = plt.subplots(figsize=(10, 6))
urgency_count.plot(kind="bar", ax=ax, color="skyblue", edgecolor="black", rot=45)
ax.set_title("Number of High Urgency Cases by Age Group", fontsize=16)
ax.set_xlabel("Age Group", fontsize=14)
ax.set_ylabel("Number of High Urgency Cases", fontsize=14)
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Select the symptom columns by position.
# NOTE(review): assumes the symptom indicators sit at column positions 2-6;
# confirm against the CSV header that no symptom column is left out.
symptoms = df.columns[2:7]

# Column-wise totals: one value per selected symptom.
symptoms_count = df.loc[:, symptoms].sum()

# Draw the totals as a bar chart on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
symptoms_count.plot(kind="bar", ax=ax, color="skyblue", edgecolor="black", rot=45)
ax.set_title("Number of Symptoms Present", fontsize=16)
ax.set_xlabel("Symptoms", fontsize=14)
ax.set_ylabel("Number of Cases", fontsize=14)
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Total the "cough" column within each urgency level
symptom_name = "cough"
symptom_counts = df.groupby("Urgency")[symptom_name].sum()

# Look the totals up by urgency *label* (0 and 1), not by position. Plain
# symptom_counts[0] is ambiguous between label and positional access on a
# Series and raises KeyError when a level is absent; reindexing guarantees
# both levels exist, with a total of 0 for a missing one.
urgency_totals = symptom_counts.reindex([0, 1], fill_value=0)

# Define data for the bars
labels = ["Non-Urgent", "Urgent"]
values = [urgency_totals.loc[0], urgency_totals.loc[1]]
colors = ["blue", "red"]

# Create the bar plot
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=colors, width=0.5)

# Add titles and labels
ax.set_title(f"Occurrence of '{symptom_name}' by Urgency", fontsize=16)
ax.set_ylabel("Count of Patients", fontsize=14)
ax.set_xlabel("Urgency Level", fontsize=14)

# Add value labels above bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height + 0.5,  # Position above the bar
            f'{int(height)}', ha='center', va='bottom', fontsize=12, color="black")

# Add the grid on this Axes before tightening the layout, so the layout
# pass sees the finished figure.
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [60]:
### edTest(test_split) ###
# Split the data into train and test sets with 70% of the rows for training
# (test_size=0.3 holds out the remaining 30% as the test set).
# random_state=60 fixes the shuffle so the same split is produced on every run.

# df_train receives the training rows and df_test the held-out rows.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=60)

In [None]:
# # Save the train data into a csv called "covid_train.csv"
# # Remember to not include the default indices

# df_train.to_csv('covid_train.csv', index=False)

# # Save the test data into a csv called "covid_test.csv"
# # Remember to not include the default indices

# df_test.to_csv('covid_test.csv', index=False)