# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set style: apply seaborn's "whitegrid" theme globally, so it is in effect
# for the figures created later in this script.
sns.set(style="whitegrid")

# Load the dataset.
# Everything below depends on `df`, so a missing file must stop the run here:
# print the friendly hint, then re-raise instead of continuing and failing
# later with an unrelated NameError on `df`.
try:
    df = pd.read_csv('owid-covid-data.csv')
except FileNotFoundError:
    print("Error: Dataset not found. Please make sure 'owid-covid-data.csv' is in your working directory.")
    raise
else:
    print("Dataset loaded successfully.")

# Preview the data.
# A bare `df.head()` in the middle of a script/cell is evaluated and thrown
# away, so print it explicitly.
print(df.head())

# Explore structure and check for missing values.
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print("\nMissing values (Top 10):")
print(df.isnull().sum().sort_values(ascending=False).head(10))

# Filter and clean data
countries = ['Kenya', 'United States', 'India']

# Take an explicit copy of the filtered rows: assigning a column to a
# boolean-mask slice triggers SettingWithCopyWarning and may not behave the
# same across pandas versions.
df = df[df['location'].isin(countries)].copy()
df['date'] = pd.to_datetime(df['date'])

# Keep only rows where both total_cases and total_deaths are present, then
# replace every remaining missing value with 0. Chaining the non-inplace
# fillna yields a fresh, independent frame instead of mutating a derived one.
df_cleaned = df.dropna(subset=['total_cases', 'total_deaths']).fillna(0)

# Line chart: total_cases against date, one line per location
cases_fig, cases_ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    x='date',
    y='total_cases',
    hue='location',
    data=df_cleaned,
    ax=cases_ax,
)
cases_ax.set_title('Total COVID-19 Cases Over Time')
cases_ax.set_xlabel('Date')
cases_ax.set_ylabel('Total Cases')
cases_fig.tight_layout()
plt.show()

# Line chart: total_deaths against date, one line per location
plt.figure(figsize=(10, 5))
deaths_ax = sns.lineplot(
    data=df_cleaned,
    x='date',
    y='total_deaths',
    hue='location',
)
# lineplot returns the Axes it drew on; label it in a single call.
deaths_ax.set(
    title='Total COVID-19 Deaths Over Time',
    xlabel='Date',
    ylabel='Total Deaths',
)
plt.tight_layout()
plt.show()

# Calculate death rate (total_deaths / total_cases) per row.
# A case count of 0 would produce inf (or NaN for 0/0) and poison the mean,
# so mask non-positive counts to NaN; mean() skips NaN by default.
reported_cases = df_cleaned['total_cases'].where(df_cleaned['total_cases'] > 0)

# assign() returns a new frame, which avoids writing a column into a frame
# that was derived from another one (SettingWithCopyWarning).
df_cleaned = df_cleaned.assign(death_rate=df_cleaned['total_deaths'] / reported_cases)

# Average only the death_rate column: including 'date' in the aggregation
# gives version-dependent results (a meaningless mean date, or a dropped
# column) depending on the pandas release.
death_rate_summary = (
    df_cleaned.groupby('location')[['death_rate']]
    .mean()
    .sort_values(by='death_rate', ascending=False)
)
print("\nAverage Death Rate by Country:")
print(death_rate_summary)

# Line chart: Total vaccinations over time
# The cleaning step replaced every missing value with 0, so any row without a
# reported total_vaccinations now holds 0. Drawing those rows would pull the
# line down to zero wherever a report is missing; plot only positive totals.
vaccination_rows = df_cleaned[df_cleaned['total_vaccinations'] > 0]

plt.figure(figsize=(10, 5))
sns.lineplot(data=vaccination_rows, x='date', y='total_vaccinations', hue='location')
plt.title('Total Vaccinations Over Time')
plt.xlabel('Date')
plt.ylabel('Total Vaccinations')
plt.tight_layout()
plt.show()

# Choropleth map: Total cases by country (latest snapshot)
# NOTE(review): `df` was narrowed to the `countries` list during cleaning, so
# only those locations are coloured on the map — confirm this is intended.
#
# Pick, per location, the most recent row that actually has a total_cases
# value: the newest row by date alone may have no value, which would leave
# the country uncoloured. Sorting + drop_duplicates(keep='last') replaces the
# per-group apply/lambda, which is slower and emits a deprecation warning on
# recent pandas releases.
latest = (
    df.dropna(subset=['total_cases'])
    .sort_values(['location', 'date'])
    .drop_duplicates(subset='location', keep='last')
    .reset_index(drop=True)
)

fig = px.choropleth(latest,
                    locations="iso_code",
                    color="total_cases",
                    hover_name="location",
                    color_continuous_scale="Reds",
                    title="Total COVID-19 Cases by Country")
fig.show()

# 💡 Insights
"""
- India showed sharp increases in early waves, but the USA leads in total case count.
- Kenya has had a slower but consistent vaccination rollout.
- Death rates vary significantly between countries, highlighting healthcare disparities.
- Visualizing data over time reveals patterns that align with known global COVID-19 waves.
"""
