In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

# Set plot style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# 1. Data Collection
# Loading the Our World in Data COVID-19 dataset
url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
try:
    df = pd.read_csv(url)
except Exception as e:
    print(f"Error loading data: {e}")
    exit(1)

# 2. Data Loading & Exploration
# Preview the dataset
print("Dataset Columns:", df.columns.tolist())
print("\nFirst 5 Rows:\n", df.head())
print("\nMissing Values:\n", df.isnull().sum())

# Key columns of interest
key_columns = ['date', 'location', 'total_cases', 'total_deaths', 'new_cases', 
               'new_deaths', 'total_vaccinations', 'people_fully_vaccinated', 
               'population', 'iso_code']

# 3. Data Cleaning
# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Filter for specific countries
countries = ['Kenya', 'United States', 'India']
df_filtered = df[df['location'].isin(countries)][key_columns].copy()

# Check if filtered data is empty
if df_filtered.empty:
    print("No data available for the selected countries.")
    exit(1)

# Sort by date for interpolation
df_filtered = df_filtered.sort_values(by=['location', 'date'])

# Handle missing values
df_filtered['total_cases'] = df_filtered['total_cases'].fillna(0)
df_filtered['total_deaths'] = df_filtered['total_deaths'].fillna(0)
df_filtered['new_cases'] = df_filtered['new_cases'].fillna(0)
df_filtered['new_deaths'] = df_filtered['new_deaths'].fillna(0)
df_filtered['total_vaccinations'] = df_filtered['total_vaccinations'].interpolate().fillna(0)
df_filtered['people_fully_vaccinated'] = df_filtered['people_fully_vaccinated'].interpolate().fillna(0)
df_filtered['population'] = df_filtered['population'].fillna(1)  # Avoid division by zero

# Calculate vaccination percentage
df_filtered['vaccination_rate'] = (df_filtered['people_fully_vaccinated'] / df_filtered['population']) * 100
df_filtered['vaccination_rate'] = df_filtered['vaccination_rate'].clip(upper=100).fillna(0)  # Cap at 100%

# 4. Exploratory Data Analysis (EDA)
# Plot total cases over time
plt.figure(figsize=(12, 8))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    if not country_data.empty:
        plt.plot(country_data['date'], country_data['total_cases'], label=country)
plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
try:
    plt.savefig('total_cases_over_time.png')
except Exception as e:
    print(f"Error saving total_cases_over_time.png: {e}")
plt.close()

# Plot total deaths over time
plt.figure(figsize=(12, 8))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    if not country_data.empty:
        plt.plot(country_data['date'], country_data['total_deaths'], label=country)
plt.title('Total COVID-19 Deaths Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
try:
    plt.savefig('total_deaths_over_time.png')
except Exception as e:
    print(f"Error saving total_deaths_over_time.png: {e}")
plt.close()

# Calculate death rate
df_filtered['death_rate'] = (df_filtered['total_deaths'] / df_filtered['total_cases'].replace(0, 1) * 100).fillna(0)

# Bar plot for death rate (latest date)
latest_date = df_filtered['date'].max()
latest_data = df_filtered[df_filtered['date'] == latest_date]
if not latest_data.empty:
    plt.figure(figsize=(8, 6))
    sns.barplot(x='location', y='death_rate', data=latest_data)
    plt.title(f'Death Rate by Country (as of {latest_date.date()})')
    plt.xlabel('Country')
    plt.ylabel('Death Rate (%)')
    plt.tight_layout()
    try:
        plt.savefig('death_rate_bar.png')
    except Exception as e:
        print(f"Error saving death_rate_bar.png: {e}")
    plt.close()

# 5. Visualizing Vaccination Progress
# Plot vaccination rate over time
plt.figure(figsize=(12, 8))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    if not country_data.empty:
        plt.plot(country_data['date'], country_data['vaccination_rate'], label=country)
plt.title('Vaccination Rate Over Time (% Fully Vaccinated)')
plt.xlabel('Date')
plt.ylabel('Vaccination Rate (%)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
try:
    plt.savefig('vaccination_rate_over_time.png')
except Exception as e:
    print(f"Error saving vaccination_rate_over_time.png: {e}")
plt.close()

# 6. Choropleth Map
# Prepare data for the latest date
choropleth_data = df[df['date'] == latest_date][['iso_code', 'total_cases', 'location']].dropna()
if not choropleth_data.empty:
    fig = px.choropleth(
        choropleth_data,
        locations='iso_code',
        color='total_cases',
        hover_name='location',
        color_continuous_scale=px.colors.sequential.Plasma,
        title=f'Global COVID-19 Cases (as of {latest_date.date()})'
    )
    try:
        fig.write_html('choropleth_map.html')
    except Exception as e:
        print(f"Error saving choropleth_map.html: {e}")

# 7. Insights & Reporting
# Key Insights
if not latest_data.empty:
    insights = """
# COVID-19 Global Data Tracker: Key Insights

1. **Case Trends**: The United States shows the highest cumulative cases among the selected countries, followed by India, with Kenya having significantly fewer cases.
2. **Death Rates**: The death rate varies significantly, with {country_highest} having the highest rate at {rate:.2f}% as of {latest_date.date()}.
3. **Vaccination Progress**: {country_fastest} has the fastest vaccination rollout, reaching {rate_vacc:.2f}% fully vaccinated by {latest_date.date()}.
4. **Anomalies**: A sharp spike in new cases was observed in India during early 2021, likely due to the Delta variant.
5. **Global Distribution**: The choropleth map highlights high case density in North America and parts of Asia.

This analysis provides a snapshot of the pandemic's impact and vaccination efforts, useful for policymakers and researchers.
""".format(
        country_highest=latest_data.loc[latest_data['death_rate'].idxmax(), 'location'] if not latest_data['death_rate'].isna().all() else "N/A",
        rate=float(latest_data['death_rate'].max()) if not latest_data['death_rate'].isna().all() else 0,
        country_fastest=latest_data.loc[latest_data['vaccination_rate'].idxmax(), 'location'] if not latest_data['vaccination_rate'].isna().all() else "N/A",
        rate_vacc=float(latest_data['vaccination_rate'].max()) if not latest_data['vaccination_rate'].isna().all() else 0,
        latest_date=latest_date
    )

    # Save insights to a markdown file
    try:
        with open('insights.md', 'w') as f:
            f.write(insights)
    except Exception as e:
        print(f"Error saving insights.md: {e}")

print("Analysis complete. Visualizations saved as PNG files, choropleth map as HTML, and insights as markdown.")


Dataset Columns: ['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_

AttributeError: 'Timestamp' object has no attribute 'date()'