<a href="https://colab.research.google.com/github/chela-lavin/My-list/blob/main/COVID_19_Data_Analysis_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas
from datetime import datetime

# Load the COVID-19 time series published by Johns Hopkins University CSSE.
# raw.githubusercontent.com paths are case-sensitive: the directory is
# 'csse_covid_19_time_series' (lower case, underscores only).
_JHU_TIME_SERIES_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_{}_global.csv'
)
try:
    confirmed_df = pd.read_csv(_JHU_TIME_SERIES_URL.format('confirmed'))
    deaths_df = pd.read_csv(_JHU_TIME_SERIES_URL.format('deaths'))
    recovered_df = pd.read_csv(_JHU_TIME_SERIES_URL.format('recovered'))
except Exception as e:
    # Nothing below can run without the data, so stop here. SystemExit(1)
    # reports the failure through the exit status and, unlike the bare
    # exit() helper, does not depend on the 'site' module being loaded.
    print(f"Error loading data: {e}")
    raise SystemExit(1) from e

# 1. Data Cleaning and Transformation
def clean_data(df, data_type):
    """
    Reshape a wide JHU-style time-series table into long format.

    The input is expected to hold one column per date (named like
    '1/22/20') next to the identifying columns 'Country/Region', 'Lat'
    and 'Long'. The caller's DataFrame is not modified.

    Args:
        df (pd.DataFrame): The wide input DataFrame.
        data_type (str): Name given to the value column, e.g. 'Confirmed',
            'Deaths', or 'Recovered'.

    Returns:
        pd.DataFrame: One row per (location, date) with the columns
        'Country/Region', 'Lat', 'Long', 'Date' and `data_type`.
    """
    # errors='ignore' turns the drop into a no-op when the column is absent.
    wide = df.drop(columns=['Province/State'], errors='ignore')

    # Every column that is not an identifier is treated as a date column.
    tidy = wide.melt(
        id_vars=['Country/Region', 'Lat', 'Long'],
        var_name='Date',
        value_name=data_type,
    )

    # Column headers such as '1/22/20' become proper timestamps.
    tidy = tidy.assign(Date=pd.to_datetime(tidy['Date'], format='%m/%d/%y'))

    # Missing counts are treated as zero.
    tidy[data_type] = tidy[data_type].fillna(0)
    return tidy

# Clean the dataframes
confirmed_melted = clean_data(confirmed_df, 'Confirmed')
deaths_melted = clean_data(deaths_df, 'Deaths')
recovered_melted = clean_data(recovered_df, 'Recovered')

# Collapse every dataset to one row per country and date BEFORE merging.
# With 'Province/State' dropped, several rows may share the same
# (country, Lat, Long, date) key; merging row-level frames on that key pairs
# each duplicate with every other duplicate and inflates the sums taken
# afterwards. Joining on float coordinates is also fragile: a row is lost
# whenever one file lacks the exact same location as the others.
group_keys = ['Country/Region', 'Date']
confirmed_by_country = confirmed_melted.groupby(group_keys, as_index=False).agg(
    # Coordinates are averaged into one representative point per country;
    # adding latitudes/longitudes together has no geographic meaning.
    {'Lat': 'mean', 'Long': 'mean', 'Confirmed': 'sum'}
)
deaths_by_country = deaths_melted.groupby(group_keys, as_index=False)['Deaths'].sum()
recovered_by_country = recovered_melted.groupby(group_keys, as_index=False)['Recovered'].sum()

# Left joins keep every country/date present in the confirmed series even if
# it is missing from the deaths or recovered file; those gaps become 0, which
# matches how clean_data treats missing counts.
combined_df = confirmed_by_country.merge(deaths_by_country, on=group_keys, how='left')
combined_df = combined_df.merge(recovered_by_country, on=group_keys, how='left')
combined_df[['Deaths', 'Recovered']] = combined_df[['Deaths', 'Recovered']].fillna(0)

# Derive daily figures from the cumulative series. Rows are ordered by
# country and date (groupby sorts its keys and the left joins keep that
# order), so diff() compares consecutive days of the same country.
for metric in ('Confirmed', 'Deaths', 'Recovered'):
    daily = combined_df.groupby('Country/Region')[metric].diff().fillna(0)
    # Downward revisions of the cumulative totals would yield negative daily
    # values; cap those at 0.
    combined_df[f'Daily {metric}'] = daily.clip(lower=0)


# 2. Exploratory Data Analysis and Visualization

def plot_time_trends(df, country, data_type='Confirmed'):
    """
    Draw a line chart of one metric over time for a single country.

    Prints a message and returns without plotting when `df` holds no rows
    for `country`.

    Args:
        df (pd.DataFrame): The combined DataFrame; must provide the columns
            'Country/Region', 'Date' and the one named by `data_type`.
        country (str): Value to match in the 'Country/Region' column.
        data_type (str): Column to plot, e.g. 'Confirmed', 'Deaths',
            'Recovered', 'Daily Confirmed', 'Daily Deaths' or
            'Daily Recovered'.
    """
    belongs_to_country = df['Country/Region'] == country
    # Work on an independent copy so the caller's frame is never touched.
    series = df[belongs_to_country].copy()

    # Guard clause: nothing to draw for an unknown country.
    if series.empty:
        print(f"No data found for {country}")
        return

    plt.figure(figsize=(12, 6))
    plt.plot(series['Date'], series[data_type], label=data_type)
    plt.xlabel('Date')
    plt.ylabel(data_type)
    plt.title(f'{data_type} Trend for {country}')
    plt.grid(True)
    plt.legend()
    plt.show()

def plot_top_countries(df, date, top_n=10, data_type='Confirmed'):
    """
    Draw a bar chart of the countries ranking highest for a metric on a day.

    Prints a message and returns without plotting when `date` is not a
    valid 'YYYY-MM-DD' string or when `df` has no rows for that day.

    Args:
        df (pd.DataFrame): The combined DataFrame; 'Date' must be a
            datetime column.
        date (str): The day to inspect, in 'YYYY-MM-DD' format.
        top_n (int): How many countries to show.
        data_type (str): Column to rank by: 'Confirmed', 'Deaths', or
            'Recovered'.
    """
    try:
        target_day = datetime.strptime(date, '%Y-%m-%d').date()
    except ValueError:
        print("Invalid date format. Please use YYYY-MM-DD.")
        return

    # Keep only the rows whose calendar day matches the requested one.
    on_target_day = df['Date'].dt.date == target_day
    snapshot = df[on_target_day]
    if snapshot.empty:
        print(f"No data available for {date}")
        return

    # The rows with the largest values come back in descending order.
    leaders = snapshot.nlargest(top_n, data_type)

    plt.figure(figsize=(12, 6))
    sns.barplot(x='Country/Region', y=data_type, data=leaders)
    plt.xlabel('Country/Region')
    plt.ylabel(data_type)
    plt.title(f'Top {top_n} Countries by {data_type} on {date}')
    # Slanted labels keep long country names readable.
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y')
    plt.show()



# Download location of the Natural Earth 1:110m country polygons, used when
# the installed geopandas no longer bundles the 'naturalearth_lowres' sample.
_NATURAL_EARTH_URL = (
    'https://naciscdn.org/naturalearth/110m/cultural/'
    'ne_110m_admin_0_countries.zip'
)

# Country labels used by the JHU CSSE files that differ from the Natural
# Earth 'name' column.
# NOTE(review): Natural Earth names vary slightly between dataset versions;
# extend or adjust this table if countries are missing from the map.
_JHU_TO_NATURAL_EARTH_NAMES = {
    'US': 'United States of America',
    'Korea, South': 'South Korea',
    'Taiwan*': 'Taiwan',
    'Burma': 'Myanmar',
    'Congo (Kinshasa)': 'Dem. Rep. Congo',
    'Congo (Brazzaville)': 'Congo',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Bosnia and Herzegovina': 'Bosnia and Herz.',
    'Central African Republic': 'Central African Rep.',
    'Dominican Republic': 'Dominican Rep.',
    'Equatorial Guinea': 'Eq. Guinea',
    'South Sudan': 'S. Sudan',
    'Eswatini': 'eSwatini',
}


def _load_world_map():
    """Return country polygons with the country name in a 'name' column."""
    try:
        path = geopandas.datasets.get_path('naturalearth_lowres')
    except AttributeError:
        # geopandas >= 1.0 removed the bundled datasets; read the same
        # Natural Earth layer from its download location instead. That file
        # uses upper-case attribute names.
        world = geopandas.read_file(_NATURAL_EARTH_URL)
        return world.rename(columns={'NAME': 'name'})
    return geopandas.read_file(path)


def plot_global_map(df, date, data_type='Confirmed'):
    """
    Plots a global choropleth map of COVID-19 data for a specific date.

    Prints a message and returns without plotting when `date` is not a
    valid 'YYYY-MM-DD' string, when `df` has no rows for that day, or when
    none of the countries can be matched to a map polygon.

    Args:
        df (pd.DataFrame): The combined DataFrame; 'Date' must be a
            datetime column and 'Country/Region' must hold country names.
        date (str): The date in 'YYYY-MM-DD' format.
        data_type (str): 'Confirmed', 'Deaths', or 'Recovered'.
    """

    try:
        date_obj = datetime.strptime(date, '%Y-%m-%d').date()
    except ValueError:
        print("Invalid date format. Please use YYYY-MM-DD.")
        return

    # Filter for the specified date
    df_filtered = df[df['Date'].dt.date == date_obj]

    # Check if the date exists in the data.
    if df_filtered.empty:
        print(f"No data available for {date}")
        return

    # Load world map data
    world = _load_world_map()

    # Translate the labels that are spelled differently in the map data.
    # assign() returns a new frame, so the caller's data stays untouched.
    map_names = df_filtered['Country/Region'].replace(_JHU_TO_NATURAL_EARTH_NAMES)
    df_named = df_filtered.assign(**{'Country/Region': map_names})

    # 'Country/Region' holds country NAMES, so it has to be joined against
    # the map's 'name' column; joining against the ISO code column
    # ('iso_a3') can never match and always produced an empty map.
    merged_df = world.merge(df_named, left_on='name', right_on='Country/Region', how='inner')
    # Check if the merge was successful
    if merged_df.empty:
        print("No matching countries found for mapping.  Check country names.")
        return

    # Create the plot
    fig, ax = plt.subplots(1, 1, figsize=(15, 10))
    merged_df.plot(column=data_type, cmap='viridis', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
    ax.set_title(f'Global {data_type} on {date}')
    plt.show()



# 3. Analysis and Reporting
def generate_report(df, country):
    """
    Print a text summary for one country and plot all of its time series.

    The report covers the peak cumulative figures, the most recent daily
    figures, one trend plot per metric, and the share of deaths among
    confirmed cases taken from the last row. When `df` holds no rows for
    `country`, only the header and a "no data" message are printed.

    Args:
        df (pd.DataFrame): The combined DataFrame.
        country (str): Value to match in the 'Country/Region' column.
    """
    print(f"Report for {country}:")
    records = df[df['Country/Region'] == country].copy()  # independent copy
    if records.empty:
        print(f"No data found for {country}")
        return

    # Headline numbers: the maximum of each cumulative series.
    peak_confirmed = records['Confirmed'].max()
    peak_deaths = records['Deaths'].max()
    peak_recovered = records['Recovered'].max()
    print(f"  Total Confirmed Cases: {peak_confirmed}")
    print(f"  Total Deaths: {peak_deaths}")
    print(f"  Total Recovered: {peak_recovered}")

    # Most recent daily figures (last five rows).
    daily_columns = ['Date', 'Daily Confirmed', 'Daily Deaths', 'Daily Recovered']
    print("\n  Daily Trends:")
    print(records[daily_columns].tail())

    # One chart per metric, cumulative series first, then the daily ones.
    for metric in (
        'Confirmed',
        'Deaths',
        'Recovered',
        'Daily Confirmed',
        'Daily Deaths',
        'Daily Recovered',
    ):
        plot_time_trends(df, country, metric)

    # Mortality rate from the final row; skipped when that row does not
    # report a positive number of confirmed cases.
    latest = records.iloc[-1]
    if not latest['Confirmed'] > 0:
        print("\n  Mortality Rate: N/A (No confirmed cases)")
        return
    mortality_rate = (latest['Deaths'] / latest['Confirmed']) * 100
    print(f"\n  Mortality Rate: {mortality_rate:.2f}%")



# Example usage
# Runs only when the file is executed as a script; the data must already be
# loaded and combined by the statements above.
if __name__ == "__main__":
    # Text report plus trend charts for a single country.
    generate_report(combined_df, 'Italy')

    # Top-10 bar charts: one (date, metric) pair per chart.
    for snapshot_date, metric in (('2020-03-15', 'Confirmed'), ('2021-01-15', 'Deaths')):
        plot_top_countries(combined_df, snapshot_date, top_n=10, data_type=metric)

    # World maps: one (date, metric) pair per map.
    for snapshot_date, metric in (('2020-04-01', 'Confirmed'), ('2021-04-01', 'Deaths')):
        plot_global_map(combined_df, snapshot_date, data_type=metric)