# COVID-19 Data Analysis Project

## 1. Introduction
This notebook analyzes COVID-19 trends in the United States using data from Our World in Data (OWID). 
We focus on:
- Monthly infection peaks in 2023
- Hospitalization trends
- Statistical relationships between cases and deaths

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Apply one of matplotlib's bundled seaborn-like style sheets to every figure
# created below. NOTE(review): the 'seaborn-v0_8-*' names appear to require a
# reasonably recent matplotlib — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')

## 2. Data Loading and Cleaning
We load the pre-processed US dataset.

In [None]:
# Load Data
# Read the CSV and convert the `date` column to datetime in a single chain,
# so `df` is built in one step and the `.dt` accessor works in later cells.
df = (
    pd.read_csv('../data/cleaned_us_data.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Preview the first rows as the cell output.
df.head()

## 3. Analysis: Peak Infection Month (2023)

In [None]:
# Keep only rows dated in calendar year 2023; copy so that later edits to
# `df_2023` cannot write back into `df`.
is_2023 = df['date'].dt.year == 2023
df_2023 = df.loc[is_2023].copy()

# Total `new_cases` per calendar month (index is a monthly PeriodIndex).
month_of_row = df_2023['date'].dt.to_period('M')
monthly_cases = df_2023.groupby(month_of_row)['new_cases'].sum()

# Month with the largest total, and that total.
peak_month, peak_value = monthly_cases.idxmax(), monthly_cases.max()

print(f"Peak Month: {peak_month}")
print(f"Cases: {peak_value:,.0f}")

# Line chart of the monthly totals with a marker on each month.
monthly_cases.plot.line(marker='o', figsize=(10, 5), title='Monthly Cases 2023')
plt.show()

## 4. Hospitalization Analysis

In [None]:
# Plot the full `weekly_hosp_admissions` series against `date` using the
# explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    df['date'],
    df['weekly_hosp_admissions'],
    label='Weekly Admissions',
    color='red',
)
ax.set_title('Weekly Hospital Admissions (2020-Present)')
ax.set_xlabel('Date')
ax.set_ylabel('Admissions')
ax.legend()
plt.show()

## 5. Statistical Analysis

In [None]:
# Correlation
# Use only the rows where BOTH smoothed series are present. Filling missing
# values with 0 would invent observations (including spurious (0, 0) pairs and
# real values paired with a fake zero), which biases Pearson's r and its p-value.
paired = df[['new_cases_smoothed', 'new_deaths_smoothed']].dropna()

# pearsonr returns the correlation coefficient and a two-sided p-value; it
# raises ValueError when fewer than two paired observations remain.
# NOTE(review): the p-value assumes independent observations; consecutive rows
# of a smoothed time series are likely autocorrelated, so treat it as indicative only.
corr, p_val = stats.pearsonr(paired['new_cases_smoothed'], paired['new_deaths_smoothed'])
print(f"Correlation (Cases vs Deaths): {corr:.4f}")
print(f"P-value: {p_val:.4e}")

## 6. Advanced Analysis: Forecasting
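We fit an ARIMA(5, 1, 0) model to the most recent 365 non-missing, positive values of `new_cases_smoothed` (roughly the last year of data) and use it to forecast the next 30 days of cases.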

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Prepare data: drop missing values, keep strictly positive observations,
# then train on the last 365 remaining rows.
smoothed_cases = df['new_cases_smoothed'].dropna()
data = smoothed_cases[smoothed_cases > 0]
train_data = data.tail(365)

# Fit model on the raw values (the pandas index is not passed to ARIMA).
# order=(5, 1, 0) is (p, d, q): 5 autoregressive lags, first-order
# differencing, no moving-average terms.
model = ARIMA(train_data.values, order=(5, 1, 0))
model_fit = model.fit()

# Forecast the next `horizon` steps beyond the end of the training window.
horizon = 30
forecast = model_fit.forecast(steps=horizon)

# The x-axis is positional (0..n-1 for history, n..n+horizon-1 for the
# forecast) because the model was fitted on index-free values.
n_history = len(train_data)
history_x = range(n_history)
forecast_x = range(n_history, n_history + horizon)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history_x, train_data.values, label='History')
ax.plot(forecast_x, forecast, label='Forecast', color='red')
ax.set_title('30-Day Forecast (ARIMA)')
ax.legend()
plt.show()