# Dependencies and Setup
import pandas as pd
import requests
import glob
import matplotlib.pyplot as plt
import numpy as np 

COVID-19 Case Surveillance Public Use Data with Geography
https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data-with-Ge/n8mc-b4w4/about_data

State data availability
According to the COVID-19 Data Dispatch (https://coviddatadispatch.com), as of March 26, 2022:

    28 states and the District of Columbia are reporting COVID-19 cases in K-12 schools, in some form
    11 states are reporting incomplete data on school outbreaks or cases in school-aged children
    24 states are separating out school case counts by students and staff
    2 states are reporting COVID-19 tests conducted for school students and staff (New York and Massachusetts)
    8 states are reporting in-person enrollment (Connecticut, Delaware, Hawaii, New Jersey, New York, Texas, Utah, West Virginia) 

https://coviddatadispatch.com/k-12-schools-data-in-the-u-s/

# Download records from the health.data.ny.gov JSON endpoint, clean the
# column types, order the rows, and save the result as a CSV file.
url = 'https://health.data.ny.gov/resource/977p-3txa.json'
# '$limit' sets the maximum number of rows returned by the request
# (presumably sized to cover the whole dataset in one call -- TODO confirm)
params = {'$limit': 730492}

response = requests.get(url, params=params)
# Fail fast on an HTTP error instead of building a DataFrame from an error payload
response.raise_for_status()
data_json = response.json()
NYData_df = pd.DataFrame(data_json)

# Convert 'report_date' to datetime
NYData_df['report_date'] = pd.to_datetime(NYData_df['report_date'])

# Convert the count columns to integers
for count_column in ('positive_students', 'number_of_students', 'total_tests'):
    NYData_df[count_column] = NYData_df[count_column].astype(int)

# Sort the DataFrame by 'school_beds_code' and then 'report_date', both ascending
NYData_df = NYData_df.sort_values(by=['school_beds_code', 'report_date'], ascending=[True, True])

# Reset the index so it follows the new row order
NYData_df.reset_index(drop=True, inplace=True)

# Saving as CSV (the 'clean_data' directory must already exist)
NYData_df.to_csv('clean_data/NY_case_data.csv', index=False)

# Directory containing the CDC-reported case CSV files (described as cases
# for children aged 0-17; no age filter is applied in this step)
schoolCDC_path = 'CDC_data/'

# Collect every CSV file in that directory. glob returns paths in arbitrary
# order, so sort them to keep the row order of the combined frame reproducible.
schoolCDC_csv = sorted(glob.glob(schoolCDC_path + '*.csv'))

# Stop with a clear message when nothing matched, rather than letting
# pd.concat fail on an empty list
if not schoolCDC_csv:
    raise FileNotFoundError(f"No CSV files found in '{schoolCDC_path}'")

# Read each CSV file into its own DataFrame
list_of_dfs = [pd.read_csv(file) for file in schoolCDC_csv]

# Concatenate all DataFrames in the list into a single DataFrame
schoolCDC_df = pd.concat(list_of_dfs, ignore_index=True)
schoolCDC_df.head()

# Restrict the frame to the four columns of interest
keep_columns = ['case_month', 'res_state', 'age_group', 'current_status']
schoolCDC_df = schoolCDC_df[keep_columns]

# Preview the first 20 rows
schoolCDC_df.head(20)

# Count the rows for every ('res_state', 'case_month') pair and expose the
# tally as a regular column named 'count_of_cases'
state_month_groups = schoolCDC_df.groupby(['res_state', 'case_month'])
schoolCDCgrouped = state_month_groups.size().reset_index(name='count_of_cases')

# Preview the first 50 rows of the per-state, per-month counts
schoolCDCgrouped.head(50)

# Count the rows for each 'res_state' across all months.
# NOTE: despite the variable name, no per-year grouping happens here.
cases_per_state = schoolCDC_df.groupby('res_state').size()
schoolCDCbyYear = cases_per_state.reset_index(name='Total_cases')

# Preview the first 50 rows of the per-state totals
schoolCDCbyYear.head(50)