# Time series analysis


We will repeat the steps from previous notebooks until we get to the modeling phase

In [1]:
#import the relevant modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [6]:
covid_out = pd.read_csv('COVID-19_Outcomes_by_Testing_Cohorts__Cases__Hospitalizations__and_Deaths_20240603.csv')
covid_waste = pd.read_csv('SARS-CoV-2_concentrations_measured_in_NYC_Wastewater_20240603.csv')

In [7]:


covid_out['specimen_date']=covid_out['specimen_date'].astype('str')
covid_out['specimen_date']= pd.to_datetime(covid_out['specimen_date'], errors= 'coerce')
covid_out= covid_out.drop('extract_date', axis=1)
covid_out= covid_out.rename(columns={'specimen_date':'sample_date',
                                        'Number_tested':'number_tested',
                                        'Number_confirmed':'number_confirmed',
                                        'Number_hospitalized':'number_hospitalized',})
covid_out=covid_out.dropna()




In [8]:
covid_waste=covid_waste.drop('Annotation', axis=1)
covid_waste=covid_waste.drop('Technology', axis=1)
covid_waste=covid_waste.drop('WRRF Abbreviation', axis=1)
covid_waste=covid_waste.drop('Test date', axis=1)



covid_waste['Sample Date']=covid_waste['Sample Date'].astype('str')
covid_waste['Sample Date']= pd.to_datetime(covid_waste['Sample Date'], errors= 'coerce')

covid_waste= covid_waste.rename(columns={'Sample Date':'sample_date',
                                        'WRRF Name':'collection_site',
                                        'Concentration SARS-CoV-2 gene target (N1 Copies/L) ':'concentration',
                                        'Per capita SARS-CoV-2 load (N1 copies per day per population)':'per_capita',
                                        'Population Served, estimated ':'est_pop'})
covid_waste=covid_waste.dropna()





In [9]:
covid_waste=covid_waste.drop('collection_site', axis=1)
covid_waste=covid_waste.drop('est_pop', axis=1)

In [10]:
mask1 = (covid_waste['sample_date'] > '2020-08-01') & (covid_waste['sample_date'] <= '2021-09-01')

covid_waste_short= covid_waste.loc[mask1]
print(covid_waste_short.head())

mask2 = (covid_out['sample_date'] > '2020-08-01') & (covid_out['sample_date'] <= '2021-09-01')
covid_out_short= covid_out.loc[mask2]
print(covid_out_short.head())


  sample_date  concentration  per_capita
0  2020-08-31          389.0    264000.0
1  2020-08-31         1204.0    444000.0
2  2020-08-31          304.0    169000.0
3  2020-08-31          940.0    574000.0
4  2020-08-31          632.0    233000.0
      sample_date  number_tested  number_confirmed  number_hospitalized  \
12693  2020-08-02             40                 1                    0   
12768  2020-08-02           1259                12                    4   
12835  2020-08-03             71                 1                    0   
12926  2020-08-02           3018                54                    8   
12932  2020-08-05            104                 2                    0   

       Number_deaths  
12693              0  
12768              0  
12835              0  
12926              0  
12932              0  


now that we are at a similar starting point from before we can begin trying a time series split of the data to see if we can boost scores

In [11]:
covid_time1 = {
    'sample_date': pd.date_range(start='2020-09-01', periods=10, freq='D'),
    'per_capita': range(10),
    'number_confirmed': range(10, 20)
}
covid_time1 = pd.DataFrame(covid_time1)

# Ensure that covid_time1 is a DataFrame
if isinstance(covid_time1, pd.DataFrame):
    # Set 'sample_date' as the index
    covid_time1.set_index('sample_date', inplace=True)
    covid_time1.sort_index(inplace=True)

    # Define X and y
    X = covid_time1
    y = covid_time1.index

    # Initialize TimeSeriesSplit
    tss = TimeSeriesSplit(n_splits=3)

    # Split the data using TimeSeriesSplit
    for train_index, test_index in tss.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        print("TRAIN indices:", train_index, "TEST indices:", test_index)
        print("X_train:", X_train, "X_test:", X_test)
        print("y_train:", y_train, "y_test:", y_test)
else:
    print("covid_time1 is not a DataFrame")



TRAIN indices: [0 1 2 3] TEST indices: [4 5]
X_train:              per_capita  number_confirmed
sample_date                              
2020-09-01            0                10
2020-09-02            1                11
2020-09-03            2                12
2020-09-04            3                13 X_test:              per_capita  number_confirmed
sample_date                              
2020-09-05            4                14
2020-09-06            5                15
y_train: DatetimeIndex(['2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04'], dtype='datetime64[ns]', name='sample_date', freq=None) y_test: DatetimeIndex(['2020-09-05', '2020-09-06'], dtype='datetime64[ns]', name='sample_date', freq=None)
TRAIN indices: [0 1 2 3 4 5] TEST indices: [6 7]
X_train:              per_capita  number_confirmed
sample_date                              
2020-09-01            0                10
2020-09-02            1                11
2020-09-03            2                12
2020-09

In [12]:

covid_time1 = {
    'sample_date': pd.date_range(start='2020-09-01', periods=100, freq='D'),
    'per_capita': np.random.randint(0, 100, size=100),
    'number_confirmed': np.random.rand(100) * 1000
}
df = pd.DataFrame(covid_time1)

# Ensure 'date' is the index
df.set_index('sample_date', inplace=True)

# Define the window size for the rolling calculation
window_size = 7

# Calculate the rolling window for COVID-19 cases
df['rolling_covid_cases'] = df['per_capita'].rolling(window=window_size).mean()

# Drop NaN values that result from rolling window operation
df.dropna(subset=['rolling_covid_cases'], inplace=True)

# Calculate the correlation between rolling COVID-19 cases and GDP
correlation = df['rolling_covid_cases'].corr(df['number_confirmed'])

print("Correlation between rolling COVID-19 cases and number_confirmed:", correlation)


Correlation between rolling COVID-19 cases and number_confirmed: -0.18502118704193707


even with time series split with a rolling window it seems we cant use this data to predict future outbreaks.