# Read in data

In [37]:
import pandas as pd

# Location of the biweekly CSV, relative to the notebook's working directory
BIWEEKLY_CSV_PATH = './Data/full_biweekly.csv'

# Load the whole file into one dataframe and preview the first rows
full_df = pd.read_csv(BIWEEKLY_CSV_PATH)
full_df.head()

Unnamed: 0,iso_date,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,...,Delta,Alpha,non_who,Lambda,Mu,Omicron,year_variant,month_variant,day_variant,iso_date_variant
0,ABW_2021-02-08,,,,,,,,,,...,0.0,7.89,91.23,0.0,0.0,0.0,2021,2021,8,ABW_2021-02-08
1,ABW_2021-02-22,,,,,,,,,,...,0.0,24.03,66.88,0.0,0.0,0.0,2021,2021,22,ABW_2021-02-22
2,ABW_2021-03-08,,,,,,,,,,...,0.0,52.0,28.8,0.0,0.0,0.0,2021,2021,8,ABW_2021-03-08
3,ABW_2021-03-22,,,,,,,,,,...,0.0,66.67,11.11,0.0,0.0,0.0,2021,2021,22,ABW_2021-03-22
4,ABW_2021-04-05,ABW,North America,Aruba,2021-04-05,,,,,,...,0.0,80.43,3.27,0.0,0.0,0.0,2021,2021,5,ABW_2021-04-05


# Store each country's data in an individual dataframe

In [38]:
# Dictionary that stores a dataframe for each country. Key is iso_code.
#
# A single groupby pass replaces the row-by-row scan:
#   * sort=False keeps the countries in order of first appearance in full_df
#   * rows whose iso_code is missing (NaN) are skipped by groupby, so no empty
#     dataframe ends up stored under a NaN key
#   * each group keeps the original row index and row order from full_df
country_dfs = {
    iso_code: country_rows.copy()
    for iso_code, country_rows in full_df.groupby('iso_code', sort=False)
}
header = full_df.columns  # Stores column names

print(country_dfs['ABW'])

          iso_date iso_code      continent location        date  total_cases  \
4   ABW_2021-04-05      ABW  North America    Aruba  2021-04-05          NaN   
5   ABW_2021-04-19      ABW  North America    Aruba  2021-04-19          NaN   
6   ABW_2021-05-03      ABW  North America    Aruba  2021-05-03          NaN   
7   ABW_2021-05-17      ABW  North America    Aruba  2021-05-17          NaN   
8   ABW_2021-05-31      ABW  North America    Aruba  2021-05-31          NaN   
9   ABW_2021-06-14      ABW  North America    Aruba  2021-06-14          NaN   
10  ABW_2021-07-26      ABW  North America    Aruba  2021-07-26          NaN   
11  ABW_2021-08-09      ABW  North America    Aruba  2021-08-09          NaN   
12  ABW_2021-08-23      ABW  North America    Aruba  2021-08-23          NaN   
13  ABW_2021-09-06      ABW  North America    Aruba  2021-09-06          NaN   
14  ABW_2021-09-20      ABW  North America    Aruba  2021-09-20          NaN   
15  ABW_2021-10-04      ABW  North Ameri

# Find groups of consecutive biweekly data 
Right now, TIMESTEP = 5, so each group spans 5 consecutive 14-day gaps (6 rows). This can be changed by setting `TIMESTEP` in the code below.

In [55]:
from datetime import date

def get_days_between_rows(date1_str, date2_str):
    """Return the signed number of days from ``date1_str`` to ``date2_str``.

    Both arguments are hyphen-separated ``year-month-day`` strings
    (e.g., ``2021-11-01``). The result is positive when ``date2_str`` is the
    later date and negative when it is the earlier one.
    """
    # Break both strings into their hyphen-separated fields up front
    first_fields, second_fields = date1_str.split('-'), date2_str.split('-')

    # The first three fields are year, month and day, in that order
    start = date(*(int(first_fields[i]) for i in range(3)))
    end = date(*(int(second_fields[i]) for i in range(3)))

    # Subtracting dates gives a timedelta; only its whole-day count is needed
    return (end - start).days

def find_consecutive_windows(df, timestep, gap_days=14):
    """Collect groups of evenly spaced consecutive rows from one country's dataframe.

    Adjacent rows are compared using their 'date' strings. Every time
    `timestep` adjacent pairs in a row are exactly `gap_days` apart, the
    `timestep + 1` rows they cover are copied into a new dataframe. The search
    then restarts at the following pair, so the last row of one group can be
    the first row of the next group.

    Parameters:
        df: dataframe with a 'date' column of 'YYYY-MM-DD' strings, in the
            order the rows should be compared.
        timestep: number of consecutive `gap_days` gaps that make a group.
        gap_days: required number of days between adjacent rows (default 14).

    Returns:
        A list of dataframes (possibly empty). Each one holds `timestep + 1`
        rows and keeps the index labels and column dtypes of `df`.
    """
    windows = []
    dates = df['date'].tolist()
    num_consecutive = 0

    # Walk over adjacent row pairs by position: (0, 1), (1, 2), ...
    for pos in range(1, len(dates)):
        if get_days_between_rows(dates[pos - 1], dates[pos]) == gap_days:
            num_consecutive += 1

            # If we have found `timestep` consecutive pairs, we have a complete dataset
            if num_consecutive == timestep:
                # The pairs cover the rows at positions pos - timestep .. pos.
                # Slicing df directly avoids collecting each row twice and
                # dropping duplicates afterwards, and keeps the column dtypes.
                windows.append(df.iloc[pos - timestep:pos + 1].copy())

                # Reset so that we can look for a new group of consecutive rows
                num_consecutive = 0
        else:
            # Reset because encountered a row that is not `gap_days` days away
            num_consecutive = 0

    return windows


# Dictionary of timestep data. Keys are iso_code. Value is a list of dataframes, where each
# dataframe contains 5 rows of data and a sixth row, which is what we need to predict.
# TODO: remove everything in the sixth row but the value we want to predict.
timestep_dfs = {}
TIMESTEP = 5

# Get list of all country names
countries = country_dfs.keys()

# Iterate over all countries
for country in countries:
    # Look at df for current country
    df = country_dfs[country]

    country_windows = find_consecutive_windows(df, TIMESTEP)

    # Only countries with at least one complete group get an entry
    if country_windows:
        timestep_dfs[country] = country_windows

print(timestep_dfs['ABW'])

[         iso_date iso_code      continent location        date  total_cases  \
4  ABW_2021-04-05      ABW  North America    Aruba  2021-04-05          NaN   
5  ABW_2021-04-19      ABW  North America    Aruba  2021-04-19          NaN   
6  ABW_2021-05-03      ABW  North America    Aruba  2021-05-03          NaN   
7  ABW_2021-05-17      ABW  North America    Aruba  2021-05-17          NaN   
8  ABW_2021-05-31      ABW  North America    Aruba  2021-05-31          NaN   
9  ABW_2021-06-14      ABW  North America    Aruba  2021-06-14          NaN   

   new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  Alpha  \
4        NaN                 NaN           NaN         NaN  ...   0.00  80.43   
5        NaN                 NaN           NaN         NaN  ...   9.66  66.21   
6        NaN                 NaN           NaN         NaN  ...   1.43  61.43   
7        NaN                 NaN           NaN         NaN  ...   9.90  53.47   
8        NaN                 NaN        