# Read in data

In [37]:
import pandas as pd

full_df = pd.read_csv('./Data/full_biweekly.csv')
full_df.head()

Unnamed: 0,iso_date,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,...,Delta,Alpha,non_who,Lambda,Mu,Omicron,year_variant,month_variant,day_variant,iso_date_variant
0,ABW_2021-02-08,,,,,,,,,,...,0.0,7.89,91.23,0.0,0.0,0.0,2021,2021,8,ABW_2021-02-08
1,ABW_2021-02-22,,,,,,,,,,...,0.0,24.03,66.88,0.0,0.0,0.0,2021,2021,22,ABW_2021-02-22
2,ABW_2021-03-08,,,,,,,,,,...,0.0,52.0,28.8,0.0,0.0,0.0,2021,2021,8,ABW_2021-03-08
3,ABW_2021-03-22,,,,,,,,,,...,0.0,66.67,11.11,0.0,0.0,0.0,2021,2021,22,ABW_2021-03-22
4,ABW_2021-04-05,ABW,North America,Aruba,2021-04-05,,,,,,...,0.0,80.43,3.27,0.0,0.0,0.0,2021,2021,5,ABW_2021-04-05


# Store each country's data in an individual dataframe

In [38]:
country_dfs = {}  # Dictionary that stores a dataframe for each country. Key is iso_code. 
header = full_df.columns  # Stores column names

# Iterate over the rows of the data frame
for index, row in full_df.iterrows():
    # Get the country name for the current row of data
    country = row['iso_code']
    # If the country name is not present in dictionary of country dataframes
    if country not in country_dfs:
        # Get all rows with the country name
        country_rows = full_df.loc[full_df['iso_code'] == country]
        # Put those rows into a dataframe and store in country_dfs
        country_dfs[country] = pd.DataFrame(country_rows)
        
print(country_dfs['ABW'])

          iso_date iso_code      continent location        date  total_cases  \
4   ABW_2021-04-05      ABW  North America    Aruba  2021-04-05          NaN   
5   ABW_2021-04-19      ABW  North America    Aruba  2021-04-19          NaN   
6   ABW_2021-05-03      ABW  North America    Aruba  2021-05-03          NaN   
7   ABW_2021-05-17      ABW  North America    Aruba  2021-05-17          NaN   
8   ABW_2021-05-31      ABW  North America    Aruba  2021-05-31          NaN   
9   ABW_2021-06-14      ABW  North America    Aruba  2021-06-14          NaN   
10  ABW_2021-07-26      ABW  North America    Aruba  2021-07-26          NaN   
11  ABW_2021-08-09      ABW  North America    Aruba  2021-08-09          NaN   
12  ABW_2021-08-23      ABW  North America    Aruba  2021-08-23          NaN   
13  ABW_2021-09-06      ABW  North America    Aruba  2021-09-06          NaN   
14  ABW_2021-09-20      ABW  North America    Aruba  2021-09-20          NaN   
15  ABW_2021-10-04      ABW  North Ameri

# Find groups of consecutive biweekly data 
Right now, timestep = 5. This can be changed by setting TIMESTEP=#. 

In [54]:
from datetime import date

def get_days_between_rows(date1_str, date2_str):
    # Split the date strings (e.g., 2021-11-01) on the hypen
    date1_split = date1_str.split('-')
    date2_split = date2_str.split('-')

    # Make date objects using year, month, date
    date1 = date(int(date1_split[0]), int(date1_split[1]), int(date1_split[2]))
    date2 = date(int(date2_split[0]), int(date2_split[1]), int(date2_split[2]))

    # Calculate days between the days
    time_between = date2 - date1
    days_between = time_between.days
    return days_between

# Dictionary of timestep data. Keys are iso_code. Value is a list of dataframes, where each 
# dataframe contains 5 rows of data and a sixth row, which is what we need to predict. 
# TODO: remove everything in the sixth row but the value we want to predict. 
timestep_dfs = {} 
TIMESTEP = 5

# Get list of all country names
countries = country_dfs.keys()

# Iterate over all countries
for country in countries:
    # Look at df for current country
    df = country_dfs[country]
    
    num_consecutive = 0
    consecutive_rows = []
    header = df.columns
    # Iterate over pairs of rows in the data frame
    
    for (index1,row1),(index2,row2) in zip(df[:-1].iterrows(),df[1:].iterrows()):
        # Get the number of days between the date strings (e.g., 2021-11-01)
        days_between = get_days_between_rows(row1['date'], row2['date'])
        
        # If there are 14 days between this pair of rows
        if days_between == 14:
            num_consecutive += 1
            # Add both rows to the consecutive rows list
            consecutive_rows.append(row1)
            consecutive_rows.append(row2)
            
            # If we have found `timestep` number of rows, then we have a complete dateset
            if num_consecutive == TIMESTEP:

                # Turn this list of rows into a datagrame and drop duplicates
                # Duplicates are due to adding both row1 and row2 to consecutive_rows list
                consecutive_df = pd.DataFrame(consecutive_rows)
                consecutive_df.drop_duplicates(inplace=True)

                # If the country is already present in timestep_dfs, just add the current dataframe the 
                # country's list of dataframes
                if country in timestep_dfs:
                    timestep_dfs[country].append(consecutive_df)
                else:
                    # If country is not already present in timestep_dfs, add an empty list to the 
                    # dictionary where we will append the dataframe of consecutive rows
                    timestep_dfs[country] = []
                    timestep_dfs[country].append(consecutive_df)

                # Reset so that we can look for a new group of consecutive rows
                num_consecutive = 0
                consecutive_rows = []
            
        else:
            # Reset because encountered a row that is not 14 days away
            num_consecutive = 0
            consecutive_rows = []
    
    if country in timestep_dfs:
        print(timestep_dfs[country])

            

[         iso_date iso_code      continent location        date  total_cases  \
4  ABW_2021-04-05      ABW  North America    Aruba  2021-04-05          NaN   
5  ABW_2021-04-19      ABW  North America    Aruba  2021-04-19          NaN   
6  ABW_2021-05-03      ABW  North America    Aruba  2021-05-03          NaN   
7  ABW_2021-05-17      ABW  North America    Aruba  2021-05-17          NaN   
8  ABW_2021-05-31      ABW  North America    Aruba  2021-05-31          NaN   
9  ABW_2021-06-14      ABW  North America    Aruba  2021-06-14          NaN   

   new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  Alpha  \
4        NaN                 NaN           NaN         NaN  ...   0.00  80.43   
5        NaN                 NaN           NaN         NaN  ...   9.66  66.21   
6        NaN                 NaN           NaN         NaN  ...   1.43  61.43   
7        NaN                 NaN           NaN         NaN  ...   9.90  53.47   
8        NaN                 NaN        

[          iso_date iso_code continent   location        date  total_cases  \
57  AUS_2021-02-22      AUS   Oceania  Australia  2021-02-22      28937.0   
58  AUS_2021-03-08      AUS   Oceania  Australia  2021-03-08      29061.0   
59  AUS_2021-03-22      AUS   Oceania  Australia  2021-03-22      29211.0   
60  AUS_2021-04-05      AUS   Oceania  Australia  2021-04-05      29365.0   
61  AUS_2021-04-19      AUS   Oceania  Australia  2021-04-19      29556.0   
62  AUS_2021-05-03      AUS   Oceania  Australia  2021-05-03      29850.0   

    new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  \
57        7.0               4.571         909.0         0.0  ...   1.23   
58       15.0              10.714         909.0         0.0  ...   0.00   
59        5.0              10.571         909.0         0.0  ...   0.82   
60        8.0               9.857         909.0         0.0  ...   2.46   
61       23.0              18.143         910.0         0.0  ...  21.66   
62       

[           iso_date iso_code continent location        date  total_cases  \
101  BEL_2021-01-11      BEL    Europe  Belgium  2021-01-11     665223.0   
102  BEL_2021-01-25      BEL    Europe  Belgium  2021-01-25     694858.0   
103  BEL_2021-02-08      BEL    Europe  Belgium  2021-02-08     726483.0   
104  BEL_2021-02-22      BEL    Europe  Belgium  2021-02-22     755594.0   
105  BEL_2021-03-08      BEL    Europe  Belgium  2021-03-08     789008.0   
106  BEL_2021-03-22      BEL    Europe  Belgium  2021-03-22     839238.0   

     new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  \
101      960.0            2048.000       20122.0        44.0  ...    0.0   
102     1192.0            2155.286       20814.0        35.0  ...    0.0   
103      873.0            2152.286       21423.0        34.0  ...    0.0   
104     1121.0            2300.857       21923.0        20.0  ...    0.0   
105     1117.0            2387.714       22292.0        31.0  ...    0.0   
106     22

[           iso_date iso_code continent  location        date  total_cases  \
146  BGR_2021-01-11      BGR    Europe  Bulgaria  2021-01-11     209131.0   
147  BGR_2021-01-25      BGR    Europe  Bulgaria  2021-01-25     215589.0   
148  BGR_2021-02-08      BGR    Europe  Bulgaria  2021-02-08     224849.0   
149  BGR_2021-02-22      BGR    Europe  Bulgaria  2021-02-22     238591.0   
150  BGR_2021-03-08      BGR    Europe  Bulgaria  2021-03-08     263303.0   
151  BGR_2021-03-22      BGR    Europe  Bulgaria  2021-03-22     307890.0   

     new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  \
146      620.0             721.571        8232.0       106.0  ...    0.0   
147      772.0             458.000        8880.0        60.0  ...    0.0   
148     1115.0             752.714        9420.0        89.0  ...    0.0   
149     1925.0            1120.571        9933.0        79.0  ...    0.0   
150     2995.0            1953.857       10764.0       150.0  ...    0.0   
151

[           iso_date iso_code continent  location        date  total_cases  \
210  BWA_2021-05-31      BWA    Africa  Botswana  2021-05-31      54973.0   
211  BWA_2021-06-14      BWA    Africa  Botswana  2021-06-14      61457.0   
212  BWA_2021-06-28      BWA    Africa  Botswana  2021-06-28      69680.0   
213  BWA_2021-07-12      BWA    Africa  Botswana  2021-07-12      80153.0   
214  BWA_2021-07-26      BWA    Africa  Botswana  2021-07-26     102124.0   
215  BWA_2021-08-09      BWA    Africa  Botswana  2021-08-09     130771.0   

     new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  \
210     1202.0             301.143         849.0        18.0  ...   4.17   
211     1977.0             480.286         926.0        30.0  ...  13.73   
212     2188.0             553.143        1125.0        30.0  ...  67.86   
213     4765.0             882.286        1253.0        51.0  ...  82.08   
214     4467.0            1460.286        1485.0       110.0  ...  93.55   
215

[           iso_date iso_code continent     location        date  total_cases  \
247  CHE_2021-01-11      CHE    Europe  Switzerland  2021-01-11     484506.0   
248  CHE_2021-01-25      CHE    Europe  Switzerland  2021-01-25     513599.0   
249  CHE_2021-02-08      CHE    Europe  Switzerland  2021-02-08     535153.0   
250  CHE_2021-02-22      CHE    Europe  Switzerland  2021-02-22     550224.0   
251  CHE_2021-03-08      CHE    Europe  Switzerland  2021-03-08     565034.0   
252  CHE_2021-03-22      CHE    Europe  Switzerland  2021-03-22     584252.0   

     new_cases  new_cases_smoothed  total_deaths  new_deaths  ...  Delta  \
247     6523.0            3220.714        8587.0        63.0  ...   0.06   
248     4320.0            1952.571        9309.0        37.0  ...   0.00   
249     3280.0            1436.857        9629.0       -87.0  ...   0.00   
250     2449.0            1002.429        9958.0        12.0  ...   0.11   
251     2744.0            1077.429       10083.0         9

[           iso_date iso_code      continent  location        date  \
304  COL_2021-03-08      COL  South America  Colombia  2021-03-08   
305  COL_2021-03-22      COL  South America  Colombia  2021-03-22   
306  COL_2021-04-05      COL  South America  Colombia  2021-04-05   
307  COL_2021-04-19      COL  South America  Colombia  2021-04-19   
308  COL_2021-05-03      COL  South America  Colombia  2021-05-03   
309  COL_2021-05-17      COL  South America  Colombia  2021-05-17   

     total_cases  new_cases  new_cases_smoothed  total_deaths  new_deaths  \
304    2278861.0     2205.0            3371.571       60598.0        95.0   
305    2342278.0     5048.0            5199.143       62148.0       120.0   
306    2456409.0    10190.0            9518.571       64293.0       199.0   
307    2667136.0    14189.0           16314.143       68748.0       420.0   
308    2905254.0    11599.0           16850.143       75164.0       687.0   
309    3131410.0    12984.0           16587.000      

[           iso_date iso_code      continent location        date  total_cases  \
348  CUW_2021-07-12      CUW  North America  Curacao  2021-07-12          NaN   
349  CUW_2021-07-26      CUW  North America  Curacao  2021-07-26          NaN   
350  CUW_2021-08-09      CUW  North America  Curacao  2021-08-09          NaN   
351  CUW_2021-08-23      CUW  North America  Curacao  2021-08-23          NaN   
352  CUW_2021-09-06      CUW  North America  Curacao  2021-09-06          NaN   
353  CUW_2021-09-20      CUW  North America  Curacao  2021-09-20          NaN   

     new_cases  new_cases_smoothed  total_deaths  new_deaths  ...   Delta  \
348        NaN                 NaN           NaN         NaN  ...   84.21   
349        NaN                 NaN           NaN         NaN  ...   95.52   
350        NaN                 NaN           NaN         NaN  ...   98.39   
351        NaN                 NaN           NaN         NaN  ...  100.00   
352        NaN                 NaN           N

KeyboardInterrupt: 