# Read in data

In [59]:
import pandas as pd

full_df = pd.read_csv('./Data/full.csv')
full_df.head()

Unnamed: 0,iso_code,date,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,Gamma,Kappa,Iota,Eta,Delta,Alpha,non_who,Lambda,Year,Mu
0,ABW,2021-02-08,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,7.89,91.23,0.0,507.529412,4.340588
1,ABW,2021-02-22,,,,,,,,,...,0.65,0.0,0.0,0.0,0.0,24.03,66.88,0.0,384.0,0.0
2,ABW,2021-03-08,,,,,,,,,...,0.0,0.0,5.6,0.0,0.0,52.0,28.8,0.0,384.0,0.0
3,ABW,2021-03-22,,,,,,,,,...,1.01,0.0,7.07,0.0,0.0,66.67,11.11,0.0,384.0,0.0
4,ABW,2021-03-29,North America,Aruba,,,,,,,...,,,,,,,,,,


# Store each country's data in an individual dataframe

In [60]:
country_dfs = {}  # Dictionary that stores a dataframe for each country. Key is iso_code. 
header = full_df.columns  # Stores column names

# Iterate over the rows of the data frame
for index, row in full_df.iterrows():
    # Get the country name for the current row of data
    country = row['iso_code']
    # If the country name is not present in dictionary of country dataframes
    if country not in country_dfs:
        # Get all rows with the country name
        country_rows = full_df.loc[full_df['iso_code'] == country]
        # Put those rows into a dataframe and store in country_dfs
        country_dfs[country] = pd.DataFrame(country_rows)
        
print(country_dfs['ABW'])

    iso_code        date      continent location  total_cases  new_cases  \
0        ABW  2021-02-08            NaN      NaN          NaN        NaN   
1        ABW  2021-02-22            NaN      NaN          NaN        NaN   
2        ABW  2021-03-08            NaN      NaN          NaN        NaN   
3        ABW  2021-03-22            NaN      NaN          NaN        NaN   
4        ABW  2021-03-29  North America    Aruba          NaN        NaN   
..       ...         ...            ...      ...          ...        ...   
236      ABW  2021-11-16  North America    Aruba          NaN        NaN   
237      ABW  2021-11-17  North America    Aruba          NaN        NaN   
238      ABW  2021-11-18  North America    Aruba          NaN        NaN   
239      ABW  2021-11-19  North America    Aruba          NaN        NaN   
240      ABW  2021-11-20  North America    Aruba          NaN        NaN   

     new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
0       

# Find groups of consecutive biweekly data 
Right now, timestep = 5. This can be changed by setting TIMESTEP=#. 

In [61]:
from datetime import date

def get_days_between_rows(date1_str, date2_str):
    # Split the date strings (e.g., 2021-11-01) on the hypen
    date1_split = date1_str.split('-')
    date2_split = date2_str.split('-')

    # Make date objects using year, month, date
    date1 = date(int(date1_split[0]), int(date1_split[1]), int(date1_split[2]))
    date2 = date(int(date2_split[0]), int(date2_split[1]), int(date2_split[2]))

    # Calculate days between the days
    time_between = date2 - date1
    days_between = time_between.days
    return days_between

# Dictionary of timestep data. Keys are iso_code. Value is a list of dataframes, where each 
# dataframe contains 5 rows of data and a sixth row, which is what we need to predict. 
# TODO: remove everything in the sixth row but the value we want to predict. 
timestep_dfs = {} 
TIMESTEP = 4

# Get list of all country names
countries = country_dfs.keys()

# Iterate over all countries
for country in countries:
    # Look at df for current country
    df = country_dfs[country]
    
    num_consecutive = 0
    consecutive_rows = []
    header = df.columns
    # Iterate over pairs of rows in the data frame
    
    for (index1,row1),(index2,row2) in zip(df[:-1].iterrows(),df[1:].iterrows()):
        # Get the number of days between the date strings (e.g., 2021-11-01)
        days_between = get_days_between_rows(row1['date'], row2['date'])
        
        # If there are 14 days between this pair of rows
        if days_between == 1:
            num_consecutive += 1
            # Add both rows to the consecutive rows list
            consecutive_rows.append(row1)
            consecutive_rows.append(row2)
            
            # If we have found `timestep` number of rows, then we have a complete dateset
            if num_consecutive == TIMESTEP:

                # Turn this list of rows into a datagrame and drop duplicates
                # Duplicates are due to adding both row1 and row2 to consecutive_rows list
                consecutive_df = pd.DataFrame(consecutive_rows)
                consecutive_df.drop_duplicates(inplace=True)

                # If the country is already present in timestep_dfs, just add the current dataframe the 
                # country's list of dataframes
                if country in timestep_dfs:
                    timestep_dfs[country].append(consecutive_df)
                else:
                    # If country is not already present in timestep_dfs, add an empty list to the 
                    # dictionary where we will append the dataframe of consecutive rows
                    timestep_dfs[country] = []
                    timestep_dfs[country].append(consecutive_df)

                # Reset so that we can look for a new group of consecutive rows
                num_consecutive = 0
                consecutive_rows = []
            
        else:
            # Reset because encountered a row that is not 14 days away
            num_consecutive = 0
            consecutive_rows = []

print(timestep_dfs['ABW'])        

[  iso_code        date      continent location  total_cases  new_cases  \
4      ABW  2021-03-29  North America    Aruba          NaN        NaN   
5      ABW  2021-03-30  North America    Aruba          NaN        NaN   
6      ABW  2021-03-31  North America    Aruba          NaN        NaN   
7      ABW  2021-04-01  North America    Aruba          NaN        NaN   
8      ABW  2021-04-02  North America    Aruba          NaN        NaN   

   new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
4                 NaN           NaN         NaN                  NaN  ...   
5                 NaN           NaN         NaN                  NaN  ...   
6                 NaN           NaN         NaN                  NaN  ...   
7                 NaN           NaN         NaN                  NaN  ...   
8                 NaN           NaN         NaN                  NaN  ...   

   Gamma  Kappa  Iota  Eta  Delta  Alpha  non_who  Lambda  Year  Mu  
4    NaN    NaN   NaN

# Find groups of consecutive daily data
Run these blocks of code after imputing data. This code depends on some columns (iso_code and date) that were dropped during imputation, so maybe there is a way to exclude them from the imputation process and drop those columns only after the blocks of code below have been run. 

This code is not currently general in terms of picking a timestep. Can fix later.

## 1. Compute state_group column
Determine if the next four rows are consecutively after the first row

In [74]:
import pandas as pd

df = pd.read_csv('./Data/full.csv')
df.head()

df['date'] = pd.to_datetime(df['date'])
df['start_group'] = 0
df.reset_index(inplace = True, drop = True)

for i, row in df.iterrows():
    if i <= len(df) - 5:
        if row['date'] + pd.Timedelta(days=1) == df.iloc[i+1]['date'] and \
            row['date'] + pd.Timedelta(days=2) == df.iloc[i+2]['date'] and \
            row['date'] + pd.Timedelta(days=3) == df.iloc[i+3]['date'] and \
            row['date'] + pd.Timedelta(days=4) == df.iloc[i+4]['date'] and \
            row['iso_code'] == df.iloc[i+1]['iso_code'] and \
            row['iso_code'] == df.iloc[i+2]['iso_code'] and \
            row['iso_code'] == df.iloc[i+3]['iso_code'] and \
            row['iso_code'] == df.iloc[i+4]['iso_code']:

            df.loc[i, 'start_group'] = 1

In [75]:
df.tail(15)

Unnamed: 0,iso_code,date,continent,location,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,Kappa,Iota,Eta,Delta,Alpha,non_who,Lambda,Year,Mu,start_group
135921,ZWE,2021-11-06,Africa,Zimbabwe,133168.0,56.0,34.571,4685.0,0.0,1.429,...,,,,,,,,,,1
135922,ZWE,2021-11-07,Africa,Zimbabwe,133187.0,19.0,30.0,4685.0,0.0,1.0,...,,,,,,,,,,1
135923,ZWE,2021-11-08,Africa,Zimbabwe,133205.0,18.0,28.286,4690.0,5.0,1.571,...,,,,,,,,,,1
135924,ZWE,2021-11-09,Africa,Zimbabwe,133242.0,37.0,27.857,4691.0,1.0,1.143,...,,,,,,,,,,1
135925,ZWE,2021-11-10,Africa,Zimbabwe,133302.0,60.0,30.143,4694.0,3.0,1.429,...,,,,,,,,,,1
135926,ZWE,2021-11-11,Africa,Zimbabwe,133329.0,27.0,31.0,4694.0,0.0,1.286,...,,,,,,,,,,1
135927,ZWE,2021-11-12,Africa,Zimbabwe,133329.0,0.0,31.0,4694.0,0.0,1.286,...,,,,,,,,,,1
135928,ZWE,2021-11-13,Africa,Zimbabwe,133393.0,64.0,32.143,4696.0,2.0,1.571,...,,,,,,,,,,1
135929,ZWE,2021-11-14,Africa,Zimbabwe,133428.0,35.0,34.429,4696.0,0.0,1.571,...,,,,,,,,,,1
135930,ZWE,2021-11-15,Africa,Zimbabwe,133438.0,10.0,33.286,4697.0,1.0,1.0,...,,,,,,,,,,1


## 2. Split data into groups of five
The groups of five rows are stored in dataframes, and those dataframes are stored in a list called `timestepped_data`.

* Todo: remove the columns that we don't need (like date, iso_code, etc. that are dropped in the imputation code). 

In [88]:
timestepped_data = []

for i, row in df.iterrows():
    if i < len(df) - 5:
        if row['start_group'] == 1:
            # TODO Remove the columns of data that we don't want when you append the row to current_group_data. 
            # Didn't implement this yet because I'm not sure which columns we want to keep. 
            current_group_data = []
            current_group_data.append(row.values.tolist())
            # current_group_data.append(row.to_numpy())
            current_group_data.append(df.iloc[i+1].values.tolist())
            current_group_data.append(df.iloc[i+2].values.tolist())
            current_group_data.append(df.iloc[i+3].values.tolist())
            current_group_data.append(df.iloc[i+4].values.tolist())
            
            current_group_df = pd.DataFrame(current_group_data)
            timestepped_data.append(current_group_df)
            
        

In [93]:
timestepped_data

[    0          1              2      3   4   5   6   7   8   9   ...  75  76  \
 0  ABW 2021-03-29  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 1  ABW 2021-03-30  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 2  ABW 2021-03-31  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 3  ABW 2021-04-01  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 4  ABW 2021-04-02  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 
    77  78  79  80  81  82  83  84  
 0 NaN NaN NaN NaN NaN NaN NaN   1  
 1 NaN NaN NaN NaN NaN NaN NaN   1  
 2 NaN NaN NaN NaN NaN NaN NaN   1  
 3 NaN NaN NaN NaN NaN NaN NaN   1  
 4 NaN NaN NaN NaN NaN NaN NaN   1  
 
 [5 rows x 85 columns],
     0          1              2      3   4   5   6   7   8   9   ...  75  76  \
 0  ABW 2021-03-30  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 1  ABW 2021-03-31  North America  Aruba NaN NaN NaN NaN NaN NaN  ... NaN NaN   
 2  ABW 2021-04-01  N

In [89]:
len(timestepped_data)

134930

There are 134,930 groups of 5 rows. 

In [91]:
df.columns

Index(['iso_code', 'date', 'continent', 'location', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

These ^^ are all the columns of data contained in the final dataframes in `timestepped_data`. 