---
# Gathering the Data
---

In [1]:
# Importing tools

import requests
# import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



---
### Kaggle Dataset
---


In [2]:
# kaggle dataset 
# world_population = pd.read_csv('./datasets/population_by_country_2020.csv')
# world_population.info()

---
### Scraping
#### Fetch the content by URL
---

The data was collected from 2 sources:
1. Global data (source updates this on a daily basis): \
    https://github.com/CSSEGISandData/COVID-19 
2. Singapore data (the source was updated daily; however, its search tab was last updated on 19 Apr 2020): \
    https://co.vid19.sg/singapore/dashboard. 

No API or loop was needed to collect the data.


---
### 1. Global COVID cases 
Import & save the datasets. 

In [3]:
# # importing confirmed cases straight of the web url
# confirmed_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
# confirmed_df.to_csv('./datasets/confirmed_df.csv', index=False)

# # importing death cases straight of the web url
# deaths_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
# deaths_df.to_csv('./datasets/deaths_df.csv', index=False)

# # importing recovered cases straight of the web url
# recovered_df = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
# recovered_df.to_csv('./datasets/recovered_df.csv', index=False)

# # # importing updated cases straight of the web url
# # latest_data = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/04-19-2020.csv')
# # latest_data.to_csv('./datasets/latest_data.csv', index=False)


In [4]:
# load global cases for confirmed/deaths/recovered
confirmed_df = pd.read_csv('./datasets/confirmed_df.csv')
deaths_df = pd.read_csv('./datasets/deaths_df.csv')
recovered_df = pd.read_csv('./datasets/recovered_df.csv')
# latest_data = pd.read_csv('./datasets/latest_data.csv')

#### Cleaning the Global Datasets

In [5]:
print(confirmed_df.shape)
print(deaths_df.shape)
print(recovered_df.shape)

(264, 98)
(264, 98)
(250, 98)


In [6]:
# function to find rows that are different between two DataFrames

def difference_df(dfa, dfb, which=None):
    """Return the rows that are not shared by two DataFrames.

    The frames are outer-merged on their common columns with a `_merge`
    indicator column, and the merged rows are then filtered on that indicator.

    Parameters
    ----------
    dfa, dfb : pandas.DataFrame
        Left and right frames to compare.
    which : str, optional
        Indicator value to keep (e.g. 'left_only', 'right_only' or 'both').
        When omitted, every row whose indicator is not 'both' is returned.

    Returns
    -------
    pandas.DataFrame
        The filtered merge result, including the `_merge` column.
    """
    merged = dfa.merge(dfb, how='outer', indicator=True)
    indicator = merged['_merge']
    keep = (indicator != 'both') if which is None else (indicator == which)
    return merged[keep]

In [7]:
# comparing row difference in confirmed and deaths
cd_diff = difference_df(deaths_df[['Province/State', 'Country/Region']], confirmed_df[['Province/State', 'Country/Region']])
cd_diff

Unnamed: 0,Province/State,Country/Region,_merge


In [8]:
# Compare the location keys of deaths and recovered; the previous comparison
# found no differing rows between deaths and confirmed.
# The left frame is deaths and the right frame is recovered, so 'left_only'
# rows exist only in deaths and 'right_only' rows exist only in recovered.
dr_diff = difference_df(deaths_df[['Province/State', 'Country/Region']], recovered_df[['Province/State', 'Country/Region']])
dr_diff

Unnamed: 0,Province/State,Country/Region,_merge
35,Alberta,Canada,left_only
36,British Columbia,Canada,left_only
37,Grand Princess,Canada,left_only
38,Manitoba,Canada,left_only
39,New Brunswick,Canada,left_only
40,Newfoundland and Labrador,Canada,left_only
41,Nova Scotia,Canada,left_only
42,Ontario,Canada,left_only
43,Prince Edward Island,Canada,left_only
44,Quebec,Canada,left_only


In [9]:
recovered_df.loc[recovered_df["Country/Region"]=='Canada'].index

Int64Index([36], dtype='int64')

In [10]:
dr_diff.loc[dr_diff['_merge']=='left_only'].index

Int64Index([35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 231, 238, 245, 246], dtype='int64')

In [11]:
# Keep only the locations that are present in every frame. The rows to discard
# are derived from the key columns rather than hard-coded as row labels copied
# from an earlier cell's output: those labels index the merged comparison
# frame, not the source frames, and go stale when the CSVs are refreshed.
location_keys = ['Province/State', 'Country/Region']


def _has_match(frame, other):
    """Flag the rows of `frame` whose location key pair also occurs in `other`.

    Returns a boolean NumPy array aligned positionally with the rows of `frame`.
    """
    # De-duplicating the lookup keys keeps the left merge at one output row per input row.
    lookup = other[location_keys].drop_duplicates()
    flags = frame[location_keys].merge(lookup, how='left', on=location_keys, indicator=True)['_merge']
    return (flags == 'both').to_numpy()


# confirmed and deaths are both filtered against recovered; recovered is
# filtered against deaths. Original row labels are preserved, as with drop().
clean_confirmed_df = confirmed_df.loc[_has_match(confirmed_df, recovered_df)]
clean_deaths_df = deaths_df.loc[_has_match(deaths_df, recovered_df)]
clean_recovered_df = recovered_df.loc[_has_match(recovered_df, deaths_df)]

In [12]:
# # verfication
# difference_df(clean_deaths_df[['Province/State', 'Country/Region']], clean_confirmed_df[['Province/State', 'Country/Region']])

In [13]:
# # verfication
# difference_df(clean_deaths_df[['Province/State', 'Country/Region']], clean_recovered_df[['Province/State', 'Country/Region']])

In [14]:
# taking reference from kaggle COVID dataset to form similar dataframe format for confirmed/deaths/recovered
def _to_long(wide_df, value_name):
    """Melt a wide per-date frame into one row per location and date.

    The first four columns are kept as identifier columns; every remaining
    column becomes a row whose column label goes to `date` and whose cell value
    goes to the column named `value_name`.
    """
    # id_vars takes column labels, so pass the labels rather than a DataFrame slice
    id_columns = list(wide_df.columns[:4])
    return pd.melt(wide_df, id_vars=id_columns, var_name='date', value_name=value_name)


confirmed_ext = _to_long(clean_confirmed_df, 'confirmed')
deaths_ext = _to_long(clean_deaths_df, 'deaths')
recovered_ext = _to_long(clean_recovered_df, 'recovered')

In [15]:
print(confirmed_ext.shape)
print(deaths_ext.shape)
print(recovered_ext.shape)

(23406, 6)
(23406, 6)
(23406, 6)


In [16]:
confirmed_ext.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed
0,,Afghanistan,33.0,65.0,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [17]:
# save newly framed data 
confirmed_ext.to_csv('./datasets/confirmed_ext.csv', index=False)
deaths_ext.to_csv('./datasets/deaths_ext.csv', index=False)
recovered_ext.to_csv('./datasets/recovered_ext.csv', index=False)

#### Preparing Training & Test Sets

In [18]:
# combining confirmed and deaths to new df
traintestcomb = pd.merge(confirmed_ext, 
                         deaths_ext, 
                         how='inner', 
                         on=['Province/State', 'Country/Region', 'Lat', 'Long', 'date'])
traintestcomb.shape

(23406, 7)

In [19]:
traintestcomb.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed,deaths
0,,Afghanistan,33.0,65.0,1/22/20,0,0
1,,Albania,41.1533,20.1683,1/22/20,0,0
2,,Algeria,28.0339,1.6596,1/22/20,0,0
3,,Andorra,42.5063,1.5218,1/22/20,0,0
4,,Angola,-11.2027,17.8739,1/22/20,0,0


In [20]:
# formatting to datetime; the date labels look like '1/22/20' (month/day/two-digit year),
# so the format is stated explicitly instead of being inferred per value.
# errors='coerce' is kept: labels that do not match become NaT.
traintestcomb['date'] = pd.to_datetime(traintestcomb['date'], format='%m/%d/%y', errors='coerce')

In [21]:
# creating training set - dates after 2020/1/21, up to and including 2020/4/10
date_col = traintestcomb['date']
train_mask = (date_col > '2020-1-21') & (date_col <= '2020-4-10')
train = traintestcomb[train_mask]
train.shape

(19920, 7)

In [22]:
# creating test set - dates after 2020/4/10, up to and including 2020/4/25 (the mask's upper bound)
date_col = traintestcomb['date']
test_mask = (date_col > '2020-4-10') & (date_col <= '2020-4-25')
test = traintestcomb[test_mask]
test.shape

(3486, 7)

In [23]:
# save training and test datasets 
train.to_csv('./datasets/train.csv', index=False)
test.to_csv('./datasets/test.csv', index=False)

#### Combined Dataframe for Confirmed/Deaths/Recovered

In [24]:
# new df for confirmed/deaths/recovered
# Built with key-based left merges instead of `comb_ext = confirmed_ext` plus
# column assignment: the plain assignment only aliases confirmed_ext (so the new
# columns would also appear on confirmed_ext), and column assignment lines rows
# up by their positional index rather than by location and date.
merge_keys = ['Province/State', 'Country/Region', 'date']
comb_ext = (
    confirmed_ext[['Province/State', 'Country/Region', 'Lat', 'Long', 'date', 'confirmed']]
    .merge(deaths_ext[merge_keys + ['deaths']], how='left', on=merge_keys)
    .merge(recovered_ext[merge_keys + ['recovered']], how='left', on=merge_keys)
)
# A left merge only adds rows when the right-hand keys are duplicated.
assert len(comb_ext) == len(confirmed_ext), "duplicate location/date keys in deaths_ext or recovered_ext"

In [59]:
print(comb_ext.shape)
comb_ext.head()

(23406, 8)


Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed,deaths,recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0


In [26]:
# save newly framed data 
comb_ext.to_csv('./datasets/comb_ext.csv', index=False)

---
### 2. Singapore COVID cases

In [27]:
# # Code for parsing HTML Tables in Python with BeautifulSoup and pandas
# # ref https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/?fbclid=IwAR3R12X84U6xkgZ0_a02gPPbKm-Tk6MnYl6ga7R7srPDNXXkzfMU79jhZr4

# url = "https://co.vid19.sg/singapore/cases/search.html"
# response = requests.get(url)
# response.text[:100] # Access the HTML with the text property

# class HTMLTableParser:
    
#     def parse_url(self, url):
#         response = requests.get(url)
#         soup = BeautifulSoup(response.text, 'lxml')
#         return [(table['id'],self.parse_html_table(table))\
#                 for table in soup.find_all('table')]
        
#     def parse_html_table(self, table):
#         n_columns = 0
#         n_rows=0
#         column_names = []
        
#         # Find number of rows and columns
#         # we also find the column titles if we can
#         for row in table.find_all('tr'):
            
#             # Determine the number of rows in the table
#             td_tags = row.find_all('td')
#             if len(td_tags) > 0:
#                 n_rows+=1
#                 if n_columns == 0:
#                      # Set the number of columns for our table
#                      n_columns = len(td_tags)
            
#             # Handle column names if we find them
#             th_tags = row.find_all('th')
#             if len(th_tags) > 0 and len(column_names) == 0:
#                 for th in th_tags:
#                     column_names.append(th.get_text())
                    
#         # Safeguard on Column Titles
#         if len(column_names) > 0 and len(column_names) != n_columns:
#             raise Exception("Column titles do not match the number of columns")
            
#         columns = column_names if len(column_names) > 0 else range(0,n_columns)
         
#         df = pd.DataFrame(columns = columns,index= range(0,n_rows))
         
#         row_marker = 0
#         for row in table.find_all('tr'):
#              column_marker = 0
#              columns = row.find_all('td')
#              for column in columns:
#                  df.iat[row_marker,column_marker] = column.get_text()
#                  column_marker += 1
#              if len(columns) > 0:
#                  row_marker += 1
                 
#         for col in df:
#                 try:
#                      df[col] = df[col].astype(float)
#                 except ValueError:
#                     pass
                
#         return df
    
# hp = HTMLTableParser()
# df = hp.parse_url(url)[0][1]

In [28]:
# # Map lower case function and replace spaces with '_' to all column names for my own ease 
# df.columns = map(str.lower, df.columns)
# df.columns = df.columns.str.replace(" ", "_")

In [29]:
# # Remove question mark in column name
# df = df.rename(columns={'displayed_symptoms?': 'displayed_symptoms'})

In [30]:
# # # save newly framed singapore related dataset 
# df.to_csv('./datasets/df_sg_covid.csv', index=False)

In [31]:
# load singapore related dataset
df = pd.read_csv('./datasets/df_sg_covid.csv')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6588 entries, 0 to 6587
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   case                        6588 non-null   float64
 1   patient                     6588 non-null   object 
 2   age                         6588 non-null   object 
 3   gender                      6588 non-null   object 
 4   nationality                 6588 non-null   object 
 5   status                      6588 non-null   object 
 6   infection_source            6588 non-null   object 
 7   country_of_origin           6588 non-null   object 
 8   symptomatic_toconfirmation  6588 non-null   object 
 9   days_torecover              6588 non-null   object 
 10  symptomatic_at              6588 non-null   object 
 11  confirmed_at                6588 non-null   object 
 12  recovered_at                6588 non-null   object 
 13  displayed_symptoms          6588 

#### Cleaning the Singapore-only data


In [33]:
df.columns

Index(['case', 'patient', 'age', 'gender', 'nationality', 'status',
       'infection_source', 'country_of_origin', 'symptomatic_toconfirmation',
       'days_torecover', 'symptomatic_at', 'confirmed_at', 'recovered_at',
       'displayed_symptoms'],
      dtype='object')

In [34]:
df.isnull().sum()

case                          0
patient                       0
age                           0
gender                        0
nationality                   0
status                        0
infection_source              0
country_of_origin             0
symptomatic_toconfirmation    0
days_torecover                0
symptomatic_at                0
confirmed_at                  0
recovered_at                  0
displayed_symptoms            0
dtype: int64

In [35]:
df.head()

Unnamed: 0,case,patient,age,gender,nationality,status,infection_source,country_of_origin,symptomatic_toconfirmation,days_torecover,symptomatic_at,confirmed_at,recovered_at,displayed_symptoms
0,6560.0,37 Year Old Male Bangladeshi,\n 37\n ...,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True
1,6558.0,37 Year Old Male Bangladeshi,\n 37\n ...,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True
2,6531.0,31 Year Old Male Bangladeshi,\n 31\n ...,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True
3,6529.0,43 Year Old Male Indian,\n 43\n ...,Male,Indian,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True
4,6509.0,31 Year Old Male Myanmarian,\n 31\n ...,Male,Myanmarian,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True


In [36]:
# check out posts that are duplicates based on case number
df[df['case'].duplicated(keep=False)]

Unnamed: 0,case,patient,age,gender,nationality,status,infection_source,country_of_origin,symptomatic_toconfirmation,days_torecover,symptomatic_at,confirmed_at,recovered_at,displayed_symptoms


In [37]:
df.symptomatic_toconfirmation.value_counts()

-     6343
1       31
2       29
5       28
3       22
4       22
6       19
9       18
8       16
7       15
10      11
12       7
15       5
14       4
0        3
13       3
16       3
11       3
17       2
19       1
33       1
22       1
31       1
Name: symptomatic_toconfirmation, dtype: int64

##### Creating age group dataset

In [38]:
# Strip newlines and spaces from the age strings in one pass.
df['age'] = df['age'].replace(r'[\n ]', '', regex=True)
# Values left empty after stripping are filled with '1'. This is an exact-match
# replace: an empty regex pattern matches at every position of any string, so
# it should not be relied on to mean "empty value".
df['age'] = df['age'].replace('', '1')
# Any '-' character is substituted with '1' as well.
df['age'] = df['age'].replace(r'-', '1', regex=True)

In [39]:
df.age = pd.to_numeric(df.age)
# isnull must be called and its result counted: comparing the bound method
# itself to 0 is always False, so the message could never be printed.
if df.age.isnull().sum() == 0:
    print("mother me")

In [40]:
# ten-year bracket edges from 0 to 80, with 120 as the final upper edge;
# right=False makes each bracket include its lower edge and exclude its upper edge
bins = list(range(0, 90, 10)) + [120]
labels = [
    '1-9', '10-19', '20-29',
    '30-39', '40-49', '50-59',
    '60-69', '70-79', '>80',
]
age_bracket = pd.cut(df.age, bins=bins, labels=labels, right=False, include_lowest=True)
df['agegroup'] = age_bracket

df.head(20)

Unnamed: 0,case,patient,age,gender,nationality,status,infection_source,country_of_origin,symptomatic_toconfirmation,days_torecover,symptomatic_at,confirmed_at,recovered_at,displayed_symptoms,agegroup
0,6560.0,37 Year Old Male Bangladeshi,37,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True,30-39
1,6558.0,37 Year Old Male Bangladeshi,37,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True,30-39
2,6531.0,31 Year Old Male Bangladeshi,31,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True,30-39
3,6529.0,43 Year Old Male Indian,43,Male,Indian,In hospital,Local transmission,Unclear origin,-,-,-,"19th, Apr 2020",-,True,40-49
4,6509.0,31 Year Old Male Myanmarian,31,Male,Myanmarian,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,30-39
5,6497.0,30 Year Old Male Bangladeshi,30,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,30-39
6,6489.0,23 Year Old Male Bangladeshi,23,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,20-29
7,6472.0,26 Year Old Male Bangladeshi,26,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,20-29
8,6471.0,36 Year Old Male Bangladeshi,36,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,30-39
9,6469.0,28 Year Old Male Bangladeshi,28,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,-,-,"18th, Apr 2020",-,True,20-29


In [41]:
# Count cases per age group. The axis and the counts are named explicitly so the
# result does not depend on the default labels produced by value_counts() and
# reset_index(), which are not the same across pandas versions.
age_details_df = (
    df.agegroup.value_counts()
    .rename_axis('group')
    .reset_index(name='total')
)
age_details_df

Unnamed: 0,group,total
0,30-39,2563
1,20-29,1989
2,40-49,1139
3,50-59,455
4,60-69,211
5,70-79,71
6,1-9,63
7,10-19,59
8,>80,38


In [42]:
# save age data
age_details_df.to_csv('./datasets/age_details.csv', index=False)

##### Creating nationality group dataset

In [43]:
nat_details_df = pd.DataFrame(df.nationality.value_counts())

In [44]:
nat_details_df.count()

nationality    39
dtype: int64

In [45]:
# Rebuilt from df instead of resetting nat_details_df in place, so the cell
# gives the same result when re-run. The axis and the counts are named
# explicitly rather than renamed from version-dependent default labels.
nat_details_df = (
    df.nationality.value_counts()
    .rename_axis('nationality')
    .reset_index(name='total')
)
nat_details_df

Unnamed: 0,nationality,total
0,Bangladeshi,2922
1,Indian,1608
2,Singaporean,1286
3,Chinese,233
4,Myanmarian,136
5,Malaysian,95
6,Unidentified nationality,62
7,Filipino,45
8,Thai,39
9,Indonesian,33


In [46]:
# save nationality data
nat_details_df.to_csv('./datasets/nat_details.csv', index=False)

##### Creating infection source dataset

In [47]:
source_df = pd.DataFrame(df.infection_source.value_counts())
source_df

Unnamed: 0,infection_source
Local transmission,6026
Imported case,562


In [48]:
# Count cases per infection source. The axis and the counts are named
# explicitly rather than renamed from the default labels of value_counts() and
# reset_index(), which are not the same across pandas versions.
source_df = (
    df.infection_source.value_counts()
    .rename_axis('source')
    .reset_index(name='total')
)
source_df

Unnamed: 0,source,total
0,Local transmission,6026
1,Imported case,562


In [49]:
# save infection source data
source_df.to_csv('./datasets/source_details.csv', index=False)

##### Creating recovery time dataset

In [50]:
df.days_torecover.isnull().sum()

0

In [51]:
df.days_torecover.value_counts()

-     5784
12      53
13      51
16      47
14      47
18      46
10      39
9       36
11      36
15      35
8       35
19      35
20      30
6       29
4       29
17      26
23      24
26      21
7       21
25      20
5       19
21      16
22      15
3       15
24      13
27      13
2       11
30       7
28       7
31       6
32       5
29       3
1        3
33       2
35       2
73       1
36       1
40       1
37       1
0        1
38       1
67       1
Name: days_torecover, dtype: int64

In [52]:
# '-' is the placeholder value in this column; turn it into a missing value.
# The result is assigned back to the column: an inplace replace on the column
# accessor may operate on a temporary object and leave df unchanged.
df['days_torecover'] = df['days_torecover'].replace('-', np.nan)
df.days_torecover.value_counts()

12    53
13    51
16    47
14    47
18    46
10    39
11    36
9     36
8     35
19    35
15    35
20    30
4     29
6     29
17    26
23    24
26    21
7     21
25    20
5     19
21    16
22    15
3     15
27    13
24    13
2     11
30     7
28     7
31     6
32     5
29     3
1      3
35     2
33     2
67     1
36     1
73     1
0      1
40     1
37     1
38     1
Name: days_torecover, dtype: int64

In [53]:
df.days_torecover = pd.to_numeric(df.days_torecover)

# isnull must be called and its result counted: comparing the bound method
# itself to 0 is always False regardless of the data.
if df.days_torecover.isnull().sum() == 0:
    print("mother me")

In [54]:
# day edges of the recovery brackets; right=False makes each bracket include
# its lower edge and exclude its upper edge
bins = [0, 8, 15, 22, 120]
labels = [
    '<1 week',
    '1-2 weeks',
    '2-3 weeks',
    '>3 weeks',
]
recovery_bracket = pd.cut(df.days_torecover, bins=bins, labels=labels, right=False, include_lowest=True)
df['recovery_in_weeks'] = recovery_bracket

df.head(200)

Unnamed: 0,case,patient,age,gender,nationality,status,infection_source,country_of_origin,symptomatic_toconfirmation,days_torecover,symptomatic_at,confirmed_at,recovered_at,displayed_symptoms,agegroup,recovery_in_weeks
0,6560.0,37 Year Old Male Bangladeshi,37,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"19th, Apr 2020",-,True,30-39,
1,6558.0,37 Year Old Male Bangladeshi,37,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"19th, Apr 2020",-,True,30-39,
2,6531.0,31 Year Old Male Bangladeshi,31,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"19th, Apr 2020",-,True,30-39,
3,6529.0,43 Year Old Male Indian,43,Male,Indian,In hospital,Local transmission,Unclear origin,-,,-,"19th, Apr 2020",-,True,40-49,
4,6509.0,31 Year Old Male Myanmarian,31,Male,Myanmarian,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,30-39,
5,6497.0,30 Year Old Male Bangladeshi,30,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,30-39,
6,6489.0,23 Year Old Male Bangladeshi,23,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,20-29,
7,6472.0,26 Year Old Male Bangladeshi,26,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,20-29,
8,6471.0,36 Year Old Male Bangladeshi,36,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,30-39,
9,6469.0,28 Year Old Male Bangladeshi,28,Male,Bangladeshi,In hospital,Local transmission,Unclear origin,-,,-,"18th, Apr 2020",-,True,20-29,


In [55]:
df["recovery_in_weeks"].value_counts()

1-2 weeks    297
2-3 weeks    235
>3 weeks     144
<1 week      128
Name: recovery_in_weeks, dtype: int64

In [56]:
df.recovery_in_weeks = df.recovery_in_weeks.astype(object).fillna("N.A")

In [57]:
# Count cases per recovery bracket. The axis and the counts are named
# explicitly rather than renamed from the default labels of value_counts() and
# reset_index(), which are not the same across pandas versions.
recovery_df = (
    df.recovery_in_weeks.value_counts()
    .rename_axis('rec_time')
    .reset_index(name='total')
)
recovery_df

Unnamed: 0,rec_time,total
0,N.A,5784
1,1-2 weeks,297
2,2-3 weeks,235
3,>3 weeks,144
4,<1 week,128


In [58]:
# save recovery time data
recovery_df.to_csv('./datasets/recovery_details.csv', index=False)