### Packages

In [45]:
import pandas as pd
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Utility for extracting HTML Table 

This is adapted from [Parsing HTML Tables in Python with BeautifulSoup and pandas](https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/)

In [46]:


class HTMLTableParser:

    def parse_url(self, url):
        response = requests.get(url)
        #print(response)
        soup = BeautifulSoup(response.text, 'html')
        #print(soup)
        return [(self.parse_html_table(table))\
            for table in soup.find_all('table')]  

    def parse_html_table(self, table):
        #print("new table")
        n_columns = 0
        n_rows=0
        column_names = []

        # Find number of rows and columns
        # we also find the column titles if we can
        for row in table.find_all('tr'):

            # Determine the number of rows in the table
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)

            # Handle column names if we find them
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        df = pd.DataFrame() 
        try:                    
            # Safeguard on Column Titles
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")

            columns = column_names if len(column_names) > 0 else range(0,n_columns)

            #print(n_rows, n_columns)
            df = pd.DataFrame(columns = columns,
                  index= range(0,n_rows))

            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')

                for column in columns:
                    df.iat[row_marker,column_marker] = column.get_text()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1

            # Convert to float if possible
            for col in df:
                    df[col] = df[col]
        except Exception as ex:
            print(ex)
            pass
        #df.head(10)
        return df


### Grab the historical data

Keep the historical data.

At a later stage, just download the daily new data and add to the time series.

In [47]:
url_data =\
    [('2020-04-02',"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-2-aprilie-ora-13-00/"),
    ('2020-04-03',"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-3-aprilie-2020-ora-13-00/"),
    ('2020-04-04',"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-4-aprilie-2020-ora-13-00/"),
    ('2020-04-05',"https://www.mai.gov.ro/informare-covid-19-grupul-de-comunicare-strategica-5-aprilie-2020-ora-13-00/")]

In [48]:
hp = HTMLTableParser()
all_data_df = pd.DataFrame()
for current_date, current_url in url_data:
    tables = hp.parse_url(current_url)
    payload_table = tables[0]
    print(payload_table.shape)
    payload_table['date'] = current_date
    #remove headers & footers
    payload_table = payload_table.iloc[1:]
    payload_table = payload_table.iloc[:-1]
    all_data_df = all_data_df.append(payload_table)
all_data_df.columns = ['No', 'County', 'Confirmed', 'Date']

Column titles do not match the number of columns
(44, 3)
Column titles do not match the number of columns
(44, 3)
index 1 is out of bounds for axis 0 with size 1
Column titles do not match the number of columns
(45, 3)
Column titles do not match the number of columns
(44, 3)


In [49]:
all_data_df.shape, all_data_df.columns

((169, 4), Index(['No', 'County', 'Confirmed', 'Date'], dtype='object'))

In [50]:
all_data_df.head()

Unnamed: 0,No,County,Confirmed,Date
1,1.0,Alba,9,2020-04-02
2,2.0,Arad,110,2020-04-02
3,3.0,Argeș,10,2020-04-02
4,4.0,Bacău,19,2020-04-02
5,5.0,Bihor,39,2020-04-02


In [51]:
all_data_df.County.unique()

array(['Alba', 'Arad', 'Argeș', 'Bacău', 'Bihor', 'Bistrița-Năsăud',
       'Botoșani', 'Brașov', 'Brăila', 'Buzău', 'Caraș-Severin',
       'Călărași', 'Cluj', 'Constanța', 'Covasna', 'Dâmbovița', 'Dolj',
       'Galați', 'Giurgiu', 'Gorj', 'Harghita', 'Hunedoara', 'Ialomița',
       'Iași', 'Ilfov', 'Maramureș', 'Mehedinți', 'Mureș', 'Neamț', 'Olt',
       'Prahova', 'Satu Mare', 'Sălaj', 'Sibiu', 'Suceava', 'Teleorman',
       'Timiș', 'Tulcea', 'Vaslui', 'Vâlcea', 'Vrancea', 'Mun. București',
       '–'], dtype=object)

In [52]:
all_data_df.Confirmed.unique()

array(['9', '110', '10', '19', '39', '22', '40', '117', '11', '12', '13',
       '17', '105', '111', '34', '82', '7', '–', '100', '37', '54', '38',
       '107', '8', '27', '16', '6', '701', '98', '60', '505', '24', '44',
       '124', '26', '108', '90', '1', '45', '72', '46', '55', '150',
       '866', '126', '70', '544', '15', '128', '14', '28', '48', '127',
       '29', '121', '101', '23', '123', '50', '81', '64', '42', '57',
       '148', '25', '5', '967', '21', '136', '550', '33', '68', '131',
       '30', '114', '47', '56', '88', '78', '160', '49', '1.215', '176',
       '79', '552'], dtype=object)

In [53]:
all_data_df.Date.unique()

array(['2020-04-02', '2020-04-03', '2020-04-04', '2020-04-05'],
      dtype=object)

### Replace '-' in County with 'Not identified'

In [54]:
all_data_df.loc[all_data_df['County']=='–', 'County'] = 'Not identified'

### Replace '-' in Confirmed with '0'

In [55]:
all_data_df.loc[all_data_df['Confirmed']=='–', 'Confirmed'] = 0
all_data_df['Confirmed'] = all_data_df['Confirmed'].astype(str)
all_data_df['Confirmed'] = all_data_df['Confirmed'].apply(lambda x: x.replace(".", ""))
all_data_df['Confirmed'] = all_data_df['Confirmed'].astype(int)

In [56]:
max(all_data_df.Confirmed), min(all_data_df.Confirmed)

(1215, 0)

In [57]:
all_data_df.County.unique()

array(['Alba', 'Arad', 'Argeș', 'Bacău', 'Bihor', 'Bistrița-Năsăud',
       'Botoșani', 'Brașov', 'Brăila', 'Buzău', 'Caraș-Severin',
       'Călărași', 'Cluj', 'Constanța', 'Covasna', 'Dâmbovița', 'Dolj',
       'Galați', 'Giurgiu', 'Gorj', 'Harghita', 'Hunedoara', 'Ialomița',
       'Iași', 'Ilfov', 'Maramureș', 'Mehedinți', 'Mureș', 'Neamț', 'Olt',
       'Prahova', 'Satu Mare', 'Sălaj', 'Sibiu', 'Suceava', 'Teleorman',
       'Timiș', 'Tulcea', 'Vaslui', 'Vâlcea', 'Vrancea', 'Mun. București',
       'Not identified'], dtype=object)

In [58]:
all_data_df.Confirmed.unique()

array([   9,  110,   10,   19,   39,   22,   40,  117,   11,   12,   13,
         17,  105,  111,   34,   82,    7,    0,  100,   37,   54,   38,
        107,    8,   27,   16,    6,  701,   98,   60,  505,   24,   44,
        124,   26,  108,   90,    1,   45,   72,   46,   55,  150,  866,
        126,   70,  544,   15,  128,   14,   28,   48,  127,   29,  121,
        101,   23,  123,   50,   81,   64,   42,   57,  148,   25,    5,
        967,   21,  136,  550,   33,   68,  131,   30,  114,   47,   56,
         88,   78,  160,   49, 1215,  176,   79,  552])

In [59]:
for date in all_data_df.Date.unique():
    d_df = all_data_df.loc[all_data_df.Date==date]
    d_df.to_csv(os.path.join('ro_covid_19_daily_reports', f"{date}.csv"), index=False)

In [60]:
all_data_df.to_csv(os.path.join('ro_covid_19_time_series', "ro_covid_19_time_series.csv"), index=False)