In [1]:
# Imports of the respective libraries
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Convert an HTML table to a list of lists (one inner list per row)
# so that it can be loaded easily into a pandas dataframe.

def generate_raw_table(html_table):
    """Convert one disasters table into a list of six-value rows.

    The table layout handled here is the one shown in this notebook's
    captured output: seven ``<td>`` cells per row, in the order Year,
    Disaster, Death toll, Damage cost, Main article, Location, Notes.
    The Location cell is dropped so that every returned row lines up with
    the six dataframe columns used in this notebook
    (Year, Disaster Type, Death Toll, Damage Cost, Disaster Name, Notes).

    Parameters
    ----------
    html_table : object or None
        A parsed table element exposing ``find_all('tr')``; each row must
        expose ``find_all('td')`` and each cell a ``.text`` attribute
        (e.g. a BeautifulSoup ``Tag``). ``None`` yields an empty list.

    Returns
    -------
    list[list[str]]
        One ``[year, disaster_type, death_toll, damage_cost,
        disaster_name, notes]`` list per data row. ``year`` is the
        four-digit year as a string. Rows that do not start with a
        four-digit year (such as the header row) or that have fewer than
        seven cells are skipped.
    """
    if html_table is None:
        return []

    # Positions of the cells to keep; index 5 (Location) is skipped on purpose.
    wanted_columns = (0, 1, 2, 3, 4, 6)
    # Citation markers such as [12][33][87] that Wikipedia appends to cell text.
    citation_marker = re.compile(r'\[\d+\]')
    leading_year = re.compile(r'[0-9]{4}')

    table = []
    for row in html_table.find_all('tr'):
        # Clean every cell once: drop citation markers, then trim whitespace.
        cells = [citation_marker.sub('', col.text).strip()
                 for col in row.find_all('td')]

        # Not enough cells to contain a Notes column -> not a data row.
        if len(cells) <= max(wanted_columns):
            continue

        # The header row uses <td> cells too; it is told apart from data
        # rows by its first cell not starting with a four-digit year.
        year_match = leading_year.match(cells[0])
        if year_match is None:
            continue

        table.append([year_match.group(0)]
                     + [cells[index] for index in wanted_columns[1:]])
    return table



In [2]:
# Create a dataframe with 6 columns from the raw table rows
def create_dataframe_from_raw_table(raw_table):
    """Wrap the parsed rows in a six-column pandas DataFrame.

    Parameters
    ----------
    raw_table : list of lists
        Row data; each inner list supplies one value per column below.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns Year, Disaster Type, Death Toll,
        Damage Cost, Disaster Name and Notes, in that order.
    """
    column_names = [
        'Year',
        'Disaster Type',
        'Death Toll',
        'Damage Cost',
        'Disaster Name',
        'Notes',
    ]
    return pd.DataFrame(data=raw_table, columns=column_names)

    

In [3]:
# Append the rows of the second dataframe to the first
# and convert the 'Year' column from 'string' to 'datetime' type
def combine_dataframes(df0, df1):
    """Stack two dataframes row-wise and parse their 'Year' column.

    Parameters
    ----------
    df0, df1 : pandas.DataFrame
        Frames that each contain a 'Year' column holding four-digit years,
        either as strings or as integers. Neither input is modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df0`` followed by the rows of
        ``df1``, a fresh 0..n-1 index, and 'Year' converted to datetime
        (January 1st of the given year). Missing years become ``NaT``.

    Raises
    ------
    ValueError
        If a non-missing 'Year' value is not a four-digit year.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat([df0, df1], ignore_index=True)

    # Parse through text with an explicit format: handing integer years
    # straight to to_datetime would read them as nanoseconds since the
    # epoch and yield 1970 dates.
    year_text = df['Year'].map(
        lambda value: value if pd.isna(value) else str(value).strip()
    )
    df['Year'] = pd.to_datetime(year_text, format='%Y')
    return df



In [7]:
# Fetch the Wikipedia page and collect every "wikitable" element from it.
url = 'https://en.wikipedia.org/wiki/List_of_natural_disasters_in_the_United_States'

# Download the (HTML) text of the static webpage. The timeout keeps the cell
# from hanging indefinitely on a stalled connection, and raise_for_status()
# surfaces HTTP errors here instead of letting an error page be parsed as if
# it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'lxml')

# Get all tables with the "wikitable" class from the HTML text
html_table = soup.find_all('table', {'class': 'wikitable'})

# Show the matched tables (raw HTML) so their structure can be inspected.
html_table

[<table class="wikitable sortable">
 <tbody><tr>
 <td><b>Year</b>
 </td>
 <td><b>Disaster</b>
 </td>
 <td><b>Death toll</b>
 </td>
 <td><b>Damage cost</b>
 </td>
 <td><b>Main article</b>
 </td>
 <td><b>Location</b>
 </td>
 <td><b>Notes</b>
 </td></tr>
 <tr>
 <td>2017
 </td>
 <td><a class="mw-redirect" href="/wiki/Hurricane" title="Hurricane">Hurricane</a>
 </td>
 <td>112-4760
 </td>
 <td>$91.61 billion
 </td>
 <td><a href="/wiki/Hurricane_Maria" title="Hurricane Maria">Hurricane Maria</a>
 </td>
 <td><a href="/wiki/Florida" title="Florida">Florida</a> and <a href="/wiki/Puerto_Rico" title="Puerto Rico">Puerto Rico</a>
 </td>
 <td>After strengthening at a near record pace and affecting multiple islands in the eastern Caribbean Sea, Maria struck Puerto Rico as a high-end Category 4 hurricane, causing catastrophic damage to the US island due to extremely powerful winds and devastating floods. The hurricane also knocked out the entire power grid, triggering a near total island blackout. Th

In [None]:
# Dataframe for Natural Disasters.
# Parse every scraped table with generate_raw_table and load the resulting
# rows through create_dataframe_from_raw_table; passing the raw HTML
# elements straight to pd.DataFrame does not produce the tabular data.
raw_rows = [row for table in html_table for row in generate_raw_table(table)]
df = create_dataframe_from_raw_table(raw_rows)

df.tail(10)