In [1]:
# Import modules
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from selenium import webdriver
import pandas as pd
import time
from secret import username, password
from sqlalchemy import create_engine
import csv
import numpy as np
import html5lib

### Load main hurricane dataset from csv to pandas dataframe

In [2]:
main_csv_df = pd.read_csv("Data/Hurricane Data 2019.csv")
main_csv_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [3]:
main_df = main_csv_df #.drop(columns=("Unnamed: 8"))
main_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [4]:
main_df.columns = main_df.columns.str.lower()

In [5]:
main_df.rename(columns={'maximumwind':'max_wind','airpressure':'air_pressure'}, inplace=True)
main_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [6]:
longitudes = main_df['longitude']
longitudes

0        55.5W
1        56.3W
2        57.4W
3        58.6W
4        60.0W
         ...  
29969     9.3W
29970     8.9W
29971     8.2W
29972     6.0W
29973     2.1W
Name: longitude, Length: 29974, dtype: object

In [7]:
long_list = []
for i in range(len(longitudes)):
    longitude = longitudes[i]
    W = 'W' in longitude
    d, m = map(float, longitude[:-1].split('.'))
    longitude = (d + m / 60.) * (-1 if W else 1)
    long_list.append(round(longitude, 2))
print(long_list[:5])

[-55.08, -56.05, -57.07, -58.1, -60.0]


In [8]:
latitudes = main_df['latitude']

In [9]:
lat_list = []
for i in range(len(latitudes)):
    latitude = latitudes[i]
    N = 'N' in latitude
    d, m = map(float, latitude[:-1].split('.'))
    latitude = (d + m / 60.) * (1 if N else -1)
    lat_list.append(round(latitude, 2))
print(lat_list[:5])

[17.02, 17.12, 18.03, 19.0, 20.0]


In [10]:
main_df['latitude_decimal'] = lat_list
main_df['longitude_decimal'] = long_list

In [11]:
len(main_df)

29974

In [12]:
clean_df = main_df.dropna()
clean_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.00,-60.00
...,...,...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970,52.03,-9.05
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972,52.03,-8.15
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974,51.13,-8.03
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976,51.07,-6.00


In [13]:
len(clean_df)

29974

### Group by hurricane name to assign unique ID

In [14]:
# Need to create a df with unique hurricanes IDs, based on name and date

In [15]:
dates = clean_df['date']

In [16]:
# Extract year column from date
years = []
for i in range(len(dates)):
    split_str = int(str(dates[i])[:4])
    years.append(split_str)

In [17]:
print(len(years))

29974


In [18]:
clean_df['year'] = years
clean_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950


### Rename hurricanes to Capitalized

In [19]:
names = clean_df['name']
names_capitalized = names.str.title()
names_capitalized
clean_df['name'] = names_capitalized
clean_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950


In [20]:
clean_df.tail(100)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
29874,20191018,Nestor,1200,DB,25.2N,91.2W,50,1001,25.03,-91.03,2019
29875,20191018,Nestor,1800,TS,26.4N,89.4W,50,1000,26.07,-89.07,2019
29876,20191019,Nestor,0,TS,27.8N,87.8W,50,996,27.13,-87.13,2019
29877,20191019,Nestor,600,TS,28.6N,87.1W,50,996,28.10,-87.02,2019
29878,20191019,Nestor,1200,EX,29.3N,86.4W,40,996,29.05,-86.07,2019
...,...,...,...,...,...,...,...,...,...,...,...
29969,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019
29970,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019
29971,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019
29972,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019


### Find duplicated names

In [21]:
# Getting the duplicate movie by title and star released. 
dupl_df = clean_df.loc[clean_df.duplicated(subset=["name", "year"]), "name"].unique()
dupl_df

array(['Able', 'Baker', 'Charlie', 'Dog', 'Easy', 'Fox', 'George', 'How',
       'Item', 'Jig', 'King', 'Unnamed', 'Love', 'Mike', 'Alice',
       'Barbara', 'Carol', 'Dolly', 'Edna', 'Florence', 'Gail', 'Hazel',
       'Gilda', 'Brenda', 'Connie', 'Diane', 'Edith', 'Flora', 'Gladys',
       'Ione', 'Hilda', 'Janet', 'Katie', 'Anna', 'Betsy', 'Carla',
       'Dora', 'Ethel', 'Flossy', 'Greta', 'Audrey', 'Bertha', 'Carrie',
       'Debbie', 'Esther', 'Frieda', 'Alma', 'Becky', 'Cleo', 'Daisy',
       'Ella', 'Fifi', 'Gerda', 'Helene', 'Ilsa', 'Janice', 'Arlene',
       'Beulah', 'Cindy', 'Debra', 'Gracie', 'Hannah', 'Irene', 'Judith',
       'Abby', 'Donna', 'Frances', 'Hattie', 'Jenny', 'Inga', 'Celia',
       'Ginny', 'Helena', 'Isbell', 'Elena', 'Dorothy', 'Faith', 'Hallie',
       'Inez', 'Lois', 'Chloe', 'Doria', 'Fern', 'Ginger', 'Heidi',
       'Candy', 'Blanche', 'Camille', 'Eve', 'Francelia', 'Holly', 'Kara',
       'Laurie', 'Martha', 'Felice', 'Beth', 'Kristy', 'Laura', 'Alph

In [22]:
len(dupl_df)

295

In [23]:
# Remove the duplicates 
clean_df = clean_df.drop_duplicates(subset=['name','year'], keep='first')
clean_df.head()
clean_df.to_csv("Data/clean_df.csv")

In [24]:
len(clean_df)

825

### Add hurricane index column

In [25]:
hurricanes_df = clean_df
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [26]:
hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [27]:
hurricanes_df = hurricanes_df.reset_index(drop=True)
hurricanes_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...
820,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [28]:
# Start index from 1
hurricanes_df.index += 1
hurricanes_df.head(100)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...
96,19590920,Gracie,1200,TD,19.8N,68.3W,25,1009,19.13,-68.05,1959
97,19590927,Hannah,1200,TD,26.8N,49.9W,30,-999,26.13,-49.15,1959
98,19591006,Irene,1800,TD,23.2N,92.5W,30,-999,23.03,-92.08,1959
99,19591014,Judith,600,TD,14.0N,73.5W,30,-999,14.00,-73.08,1959


In [29]:
# Reset index
hurricanes_df = hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,index,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [30]:
hurricanes_df = hurricanes_df.rename(columns={'index': 'hurricane_id'})
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
820,821,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,822,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,823,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,824,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [31]:
hurricanes_df.to_csv("Data/00_Hurricane Data 2019_with_unique_IDs.csv")

In [32]:
hurricanes_df.head()

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


### Merge main df with costliest df on name and keep hurricane_id as foreign key

### Mac (Irina) Connection

In [33]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

/usr/local/bin/chromedriver


In [34]:
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# browser = Browser('chrome', **executable_path, headless=False)

### Windows (Clay) Connection

In [35]:
# executable_path = {'executable_path': 'C:/Users/Clay/chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)

### Dataset 1 - Web Scrape Wikipedia Most Costliest hurricanes

In [36]:
url = 'https://en.wikipedia.org/wiki/List_of_costliest_Atlantic_hurricanes'
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_costly_hurricanes = tables[0]
print("Read and save table")
browser.quit()
most_costly_hurricanes.head()

Open web browser
Read and save table


Unnamed: 0,Name,Nominal damage(Billions USD),Normalized damage(Billions USD),Season,Storm classificationat peak intensity,Areas affected,References
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane,Louisiana Mississippi The Bahamas United State...,[4][5]
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane,Texas Louisiana South America Central America ...,[5][6]
2,Maria,$90.0,,2017,Category 5 hurricane,Puerto Rico Lesser Antilles Greater Antilles C...,[7]
3,Irma,$77.2,$31.0,2017,Category 5 hurricane,Lesser Antilles Greater Antilles Caribbean Sea...,[5]
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane,The Caribbean United States East Coast Eastern...,[5][8]


In [37]:
# Create a copy of the df
renamed_cost_df = most_costly_hurricanes

In [38]:
# # Rename columns
renamed_cost_df = renamed_cost_df.rename(columns={'Name': 'name','Nominal damage(Billions USD)': 'damage_usd', 'Normalized damage(Billions USD)': 'norm_damage_usd', 
                'Season': 'year', 'Storm classificationat peak intensity': 'category', 'Areas affected':'states'})

In [39]:
# Drop Reference column
renamed_cost_df = renamed_cost_df.drop(columns=['References','states'])

In [40]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


In [41]:
# Save to the csv file
renamed_cost_df.to_csv("Data/01_most_costly_hurricanes_wiki_web.csv")

In [42]:
# Read csv file into df
renamed_cost_df = pd.read_csv('Data/01_most_costly_hurricanes_wiki_web.csv', index_col=[0])

In [43]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


## Prepare Cost data

In [44]:
cost_df = renamed_cost_df

In [45]:
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('$','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('<','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('>','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('$','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('<','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('>','')
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].astype(float)
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].dropna()
cost_df['damage_usd'] = cost_df['damage_usd'].astype(float)
cost_df['damage_usd'] = cost_df['damage_usd'].dropna()
cost_df = cost_df.dropna()
len(cost_df) # 54 count

54

In [46]:
len(cost_df)

54

In [47]:
len(cost_df['name'].unique())

54

In [50]:
cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,125.0,116.9,2005,Category 5 hurricane
1,Harvey,125.0,62.2,2017,Category 4 hurricane
3,Irma,77.2,31.0,2017,Category 5 hurricane
4,Sandy,68.7,73.5,2012,Category 3 hurricane
5,Ike,38.0,35.2,2008,Category 4 hurricane


In [51]:
# # Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
# new_df = pd.merge(cost_df, clean_df,  how='left', left_on=['name','year'], right_on = ['name','year'])

In [52]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_df2 = pd.merge(fatal_df, hurricanes_df,  
                  how='left', left_on=['name','year'], right_on = ['name','year'])

NameError: name 'fatal_df' is not defined

In [None]:
len(new_df2)

In [None]:
new_df2.head(60)

In [None]:
# Sort merged df by norm_damage_usd column in descending order 
sorted_new_df = new_df.sort_values(by=['norm_damage_usd'], ascending=False)
sorted_new_df.head(54)

In [None]:
# Re-assign to cost_df
cost_df = sorted_new_df

In [None]:
# Drop unnecessary columns
cost_df = cost_df.drop(columns=['category','date','time','status','latitude','longitude','max_wind','air_pressure','latitude_decimal','longitude_decimal'])

In [None]:
cost_df = cost_df[['hurricane_id','name','year','norm_damage_usd','damage_usd']]
cost_df.head()

## Dataset 2 - Deadliest hurricanes in the atlantic
#### https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes'
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_deadliest_hurricanes = tables[2]
print("Read and save table")
browser.quit()
most_deadliest_hurricanes.head()

In [None]:
most_fatal_hurricanes = most_deadliest_hurricanes.head(22)

In [None]:
most_fatal_hurricanes.tail()

In [None]:
len(most_fatal_hurricanes)

In [53]:
# Create a copy of the df
fatal_df = most_fatal_hurricanes
len(fatal_df)

NameError: name 'most_fatal_hurricanes' is not defined

In [None]:
dates_active = fatal_df['Dates active']
dates_active

In [None]:
dates_list = []
for i in range(len(dates_active)):
    split_str = dates_active[i].split(",")[-1]
    print(split_str)
    dates_list.append(int(split_str))
print(len(dates_list))

In [None]:
fatal_df['year'] = dates_lst
fatal_df.head()

In [None]:
# Drop columns
fatal_df = fatal_df.drop(columns=['Dates active','Saffir-Simpson Category',
                                          'Sustainedwind speeds','Pressure','Areas affected','Damage(US$)','Refs'])

In [None]:
fatal_df = fatal_df.rename(columns={'Name': 'name', 'Deaths': 'deaths'})
fatal_df.head()

In [None]:
fatal_df.head()

In [None]:
fatal_df["deaths"] = fatal_df["deaths"].str.replace(',','')
fatal_df["deaths"] = fatal_df["deaths"].str.replace('–',',')
fatal_df.head()

In [None]:
fatal_list = fatal_df["deaths"]

In [None]:
num_avg = []
for i in range(len(fatal_list)):
    if "," in fatal_list[i]:
        num0 = fatal_list[i].split(",")[0]
        num1 = fatal_list[i].split(",")[1]
        num = (int(num1) + int(num0))/2
        num_avg.append(num)
    else: num_avg.append(fatal_list[i])
print(num_avg)

In [None]:
fatal_df = fatal_df.drop(columns="deaths")

In [None]:
fatal_df['deaths'] = num_avg

In [None]:
fatal_df = fatal_df.head(22)

In [None]:
fatal_df.head()

In [None]:
len(fatal_df)

In [None]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_fatal_merged_df = pd.merge(fatal_df, hurricanes_df,  how='left', 
                               left_on=['name', 'year'], right_on = ['name','year'])
new_fatal_merged_df.head()

In [None]:
new_fatal_merged_df.tail(60)

In [None]:
# Filter out unnamed hurricanes before 1975
fatal_df = new_fatal_merged_df[new_fatal_merged_df['year'] > 1975]
fatal_df.head()

In [None]:
fatal_df.info()

In [None]:
# Convert hurricane_id back to integer
fatal_df['hurricane_id'] = fatal_df['hurricane_id'].astype(int)
fatal_df.head()

In [None]:
fatal_df.info()

In [None]:
# Drop columns
fatal_df = fatal_df[['hurricane_id','name','year','deaths']]

In [None]:
fatal_df

### Merge total_gross_info and domestic_gross_info tables on 'name' column

In [None]:
# # combine total and domestic tables on left using name of the movie column
# combined_gross_info = pd.merge(domestic_gross_info, total_gross_info, how='left', on="name")

In [None]:
# combined_gross_info.head()

In [None]:
# len(combined_gross_info)AssertionError

In [None]:
# # Add new column and compute the international_total_revenue
# combined_gross_info['international_revenue_usd'] = \
#         (combined_gross_info['total_revenue_usd'] - combined_gross_info['domestic_revenue_usd'])

In [None]:
# # Rename columns
# combined_gross_info = combined_gross_info.rename(columns={'star':'actor'})

In [None]:
# # Getting the duplicate movie by title and star released. 
# dupl_combined_df = combined_gross_info.loc[combined_gross_info.duplicated(subset=["name","director", "year"]), "name"].unique()
# dupl_combined_df

In [None]:
# # Get all the data for the duplicate movie 
# dupl_df = combined_gross_info.loc[combined_gross_info["name"] == "Clash of the titans"]
# dupl_df

#### (removed) Use pandas to push data into database

In [None]:
# # Check existing tables
# engine.table_names()

In [None]:
# # Create domesic gross revenue table in the database
# domestic_gross_info.to_sql(name='domestic_gross_info', con=engine, if_exists='append', index=False)

In [None]:
# type(total_gross_info['name'])

In [None]:
# total_gross_info

In [None]:
# domestic_gross_info

In [None]:
# # Remove the duplicates
# # clean_combined_df = combined_gross_info.drop_duplicates(subset=['name','director','year'], keep='first')
# clean_combined_df

In [None]:
# len(clean_combined_df)

### Create movies table from combined table

In [None]:
# # Re-arrange the columns and write into new table
# movies = clean_combined_df[['name','total_revenue_usd','international_revenue_usd','domestic_revenue_usd',
#                               'director','actor','writer','genre','rating','company','country','year']]

In [None]:
# movies.head()

In [None]:
# # Fill NaN values with zeroes
# movies[['total_revenue_usd','international_revenue_usd','domestic_revenue_usd']] = \
#     movies[['total_revenue_usd','international_revenue_usd','domestic_revenue_usd']].fillna(0)

### Extract Actor data into a separate table

In [None]:
# # Save star_actor data into a separate table
# actor = movies['actor'].unique()
# len(actor)

In [None]:
# actor[:5]

In [None]:
# actor = pd.DataFrame({'actor': actor})
# actor.head(2)

In [None]:
# actor.index += 1
# actor

In [None]:
# actor = actor.reset_index()
# actor.head(2)

In [None]:
# actor = actor.rename(columns={'index': 'actor_id'})
# actor.head()

In [None]:
# len(actor)

### Merge movie and actor df on actor column

In [None]:
# # combine movie and tables
# movie_actor = pd.merge(movies, actor, how='outer', on="actor")

In [None]:
# # Extract only movie_id and actor_id columns
# movie_actor = movie_actor[['movie_id', 'actor_id']]
# movie_actor

In [None]:
# # Save to csv
# movie_actor.to_csv('csv_files/movie_actor.csv')

### Create director_movie table

In [None]:
# # Save star_actor data into a separate table
# director = movies['director'].unique()
# len(director)

In [None]:
# director[:5]

In [None]:
# director = pd.DataFrame({'director': director})
# director.head(2)

In [None]:
# director.index += 1
# director

In [None]:
# director = director.reset_index()
# director.head(2)

In [54]:
# director = director.rename(columns={'index': 'director_id'})
# director.head()

In [55]:
# len(director)

### Merge movie and director df on actor column

In [56]:
# # combine movie and tables
# movie_director = pd.merge(movies, director, how='outer', on="director")

In [57]:
# # Extract only movie_id and actor_id columns
# movie_director = movie_director[['movie_id', 'director_id']]
# movie_director

In [58]:
# # Save to csv
# movie_director.to_csv('csv_files/movie_director.csv')

### Create company_movie table

In [59]:
# # Save company_movie data into a separate table
# company = movies['company'].unique()
# len(company)

In [60]:
# company[:5]

In [61]:
# company = pd.DataFrame({'company': company})
# company.head(2)

In [62]:
# company.index += 1
# company

In [63]:
# company = company.reset_index()
# company.head(2)

In [64]:
# company = company.rename(columns={'index': 'company_id'})
# company.head()

In [65]:
# len(company)

### Merge movie and company df on actor column

In [66]:
# # combine movie and tables
# company_movie = pd.merge(movies, company, how='outer', on="company")

In [67]:
# # Extract only movie_id and actor_id columns
# company_movie = company_movie[['movie_id', 'company_id']]
# company_movie

In [68]:
# # Save to csv
# company_movie.to_csv('csv_files/company_movie.csv')

## Extract Writer data into a separate table

In [69]:
# # Save writer data into a separate table
# writer = movies['writer'].unique()
# len(writer)

In [70]:
# writer[:5]

In [71]:
# writer = pd.DataFrame({'writer': writer})
# writer.head(2)

In [72]:
# writer.index += 1
# writer

In [73]:
# writer = writer.reset_index()
# writer.head(2)

In [74]:
# writer = writer.rename(columns={'index': 'writer_id'})
# writer.head()

In [75]:
# len(writer)

### Merge writer_id column to movies table

In [76]:
# # combine movie and tables
# movies = pd.merge(movies, writer, how='outer', on="writer")

In [77]:
# movies.head()

## Extract Genre data into a separate table

In [78]:
# # Save genre data into a separate table
# genre = movies['genre'].unique()
# len(genre)

In [79]:
# genre[:5]

In [80]:
# genre = pd.DataFrame({'genre': genre})
# genre.head(2)

In [81]:
# genre.index += 1
# genre

In [82]:
# genre = genre.reset_index()
# genre.head(2)

In [83]:
# genre = genre.rename(columns={'index': 'genre_id'})
# genre.head()

In [84]:
# len(genre)

### Merge genre_id column to movies table

In [85]:
# # combine movies and genre
# movies = pd.merge(movies, genre, how='outer', on="genre")

In [86]:
# movies.head()

In [87]:
# movies.to_csv('csv_files/movies_genre_ids.csv')

## Extract rating data into a separate table

In [88]:
# # Save rating data into a separate table
# rating = movies['rating'].unique()
# len(rating)

In [89]:
# rating[:5]

In [90]:
# rating = pd.DataFrame({'rating': rating})
# rating.head(2)

In [91]:
# rating.index += 1

In [92]:
# rating = rating.reset_index()
# rating.head(2)

In [93]:
# rating = rating.rename(columns={'index': 'rating_id'})
# rating.head()

In [94]:
# len(rating)

### Merge rating_id column to movies table

In [95]:
# # combine movies and rating
# movies = pd.merge(movies, rating, how='outer', on="rating")

In [96]:
# movies.head()

In [97]:
# movies.to_csv('csv_files/movies_rating_ids.csv')

# Remove unnecessary columns from movies

In [98]:
# movies.head(2)

In [99]:
# movies = movies[['movie_id','country','genre_id','writer_id','rating_id','name','year','domestic_revenue_usd',
#                  'international_revenue_usd', 'total_revenue_usd']]

In [100]:
# movies.head()

## Push table to PostgreSQL

### !!! Please import QuickDBD-export.sql in the repo root folder file in pgAdmin
### !!! Run SQL file to generate the tables and reference links
### UNCOMMENT cell below to run the rest of the code

In [101]:
# Connect to the local database
connection_string = f'{username}:{password}@localhost:5432/hurricanes_db'
engine = create_engine(f'postgresql://{connection_string}')

In [102]:
# genre.to_sql(name='genre', con=engine, if_exists='append', index=False)

In [103]:
# writer.to_sql(name='writer', con=engine, if_exists='append', index=False)

In [104]:
# director.to_sql(name='director', con=engine, if_exists='append', index=False)

In [105]:
# rating.to_sql(name='rating', con=engine, if_exists='append', index=False)

In [106]:
# company.to_sql(name='company', con=engine, if_exists='append', index=False)

In [107]:
# actor.to_sql(name='actor', con=engine, if_exists='append', index=False)

In [108]:
# movies.to_sql(name='movies', con=engine, if_exists='append', index=False)

In [109]:
# company_movie.to_sql(name='company_movie', con=engine, if_exists='append', index=False)

In [110]:
hurricanes_df.head()

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [111]:
hurricanes_df.to_sql(name="hurricanes", con=engine, if_exists='append', index=False)

In [112]:
cost_df.to_sql(name="cost", con=engine, if_exists='append', index=False)

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "category" of relation "cost" does not exist
LINE 1: ...TO cost (name, damage_usd, norm_damage_usd, year, category) ...
                                                             ^

[SQL: INSERT INTO cost (name, damage_usd, norm_damage_usd, year, category) VALUES (%(name)s, %(damage_usd)s, %(norm_damage_usd)s, %(year)s, %(category)s)]
[parameters: ({'name': 'Katrina', 'damage_usd': 125.0, 'norm_damage_usd': 116.9, 'year': 2005, 'category': 'Category\xa05 hurricane'}, {'name': 'Harvey', 'damage_usd': 125.0, 'norm_damage_usd': 62.2, 'year': 2017, 'category': 'Category\xa04 hurricane'}, {'name': 'Irma', 'damage_usd': 77.2, 'norm_damage_usd': 31.0, 'year': 2017, 'category': 'Category\xa05 hurricane'}, {'name': 'Sandy', 'damage_usd': 68.7, 'norm_damage_usd': 73.5, 'year': 2012, 'category': 'Category 3 hurricane'}, {'name': 'Ike', 'damage_usd': 38.0, 'norm_damage_usd': 35.2, 'year': 2008, 'category': 'Category\xa04 hurricane'}, {'name': 'Wilma', 'damage_usd': 27.4, 'norm_damage_usd': 31.9, 'year': 2005, 'category': 'Category\xa05 hurricane'}, {'name': 'Andrew', 'damage_usd': 27.3, 'norm_damage_usd': 106.0, 'year': 1992, 'category': 'Category\xa05 hurricane'}, {'name': 'Ivan', 'damage_usd': 26.1, 'norm_damage_usd': 25.9, 'year': 2004, 'category': 'Category\xa05 hurricane'}  ... displaying 10 of 54 total bound parameter sets ...  {'name': 'Emily', 'damage_usd': 1.01, 'norm_damage_usd': 6.0, 'year': 2005, 'category': 'Category\xa05 hurricane'}, {'name': 'Bonnie', 'damage_usd': 1.0, 'norm_damage_usd': 6.0, 'year': 1998, 'category': 'Category\xa03 hurricane'})]
(Background on this error at: http://sqlalche.me/e/13/f405)

In [None]:
fatal_df.to_sql(name="fatalities", con=engine, if_exists='append', index=False)