In [1]:
# Import modules
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from selenium import webdriver
import pandas as pd
import time
from secret import username, password
from sqlalchemy import create_engine
import csv
import numpy as np
import html5lib

### Load main hurricane dataset from csv to pandas dataframe

In [2]:
main_csv_df = pd.read_csv("Data/Hurricane Data 2019.csv")
main_csv_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [3]:
main_df = main_csv_df #.drop(columns=("Unnamed: 8"))
main_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [4]:
main_df.columns = main_df.columns.str.lower()

In [5]:
main_df.rename(columns={'maximumwind':'max_wind','airpressure':'air_pressure'}, inplace=True)
main_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [6]:
# change format for latitudes and longitudes
longitudes = main_df['longitude']
long_list = []
for i in range(len(longitudes)):
    longitude = longitudes[i]
    W = 'W' in longitude
    d, m = map(float, longitude[:-1].split('.'))
    longitude = (d + m / 60.) * (-1 if W else 1)
    long_list.append(round(longitude, 2))
latitudes = main_df['latitude']
lat_list = []
for i in range(len(latitudes)):
    latitude = latitudes[i]
    N = 'N' in latitude
    d, m = map(float, latitude[:-1].split('.'))
    latitude = (d + m / 60.) * (1 if N else -1)
    lat_list.append(round(latitude, 2))
main_df['latitude_decimal'] = lat_list
main_df['longitude_decimal'] = long_list

In [7]:
len(main_df)

29974

In [8]:
clean_df = main_df.dropna()
clean_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.00,-60.00
...,...,...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970,52.03,-9.05
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972,52.03,-8.15
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974,51.13,-8.03
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976,51.07,-6.00


In [9]:
len(clean_df)

29974

### Create year column 
#### Group by hurricane name to assign unique ID

In [10]:
# Extract year based on date column
dates = clean_df['date']
years = []
for i in range(len(dates)):
    split_str = int(str(dates[i])[:4])
    years.append(split_str)
clean_df['year'] = years
clean_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950


### Rename hurricanes to Capitalized

In [11]:
names = clean_df['name']
names_capitalized = names.str.title()
names_capitalized
clean_df['name'] = names_capitalized
clean_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10,1950
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.00,-60.00,1950
...,...,...,...,...,...,...,...,...,...,...,...
29969,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019
29970,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019
29971,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019
29972,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019


In [12]:
len(clean_df)

29974

#### Save cleaned csv df with all hurricanes data - to use when combined with ids

In [13]:
# Save cleaned csv df with all hurricanes data - to use when combined with ids
clean_df_no_ids = clean_df

### Extract unique hurricanes names to create and associate ids

In [14]:
# Getting the duplicate movie by title and star released. 
dupl_df = clean_df.loc[clean_df.duplicated(subset=["name", "year"]), "name"].unique()
dupl_df

array(['Able', 'Baker', 'Charlie', 'Dog', 'Easy', 'Fox', 'George', 'How',
       'Item', 'Jig', 'King', 'Unnamed', 'Love', 'Mike', 'Alice',
       'Barbara', 'Carol', 'Dolly', 'Edna', 'Florence', 'Gail', 'Hazel',
       'Gilda', 'Brenda', 'Connie', 'Diane', 'Edith', 'Flora', 'Gladys',
       'Ione', 'Hilda', 'Janet', 'Katie', 'Anna', 'Betsy', 'Carla',
       'Dora', 'Ethel', 'Flossy', 'Greta', 'Audrey', 'Bertha', 'Carrie',
       'Debbie', 'Esther', 'Frieda', 'Alma', 'Becky', 'Cleo', 'Daisy',
       'Ella', 'Fifi', 'Gerda', 'Helene', 'Ilsa', 'Janice', 'Arlene',
       'Beulah', 'Cindy', 'Debra', 'Gracie', 'Hannah', 'Irene', 'Judith',
       'Abby', 'Donna', 'Frances', 'Hattie', 'Jenny', 'Inga', 'Celia',
       'Ginny', 'Helena', 'Isbell', 'Elena', 'Dorothy', 'Faith', 'Hallie',
       'Inez', 'Lois', 'Chloe', 'Doria', 'Fern', 'Ginger', 'Heidi',
       'Candy', 'Blanche', 'Camille', 'Eve', 'Francelia', 'Holly', 'Kara',
       'Laurie', 'Martha', 'Felice', 'Beth', 'Kristy', 'Laura', 'Alph

In [15]:
len(dupl_df)

295

In [16]:
# Remove the duplicates 
clean_df = clean_df.drop_duplicates(subset=['name','year'], keep='first')
clean_df.head()
clean_df.to_csv("Data/clean_df.csv")

In [17]:
len(clean_df)

825

### Add hurricane index column

In [18]:
hurricanes_df = clean_df
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [19]:
hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [20]:
hurricanes_df = hurricanes_df.reset_index(drop=True)
hurricanes_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...
820,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [21]:
# Start index from 1
hurricanes_df.index += 1
hurricanes_df.head(100)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...
96,19590920,Gracie,1200,TD,19.8N,68.3W,25,1009,19.13,-68.05,1959
97,19590927,Hannah,1200,TD,26.8N,49.9W,30,-999,26.13,-49.15,1959
98,19591006,Irene,1800,TD,23.2N,92.5W,30,-999,23.03,-92.08,1959
99,19591014,Judith,600,TD,14.0N,73.5W,30,-999,14.00,-73.08,1959


In [22]:
# Reset index
hurricanes_df = hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,index,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [23]:
hurricanes_df = hurricanes_df.rename(columns={'index': 'hurricane_id'})
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
820,821,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,822,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,823,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,824,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [24]:
hurricanes_df.to_csv("Data/00_Hurricane Data 2019_with_unique_IDs.csv")

In [25]:
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
820,821,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,822,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,823,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,824,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


#### Merge clean_df_no_ids with hurricanes_df to get a master df with all data

In [26]:
clean_df_no_ids

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10,1950
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.00,-60.00,1950
...,...,...,...,...,...,...,...,...,...,...,...
29969,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019
29970,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019
29971,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019
29972,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019


In [27]:
# Filter columns from hurricanes_df before merge
temp_hurr_df = hurricanes_df[['name','year','hurricane_id']]
temp_hurr_df

Unnamed: 0,name,year,hurricane_id
0,Able,1950,1
1,Baker,1950,2
2,Charlie,1950,3
3,Dog,1950,4
4,Easy,1950,5
...,...,...,...
820,Nestor,2019,821
821,Olga,2019,822
822,Pablo,2019,823
823,Rebekah,2019,824


In [28]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
master_df = pd.merge(clean_df_no_ids, temp_hurr_df,  
                  how='left', left_on=['name','year'], right_on = ['name','year'])

In [29]:
master_df.head(50)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950,1
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950,1
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950,1
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950,1
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950,1
5,19500813,Able,600,TS,20.7N,61.1W,50,-999,20.12,-61.02,1950,1
6,19500813,Able,1200,TS,21.3N,62.2W,55,-999,21.05,-62.03,1950,1
7,19500813,Able,1800,TS,22.0N,63.2W,55,997,22.0,-63.03,1950,1
8,19500814,Able,0,TS,22.7N,63.8W,60,995,22.12,-63.13,1950,1
9,19500814,Able,600,TS,23.1N,64.6W,60,-999,23.02,-64.1,1950,1


In [30]:
master_df.head(60)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950,1
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950,1
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950,1
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950,1
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950,1
5,19500813,Able,600,TS,20.7N,61.1W,50,-999,20.12,-61.02,1950,1
6,19500813,Able,1200,TS,21.3N,62.2W,55,-999,21.05,-62.03,1950,1
7,19500813,Able,1800,TS,22.0N,63.2W,55,997,22.0,-63.03,1950,1
8,19500814,Able,0,TS,22.7N,63.8W,60,995,22.12,-63.13,1950,1
9,19500814,Able,600,TS,23.1N,64.6W,60,-999,23.02,-64.1,1950,1


In [31]:
master_df.tail(60)

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
29914,20191028,Pablo,1200,EX,46.5N,17.9W,35,995,46.08,-17.15,2019,823
29915,20191028,Pablo,1800,EX,46.6N,17.6W,35,995,46.1,-17.1,2019,823
29916,20191029,Pablo,0,EX,46.8N,16.4W,30,995,46.13,-16.07,2019,823
29917,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019,824
29918,20191027,Rebekah,600,EX,40.4N,48.0W,50,989,40.07,-48.0,2019,824
29919,20191027,Rebekah,1200,EX,40.2N,45.0W,60,976,40.03,-45.0,2019,824
29920,20191027,Rebekah,1800,EX,40.2N,43.1W,65,968,40.03,-43.02,2019,824
29921,20191028,Rebekah,0,EX,40.5N,41.1W,70,965,40.08,-41.02,2019,824
29922,20191028,Rebekah,600,EX,41.2N,39.5W,70,966,41.03,-39.08,2019,824
29923,20191028,Rebekah,1200,EX,42.4N,39.4W,65,967,42.07,-39.07,2019,824


In [32]:
len(master_df)

29974

### Merge main df with costliest df on name and keep hurricane_id as foreign key

### Mac (Irina) Connection

In [33]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

/usr/local/bin/chromedriver


In [34]:
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# browser = Browser('chrome', **executable_path, headless=False)

### Windows (Clay) Connection

In [35]:
# executable_path = {'executable_path': 'C:/Users/Clay/chromedriver.exe'}
# browser = Browser('chrome', **executable_path, headless=False)

### Dataset 1 - Web Scrape Wikipedia Most Costliest hurricanes

In [36]:
url = 'https://en.wikipedia.org/wiki/List_of_costliest_Atlantic_hurricanes'
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_costly_hurricanes = tables[0]
print("Read and save table")
browser.quit()
most_costly_hurricanes.head()

Open web browser
Read and save table


Unnamed: 0,Name,Nominal damage(Billions USD),Normalized damage(Billions USD),Season,Storm classificationat peak intensity,Areas affected,References
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane,Louisiana Mississippi The Bahamas United State...,[4][5]
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane,Texas Louisiana South America Central America ...,[5][6]
2,Maria,$90.0,,2017,Category 5 hurricane,Puerto Rico Lesser Antilles Greater Antilles C...,[7]
3,Irma,$77.2,$31.0,2017,Category 5 hurricane,Lesser Antilles Greater Antilles Caribbean Sea...,[5]
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane,The Caribbean United States East Coast Eastern...,[5][8]


In [37]:
# Create a copy of the df
renamed_cost_df = most_costly_hurricanes

In [38]:
# # Rename columns
renamed_cost_df = renamed_cost_df.rename(columns={'Name': 'name','Nominal damage(Billions USD)': 'damage_usd', 'Normalized damage(Billions USD)': 'norm_damage_usd', 
                'Season': 'year', 'Storm classificationat peak intensity': 'category', 'Areas affected':'states'})

In [39]:
# Drop Reference column
renamed_cost_df = renamed_cost_df.drop(columns=['References','states'])

In [40]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


In [41]:
# Save to the csv file
renamed_cost_df.to_csv("Data/01_most_costly_hurricanes_wiki_web.csv")

In [42]:
# Read csv file into df
renamed_cost_df = pd.read_csv('Data/01_most_costly_hurricanes_wiki_web.csv', index_col=[0])

In [43]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


In [44]:
len(renamed_cost_df)

61

## Prepare Cost data

In [45]:
cost_df = renamed_cost_df

In [46]:
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('$','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('<','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('>','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('$','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('<','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('>','')
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].astype(float)
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].dropna()
cost_df['damage_usd'] = cost_df['damage_usd'].astype(float)
cost_df['damage_usd'] = cost_df['damage_usd'].dropna()
cost_df = cost_df.dropna()
len(cost_df) # 54 count

54

In [47]:
len(cost_df)

54

In [48]:
len(cost_df['name'].unique())

54

In [49]:
cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,125.0,116.9,2005,Category 5 hurricane
1,Harvey,125.0,62.2,2017,Category 4 hurricane
3,Irma,77.2,31.0,2017,Category 5 hurricane
4,Sandy,68.7,73.5,2012,Category 3 hurricane
5,Ike,38.0,35.2,2008,Category 4 hurricane


In [50]:
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
820,821,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
821,822,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
822,823,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
823,824,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [51]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_df = pd.merge(cost_df, hurricanes_df,  
                  how='left', left_on=['name','year'], right_on = ['name','year'])

In [52]:
len(new_df)

54

In [53]:
new_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,Katrina,125.0,116.9,2005,Category 5 hurricane,584,20050823,1800,TD,23.1N,75.1W,30,1008,23.02,-75.02
1,Harvey,125.0,62.2,2017,Category 4 hurricane,780,20170816,600,LO,13.7N,45.8W,25,1013,13.12,-45.13
2,Irma,77.2,31.0,2017,Category 5 hurricane,781,20170830,0,TD,16.1N,26.9W,30,1008,16.02,-26.15
3,Sandy,68.7,73.5,2012,Category 3 hurricane,718,20121021,1800,LO,14.3N,77.4W,25,1006,14.05,-77.07
4,Ike,38.0,35.2,2008,Category 4 hurricane,640,20080901,600,TD,17.2N,37.0W,30,1006,17.03,-37.0


In [54]:
# Sort merged df by norm_damage_usd column in descending order 
sorted_new_df = new_df.sort_values(by=['norm_damage_usd'], ascending=False)
sorted_new_df.head(54)

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,Katrina,125.0,116.9,2005,Category 5 hurricane,584,20050823,1800,TD,23.1N,75.1W,30,1008,23.02,-75.02
6,Andrew,27.3,106.0,1992,Category 5 hurricane,405,19920816,1800,TD,10.8N,35.5W,25,1010,10.13,-35.08
3,Sandy,68.7,73.5,2012,Category 3 hurricane,718,20121021,1800,LO,14.3N,77.4W,25,1006,14.05,-77.07
1,Harvey,125.0,62.2,2017,Category 4 hurricane,780,20170816,600,LO,13.7N,45.8W,25,1013,13.12,-45.13
4,Ike,38.0,35.2,2008,Category 4 hurricane,640,20080901,600,TD,17.2N,37.0W,30,1006,17.03,-37.0
5,Wilma,27.4,31.9,2005,Category 5 hurricane,597,20051015,1800,TD,17.6N,78.5W,25,1004,17.1,-78.08
2,Irma,77.2,31.0,2017,Category 5 hurricane,781,20170830,0,TD,16.1N,26.9W,30,1008,16.02,-26.15
9,Charley,16.9,26.9,2004,Category 4 hurricane,559,20040809,1200,TD,11.4N,59.2W,30,1010,11.07,-59.03
47,Camille,1.42,26.4,1969,Category 5 hurricane,181,19690814,0,TD,18.3N,79.7W,30,-999,18.05,-79.12
33,Agnes,2.1,26.0,1972,Category 1 hurricane,213,19720614,1200,TD,20.0N,89.0W,25,-999,20.0,-89.0


In [55]:
cost_df = sorted_new_df[['hurricane_id','name','year','norm_damage_usd','damage_usd']]
cost_df.head()

Unnamed: 0,hurricane_id,name,year,norm_damage_usd,damage_usd
0,584,Katrina,2005,116.9,125.0
6,405,Andrew,1992,106.0,27.3
3,718,Sandy,2012,73.5,68.7
1,780,Harvey,2017,62.2,125.0
4,640,Ike,2008,35.2,38.0


## Dataset 2 - Deadliest hurricanes in the atlantic
#### https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes

In [56]:
url = 'https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes'
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_deadliest_hurricanes = tables[2]
print("Read and save table")
browser.quit()
most_deadliest_hurricanes.head()

Open web browser
Read and save table


Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs
0,San Marcos,"October 5–14, 1870",Category 3 hurricane,115 mph (185 km/h),959 hPa (28.32 inHg),"Cuba, Florida, Bahamas",,"800–2,000",[1]
1,Sea Islands,"August 15 – September 2, 1893",Category 3 hurricane,120 mph (195 km/h),954 hPa (28.17 inHg),"Georgia, South Carolina",,"1,000–2,000",
2,Chenier Caminada,"September 27 – October 5, 1893",Category 4 hurricane,135 mph (215 km/h),948 hPa (27.99 inHg),"Yucatán Peninsula, Louisiana, Mississippi",,"1,800–2,000",
3,San Ciriaco,"August 3 – September 4, 1899",Category 4 hurricane,150 mph (240 km/h),930 hPa (27.46 inHg),"Lesser Antilles, Puerto Rico, Eastern United S...",,3855,
4,Galveston,"August 27 – September 15, 1900",Category 4 hurricane,145 mph (230 km/h),936 hPa (27.64 inHg),"The Caribbean, Texas",,"8,000–12,000",


In [57]:
most_fatal_hurricanes = most_deadliest_hurricanes.head(22)

In [58]:
most_fatal_hurricanes.tail()

Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs
17,Mitch,"October 22 – November 5, 1998",Category 5 hurricane,180 mph (285 km/h),905 hPa (26.72 inHg),"Central America, Yucatán Peninsula, South Florida",,"11,374–19,000",[7][8][9]
18,Jeanne,"September 13–28, 2004",Category 3 hurricane,120 mph (195 km/h),950 hPa (28.05 inHg),"The Caribbean, Eastern United States",,3037,[5][10][11][12]
19,Katrina,"August 23–30, 2005",Category 5 hurricane,175 mph (280 km/h),902 hPa (26.64 inHg),"Bahamas, United States Gulf Coast",,"1,245–1,836",[13]
20,Stan,"October 1–5, 2005",Category 1 hurricane,80 mph (130 km/h),977 hPa (28.85 inHg),"Mexico, Central America",,1668,[5][14]
21,Maria,"September 16 – October 2, 2017",Category 5 hurricane,175 mph (280 km/h),908 hPa (26.81 inHg),"Lesser Antilles (particularly Dominica), Puert...",,3059,[15]


In [59]:
len(most_fatal_hurricanes)

22

In [60]:
# Create a copy of the df
fatal_df = most_fatal_hurricanes
len(fatal_df)

22

In [61]:
dates_active = fatal_df['Dates active']
dates_active

0                  October 5–14, 1870
1       August 15 – September 2, 1893
2      September 27 – October 5, 1893
3        August 3 – September 4, 1899
4      August 27 – September 15, 1900
5                  August 20–28, 1909
6                September 6–20, 1928
7      August 29 – September 17, 1930
8                September 6–13, 1931
9      October 30 – November 13, 1932
10                    June 4–18, 1934
11                October 18–27, 1935
12              September 21–30, 1955
13    September 26 – October 12, 1963
14              September 14–24, 1974
15      August 25 – September 8, 1979
16                November 8–21, 1994
17      October 22 – November 5, 1998
18              September 13–28, 2004
19                 August 23–30, 2005
20                  October 1–5, 2005
21     September 16 – October 2, 2017
Name: Dates active, dtype: object

In [62]:
dates_list = []
for i in range(len(dates_active)):
    split_str = dates_active[i].split(",")[-1]
    print(split_str)
    dates_list.append(int(split_str))
print(len(dates_list))

 1870
 1893
 1893
 1899
 1900
 1909
 1928
 1930
 1931
 1932
 1934
 1935
 1955
 1963
 1974
 1979
 1994
 1998
 2004
 2005
 2005
 2017
22


In [63]:
fatal_df['year'] = dates_list
fatal_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fatal_df['year'] = dates_list


Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs,year
0,San Marcos,"October 5–14, 1870",Category 3 hurricane,115 mph (185 km/h),959 hPa (28.32 inHg),"Cuba, Florida, Bahamas",,"800–2,000",[1],1870
1,Sea Islands,"August 15 – September 2, 1893",Category 3 hurricane,120 mph (195 km/h),954 hPa (28.17 inHg),"Georgia, South Carolina",,"1,000–2,000",,1893
2,Chenier Caminada,"September 27 – October 5, 1893",Category 4 hurricane,135 mph (215 km/h),948 hPa (27.99 inHg),"Yucatán Peninsula, Louisiana, Mississippi",,"1,800–2,000",,1893
3,San Ciriaco,"August 3 – September 4, 1899",Category 4 hurricane,150 mph (240 km/h),930 hPa (27.46 inHg),"Lesser Antilles, Puerto Rico, Eastern United S...",,3855,,1899
4,Galveston,"August 27 – September 15, 1900",Category 4 hurricane,145 mph (230 km/h),936 hPa (27.64 inHg),"The Caribbean, Texas",,"8,000–12,000",,1900


In [64]:
# Drop columns
fatal_df = fatal_df.drop(columns=['Dates active','Saffir-Simpson Category',
                                          'Sustainedwind speeds','Pressure','Areas affected','Damage(US$)','Refs'])

In [65]:
fatal_df = fatal_df.rename(columns={'Name': 'name', 'Deaths': 'deaths'})
fatal_df.head()

Unnamed: 0,name,deaths,year
0,San Marcos,"800–2,000",1870
1,Sea Islands,"1,000–2,000",1893
2,Chenier Caminada,"1,800–2,000",1893
3,San Ciriaco,3855,1899
4,Galveston,"8,000–12,000",1900


In [66]:
fatal_df.head()

Unnamed: 0,name,deaths,year
0,San Marcos,"800–2,000",1870
1,Sea Islands,"1,000–2,000",1893
2,Chenier Caminada,"1,800–2,000",1893
3,San Ciriaco,3855,1899
4,Galveston,"8,000–12,000",1900


In [67]:
fatal_df["deaths"] = fatal_df["deaths"].str.replace(',','')
fatal_df["deaths"] = fatal_df["deaths"].str.replace('–',',')
fatal_df.head()

Unnamed: 0,name,deaths,year
0,San Marcos,8002000,1870
1,Sea Islands,10002000,1893
2,Chenier Caminada,18002000,1893
3,San Ciriaco,3855,1899
4,Galveston,800012000,1900


In [68]:
fatal_list = fatal_df["deaths"]

In [69]:
num_avg = []
for i in range(len(fatal_list)):
    if "," in fatal_list[i]:
        num0 = fatal_list[i].split(",")[0]
        num1 = fatal_list[i].split(",")[1]
        num = (int(num1) + int(num0))/2
        num_avg.append(num)
    else: num_avg.append(fatal_list[i])
print(num_avg)

[1400.0, 1500.0, 1900.0, '3855', 10000.0, '4000', '4075', 5000.0, 2000.0, 2803.5, 2500.0, '2150', '1023', '7193', '8210', '2068', '1152', 15187.0, '3037', 1540.5, '1668', '3059']


In [70]:
fatal_df = fatal_df.drop(columns="deaths")

In [71]:
fatal_df['deaths'] = num_avg

In [72]:
fatal_df = fatal_df.head(22)

In [73]:
fatal_df.head()

Unnamed: 0,name,year,deaths
0,San Marcos,1870,1400
1,Sea Islands,1893,1500
2,Chenier Caminada,1893,1900
3,San Ciriaco,1899,3855
4,Galveston,1900,10000


In [74]:
len(fatal_df)

22

In [75]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_fatal_merged_df = pd.merge(fatal_df, hurricanes_df,  how='left', 
                               left_on=['name', 'year'], right_on = ['name','year'])
new_fatal_merged_df.head()

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,San Marcos,1870,1400,,,,,,,,,,
1,Sea Islands,1893,1500,,,,,,,,,,
2,Chenier Caminada,1893,1900,,,,,,,,,,
3,San Ciriaco,1899,3855,,,,,,,,,,
4,Galveston,1900,10000,,,,,,,,,,


In [76]:
new_fatal_merged_df.tail(60)

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,San Marcos,1870,1400.0,,,,,,,,,,
1,Sea Islands,1893,1500.0,,,,,,,,,,
2,Chenier Caminada,1893,1900.0,,,,,,,,,,
3,San Ciriaco,1899,3855.0,,,,,,,,,,
4,Galveston,1900,10000.0,,,,,,,,,,
5,Monterrey,1909,4000.0,,,,,,,,,,
6,Okeechobee,1928,4075.0,,,,,,,,,,
7,San Zenon,1930,5000.0,,,,,,,,,,
8,Belize,1931,2000.0,,,,,,,,,,
9,Cuba,1932,2803.5,,,,,,,,,,


In [77]:
# Filter out unnamed hurricanes before 1975
fatal_df = new_fatal_merged_df[new_fatal_merged_df['year'] > 1975]
fatal_df.head()

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
15,David,1979,2068.0,278.0,19790825.0,1200.0,TD,11.7N,36.1W,25.0,1008.0,11.12,-36.02
16,Gordon,1994,1152.0,427.0,19941108.0,1200.0,TD,11.9N,82.3W,25.0,1009.0,11.15,-82.05
17,Mitch,1998,15187.0,481.0,19981022.0,0.0,TD,11.6N,76.1W,30.0,1002.0,11.1,-76.02
18,Jeanne,2004,3037.0,567.0,20040913.0,1800.0,TD,15.9N,60.0W,25.0,1010.0,15.15,-60.0
19,Katrina,2005,1540.5,584.0,20050823.0,1800.0,TD,23.1N,75.1W,30.0,1008.0,23.02,-75.02


In [78]:
fatal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 15 to 21
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               7 non-null      object 
 1   year               7 non-null      int64  
 2   deaths             7 non-null      object 
 3   hurricane_id       7 non-null      float64
 4   date               7 non-null      float64
 5   time               7 non-null      float64
 6   status             7 non-null      object 
 7   latitude           7 non-null      object 
 8   longitude          7 non-null      object 
 9   max_wind           7 non-null      float64
 10  air_pressure       7 non-null      float64
 11  latitude_decimal   7 non-null      float64
 12  longitude_decimal  7 non-null      float64
dtypes: float64(7), int64(1), object(5)
memory usage: 784.0+ bytes


In [79]:
# Convert hurricane_id back to integer
fatal_df['hurricane_id'] = fatal_df['hurricane_id'].astype(int)
fatal_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fatal_df['hurricane_id'] = fatal_df['hurricane_id'].astype(int)


Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
15,David,1979,2068.0,278,19790825.0,1200.0,TD,11.7N,36.1W,25.0,1008.0,11.12,-36.02
16,Gordon,1994,1152.0,427,19941108.0,1200.0,TD,11.9N,82.3W,25.0,1009.0,11.15,-82.05
17,Mitch,1998,15187.0,481,19981022.0,0.0,TD,11.6N,76.1W,30.0,1002.0,11.1,-76.02
18,Jeanne,2004,3037.0,567,20040913.0,1800.0,TD,15.9N,60.0W,25.0,1010.0,15.15,-60.0
19,Katrina,2005,1540.5,584,20050823.0,1800.0,TD,23.1N,75.1W,30.0,1008.0,23.02,-75.02


In [80]:
fatal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 15 to 21
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               7 non-null      object 
 1   year               7 non-null      int64  
 2   deaths             7 non-null      object 
 3   hurricane_id       7 non-null      int64  
 4   date               7 non-null      float64
 5   time               7 non-null      float64
 6   status             7 non-null      object 
 7   latitude           7 non-null      object 
 8   longitude          7 non-null      object 
 9   max_wind           7 non-null      float64
 10  air_pressure       7 non-null      float64
 11  latitude_decimal   7 non-null      float64
 12  longitude_decimal  7 non-null      float64
dtypes: float64(6), int64(2), object(5)
memory usage: 784.0+ bytes


In [81]:
# Drop columns
fatal_df = fatal_df[['hurricane_id','name','year','deaths']]

In [82]:
fatal_df

Unnamed: 0,hurricane_id,name,year,deaths
15,278,David,1979,2068.0
16,427,Gordon,1994,1152.0
17,481,Mitch,1998,15187.0
18,567,Jeanne,2004,3037.0
19,584,Katrina,2005,1540.5
20,592,Stan,2005,1668.0
21,785,Maria,2017,3059.0


### Merge total_gross_info and domestic_gross_info tables on 'name' column

In [83]:
# # combine total and domestic tables on left using name of the movie column
# combined_gross_info = pd.merge(domestic_gross_info, total_gross_info, how='left', on="name")

In [84]:
# combined_gross_info.head()

In [85]:
# len(combined_gross_info)AssertionError

In [86]:
# # Add new column and compute the international_total_revenue
# combined_gross_info['international_revenue_usd'] = \
#         (combined_gross_info['total_revenue_usd'] - combined_gross_info['domestic_revenue_usd'])

In [87]:
# # Rename columns
# combined_gross_info = combined_gross_info.rename(columns={'star':'actor'})

In [88]:
# # Getting the duplicate movie by title and star released. 
# dupl_combined_df = combined_gross_info.loc[combined_gross_info.duplicated(subset=["name","director", "year"]), "name"].unique()
# dupl_combined_df

In [89]:
# # Get all the data for the duplicate movie 
# dupl_df = combined_gross_info.loc[combined_gross_info["name"] == "Clash of the titans"]
# dupl_df

#### (removed) Use pandas to push data into database

In [90]:
# # Check existing tables
# engine.table_names()

In [91]:
# # Create domesic gross revenue table in the database
# domestic_gross_info.to_sql(name='domestic_gross_info', con=engine, if_exists='append', index=False)

In [92]:
# type(total_gross_info['name'])

In [93]:
# total_gross_info

In [94]:
# domestic_gross_info

In [95]:
# # Remove the duplicates
# # clean_combined_df = combined_gross_info.drop_duplicates(subset=['name','director','year'], keep='first')
# clean_combined_df

In [96]:
# len(clean_combined_df)

### Create movies table from combined table

In [97]:
# # Re-arrange the columns and write into new table
# movies = clean_combined_df[['name','total_revenue_usd','international_revenue_usd','domestic_revenue_usd',
#                               'director','actor','writer','genre','rating','company','country','year']]

In [98]:
# movies.head()

In [99]:
# # Fill NaN values with zeroes
# movies[['total_revenue_usd','international_revenue_usd','domestic_revenue_usd']] = \
#     movies[['total_revenue_usd','international_revenue_usd','domestic_revenue_usd']].fillna(0)

### Extract Actor data into a separate table

In [100]:
# # Save star_actor data into a separate table
# actor = movies['actor'].unique()
# len(actor)

In [101]:
# actor[:5]

In [102]:
# actor = pd.DataFrame({'actor': actor})
# actor.head(2)

In [103]:
# actor.index += 1
# actor

In [104]:
# actor = actor.reset_index()
# actor.head(2)

In [105]:
# actor = actor.rename(columns={'index': 'actor_id'})
# actor.head()

In [106]:
# len(actor)

### Merge movie and actor df on actor column

In [107]:
# # combine movie and tables
# movie_actor = pd.merge(movies, actor, how='outer', on="actor")

In [108]:
# # Extract only movie_id and actor_id columns
# movie_actor = movie_actor[['movie_id', 'actor_id']]
# movie_actor

In [109]:
# # Save to csv
# movie_actor.to_csv('csv_files/movie_actor.csv')

### Create director_movie table

In [110]:
# # Save star_actor data into a separate table
# director = movies['director'].unique()
# len(director)

In [111]:
# director[:5]

In [112]:
# director = pd.DataFrame({'director': director})
# director.head(2)

In [113]:
# director.index += 1
# director

In [114]:
# director = director.reset_index()
# director.head(2)

In [115]:
# director = director.rename(columns={'index': 'director_id'})
# director.head()

In [116]:
# len(director)

### Merge movie and director df on actor column

In [117]:
# # combine movie and tables
# movie_director = pd.merge(movies, director, how='outer', on="director")

In [118]:
# # Extract only movie_id and actor_id columns
# movie_director = movie_director[['movie_id', 'director_id']]
# movie_director

In [119]:
# # Save to csv
# movie_director.to_csv('csv_files/movie_director.csv')

### Create company_movie table

In [120]:
# # Save company_movie data into a separate table
# company = movies['company'].unique()
# len(company)

In [121]:
# company[:5]

In [122]:
# company = pd.DataFrame({'company': company})
# company.head(2)

In [123]:
# company.index += 1
# company

In [124]:
# company = company.reset_index()
# company.head(2)

In [125]:
# company = company.rename(columns={'index': 'company_id'})
# company.head()

In [126]:
# len(company)

### Merge movie and company df on actor column

In [127]:
# # combine movie and tables
# company_movie = pd.merge(movies, company, how='outer', on="company")

In [128]:
# # Extract only movie_id and actor_id columns
# company_movie = company_movie[['movie_id', 'company_id']]
# company_movie

In [129]:
# # Save to csv
# company_movie.to_csv('csv_files/company_movie.csv')

## Extract Writer data into a separate table

In [130]:
# # Save writer data into a separate table
# writer = movies['writer'].unique()
# len(writer)

In [131]:
# writer[:5]

In [132]:
# writer = pd.DataFrame({'writer': writer})
# writer.head(2)

In [133]:
# writer.index += 1
# writer

In [134]:
# writer = writer.reset_index()
# writer.head(2)

In [135]:
# writer = writer.rename(columns={'index': 'writer_id'})
# writer.head()

In [136]:
# len(writer)

### Merge writer_id column to movies table

In [137]:
# # combine movie and tables
# movies = pd.merge(movies, writer, how='outer', on="writer")

In [138]:
# movies.head()

## Extract Genre data into a separate table

In [139]:
# # Save genre data into a separate table
# genre = movies['genre'].unique()
# len(genre)

In [140]:
# genre[:5]

In [141]:
# genre = pd.DataFrame({'genre': genre})
# genre.head(2)

In [142]:
# genre.index += 1
# genre

In [143]:
# genre = genre.reset_index()
# genre.head(2)

In [144]:
# genre = genre.rename(columns={'index': 'genre_id'})
# genre.head()

In [145]:
# len(genre)

### Merge genre_id column to movies table

In [146]:
# # combine movies and genre
# movies = pd.merge(movies, genre, how='outer', on="genre")

In [147]:
# movies.head()

In [148]:
# movies.to_csv('csv_files/movies_genre_ids.csv')

## Extract rating data into a separate table

In [149]:
# # Save rating data into a separate table
# rating = movies['rating'].unique()
# len(rating)

In [150]:
# rating[:5]

In [151]:
# rating = pd.DataFrame({'rating': rating})
# rating.head(2)

In [152]:
# rating.index += 1

In [153]:
# rating = rating.reset_index()
# rating.head(2)

In [154]:
# rating = rating.rename(columns={'index': 'rating_id'})
# rating.head()

In [155]:
# len(rating)

### Merge rating_id column to movies table

In [156]:
# # combine movies and rating
# movies = pd.merge(movies, rating, how='outer', on="rating")

In [157]:
# movies.head()

In [158]:
# movies.to_csv('csv_files/movies_rating_ids.csv')

# Remove unnecessary columns from movies

In [159]:
# movies.head(2)

In [160]:
# movies = movies[['movie_id','country','genre_id','writer_id','rating_id','name','year','domestic_revenue_usd',
#                  'international_revenue_usd', 'total_revenue_usd']]

## Push table to PostgreSQL

### !!! Please import QuickDBD-export.sql in the repo root folder file in pgAdmin
### !!! Run SQL file to generate the tables and reference links
### UNCOMMENT cell below to run the rest of the code

In [161]:
# Connect to the local database
connection_string = f'{username}:{password}@localhost:5432/hurricanes_db'
engine = create_engine(f'postgresql://{connection_string}')

In [162]:
hurricanes_df.to_sql(name="hurricanes", con=engine, if_exists='append', index=False)

In [163]:
master_df.to_sql(name="master", con=engine, if_exists='append', index=False)

In [164]:
cost_df.to_sql(name="cost", con=engine, if_exists='append', index=False)

In [165]:
fatal_df.to_sql(name="fatalities", con=engine, if_exists='append', index=False)

## Reading from postgreSQL for visualization