In [2]:
# Import modules
from bs4 import BeautifulSoup
import requests
from splinter import Browser
from selenium import webdriver
import pandas as pd
import time
from secret import username, password
from sqlalchemy import create_engine
import csv
import numpy as np
# import html5lib

### Load main hurricane dataset from csv to pandas dataframe

In [3]:
main_csv_df = pd.read_csv("Data/Hurricane Data 2019.csv")
main_csv_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [4]:
main_df = main_csv_df #.drop(columns=("Unnamed: 8"))
main_df

Unnamed: 0,Date,Name,Time,Status,Latitude,Longitude,MaximumWind,AirPressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [9]:
main_df.columns = main_df.columns.str.lower()

In [10]:
main_df.rename(columns={'maximumwind':'max_wind','airpressure':'air_pressure'}, inplace=True)
main_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999
...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976


In [11]:
# change format for latitudes and longitudes
longitudes = main_df['longitude']
long_list = []
for i in range(len(longitudes)):
    longitude = longitudes[i]
    W = 'W' in longitude
    d, m = map(float, longitude[:-1].split('.'))
    longitude = (d + m / 60.) * (-1 if W else 1)
    long_list.append(round(longitude, 2))
latitudes = main_df['latitude']
lat_list = []
for i in range(len(latitudes)):
    latitude = latitudes[i]
    N = 'N' in latitude
    d, m = map(float, latitude[:-1].split('.'))
    latitude = (d + m / 60.) * (1 if N else -1)
    lat_list.append(round(latitude, 2))
main_df['latitude_decimal'] = lat_list
main_df['longitude_decimal'] = long_list

In [12]:
len(main_df)

29974

In [14]:
clean_df = main_df.dropna()
clean_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.00,-60.00
...,...,...,...,...,...,...,...,...,...,...
29969,20191126,SEBASTIEN,1200,EX,52.2N,9.3W,45,970,52.03,-9.05
29970,20191126,SEBASTIEN,1800,EX,52.2N,8.9W,40,972,52.03,-8.15
29971,20191127,SEBASTIEN,0,EX,51.8N,8.2W,40,974,51.13,-8.03
29972,20191127,SEBASTIEN,600,EX,51.4N,6.0W,40,976,51.07,-6.00


In [15]:
len(clean_df)

29974

In [16]:
# Get names of indexes for which column name is "Unnamed"
indexNames = clean_df[clean_df['name'] == "Unnamed" ].index
# Delete these row indexes from dataFrame
clean_df.drop(indexNames , inplace=True)

In [17]:
len(clean_df)

29974

### Create year column 
#### Group by hurricane name to assign unique ID

In [18]:
# Extract year based on date column
dates = clean_df['date']
years = []
for i in range(len(dates)):
    split_str = int(str(dates[i])[:4])
    years.append(split_str)
clean_df['year'] = years
clean_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,ABLE,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,ABLE,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,ABLE,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,ABLE,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950
4,19500813,ABLE,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950


### Rename hurricanes to Capitalized

In [19]:
names = clean_df['name']
names_capitalized = names.str.title()
names_capitalized
clean_df['name'] = names_capitalized
clean_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10,1950
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.00,-60.00,1950
...,...,...,...,...,...,...,...,...,...,...,...
29969,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019
29970,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019
29971,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019
29972,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019


In [20]:
len(clean_df)

29974

In [21]:
# Get names of indexes for which column name is "Unnamed"
indexNames = clean_df[clean_df['name'] == "Unnamed" ].index
# Delete these row indexes from dataFrame
clean_df.drop(indexNames , inplace=True)

In [22]:
len(clean_df)

25042

#### Save cleaned csv df with all hurricanes data - to use when combined with ids

In [23]:
# Save cleaned csv df with all hurricanes data - to use when combined with ids
clean_df_no_ids = clean_df

In [24]:
# Check negative wind speeds converted to 0
min_wind_val = clean_df_no_ids['max_wind'].min()
print(min_wind_val)

10


In [25]:
# Replace negative wind speed with zeroes
clean_df_no_ids[clean_df_no_ids['max_wind'] < 0] = 0
clean_df_no_ids['max_wind']

0        35
1        40
2        45
3        50
4        50
         ..
29969    45
29970    40
29971    40
29972    40
29973    40
Name: max_wind, Length: 25042, dtype: int64

### Extract unique hurricanes names to create and associate ids

In [26]:
# Getting the duplicate movie by title and star released. 
dupl_df = clean_df.loc[clean_df.duplicated(subset=["name", "year"]), "name"].unique()
dupl_df

array(['Able', 'Baker', 'Charlie', 'Dog', 'Easy', 'Fox', 'George', 'How',
       'Item', 'Jig', 'King', 'Love', 'Mike', 'Alice', 'Barbara', 'Carol',
       'Dolly', 'Edna', 'Florence', 'Gail', 'Hazel', 'Gilda', 'Brenda',
       'Connie', 'Diane', 'Edith', 'Flora', 'Gladys', 'Ione', 'Hilda',
       'Janet', 'Katie', 'Anna', 'Betsy', 'Carla', 'Dora', 'Ethel',
       'Flossy', 'Greta', 'Audrey', 'Bertha', 'Carrie', 'Debbie',
       'Esther', 'Frieda', 'Alma', 'Becky', 'Cleo', 'Daisy', 'Ella',
       'Fifi', 'Gerda', 'Helene', 'Ilsa', 'Janice', 'Arlene', 'Beulah',
       'Cindy', 'Debra', 'Gracie', 'Hannah', 'Irene', 'Judith', 'Abby',
       'Donna', 'Frances', 'Hattie', 'Jenny', 'Inga', 'Celia', 'Ginny',
       'Helena', 'Isbell', 'Elena', 'Dorothy', 'Faith', 'Hallie', 'Inez',
       'Lois', 'Chloe', 'Doria', 'Fern', 'Ginger', 'Heidi', 'Candy',
       'Blanche', 'Camille', 'Eve', 'Francelia', 'Holly', 'Kara',
       'Laurie', 'Martha', 'Felice', 'Beth', 'Kristy', 'Laura', 'Alpha',
       

In [27]:
len(dupl_df)

294

In [28]:
# Remove the duplicates 
clean_df = clean_df.drop_duplicates(subset=['name','year'], keep='first')
clean_df.head()
clean_df.to_csv("Data/clean_df.csv")

In [29]:
len(clean_df)

769

### Add hurricane index column

In [30]:
hurricanes_df = clean_df
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [31]:
hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
51,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
111,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
172,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
246,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [32]:
hurricanes_df = hurricanes_df.reset_index(drop=True)
hurricanes_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...
764,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
765,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
766,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
767,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [33]:
# Start index from 1
hurricanes_df.index += 1
hurricanes_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [34]:
# Reset index
hurricanes_df = hurricanes_df.reset_index()
hurricanes_df.head()

Unnamed: 0,index,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [35]:
hurricanes_df = hurricanes_df.rename(columns={'index': 'hurricane_id'})
hurricanes_df.head()

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.0,-54.0,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.1,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950


In [36]:
hurricanes_df.to_csv("Data/00_Hurricane Data 2019_with_unique_IDs.csv")

In [37]:
# Get names of indexes for which column name is "Unnamed"
indexNames = hurricanes_df[hurricanes_df['name'] == "Unnamed" ].index
# Delete these row indexes from dataFrame
clean_df.drop(indexNames , inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [38]:
len(clean_df)

769

In [39]:
hurricanes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   hurricane_id       769 non-null    int64  
 1   date               769 non-null    int64  
 2   name               769 non-null    object 
 3   time               769 non-null    int64  
 4   status             769 non-null    object 
 5   latitude           769 non-null    object 
 6   longitude          769 non-null    object 
 7   max_wind           769 non-null    int64  
 8   air_pressure       769 non-null    int64  
 9   latitude_decimal   769 non-null    float64
 10  longitude_decimal  769 non-null    float64
 11  year               769 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 72.2+ KB


In [40]:
# Check negative wind speeds displayed in plotly
min_wind_val = main_df['max_wind'].min()
print(min_wind_val)

-99


In [41]:
# Check minimum values
main_df['max_wind']

0        35
1        40
2        45
3        50
4        50
         ..
29969    45
29970    40
29971    40
29972    40
29973    40
Name: max_wind, Length: 29974, dtype: int64

In [42]:
hurricanes_df[hurricanes_df['max_wind'] < 0] = 0
hurricanes_df['max_wind']

0      35
1      30
2      35
3      80
4      40
       ..
764    35
765    35
766    40
767    35
768    35
Name: max_wind, Length: 769, dtype: int64

In [43]:
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
764,765,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
765,766,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
766,767,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
767,768,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [44]:
len(hurricanes_df)

769

In [45]:
# Check negative wind speeds converted to 0
min_wind_val = hurricanes_df['max_wind'].min()
print(min_wind_val)

10


In [46]:
hurricanes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   hurricane_id       769 non-null    int64  
 1   date               769 non-null    int64  
 2   name               769 non-null    object 
 3   time               769 non-null    int64  
 4   status             769 non-null    object 
 5   latitude           769 non-null    object 
 6   longitude          769 non-null    object 
 7   max_wind           769 non-null    int64  
 8   air_pressure       769 non-null    int64  
 9   latitude_decimal   769 non-null    float64
 10  longitude_decimal  769 non-null    float64
 11  year               769 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 72.2+ KB


#### Merge clean_df_no_ids with hurricanes_df to get a master df with all data

In [48]:
clean_df_no_ids

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10,1950
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.00,-60.00,1950
...,...,...,...,...,...,...,...,...,...,...,...
29969,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019
29970,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019
29971,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019
29972,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019


In [49]:
# Filter columns from hurricanes_df before merge
temp_hurr_df = hurricanes_df[['name','year','hurricane_id']]
temp_hurr_df

Unnamed: 0,name,year,hurricane_id
0,Able,1950,1
1,Baker,1950,2
2,Charlie,1950,3
3,Dog,1950,4
4,Easy,1950,5
...,...,...,...
764,Nestor,2019,765
765,Olga,2019,766
766,Pablo,2019,767
767,Rebekah,2019,768


In [50]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
master_df = pd.merge(clean_df_no_ids, temp_hurr_df,  
                  how='left', left_on=['name','year'], right_on = ['name','year'])

In [54]:
master_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950,1
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950,1
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950,1
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950,1
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950,1


In [55]:
# Check negative wind speeds after merge
min_wind_val = master_df['max_wind'].min()
print(min_wind_val)

10


In [56]:
master_df.tail()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
25037,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019,769
25038,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019,769
25039,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019,769
25040,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.0,2019,769
25041,20191127,Sebastien,1200,EX,51.3N,2.1W,40,980,51.05,-2.02,2019,769


In [57]:
len(master_df)

25042

In [58]:
# Drop null values after merge
master_df = master_df.dropna()

In [59]:
len(master_df)

25042

In [60]:
master_df['hurricane_id'] = master_df['hurricane_id'].astype(int)
master_df

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950,1
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950,1
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950,1
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.00,-58.10,1950,1
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.00,-60.00,1950,1
...,...,...,...,...,...,...,...,...,...,...,...,...
25037,20191126,Sebastien,1200,EX,52.2N,9.3W,45,970,52.03,-9.05,2019,769
25038,20191126,Sebastien,1800,EX,52.2N,8.9W,40,972,52.03,-8.15,2019,769
25039,20191127,Sebastien,0,EX,51.8N,8.2W,40,974,51.13,-8.03,2019,769
25040,20191127,Sebastien,600,EX,51.4N,6.0W,40,976,51.07,-6.00,2019,769


In [61]:
hurricanes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   hurricane_id       769 non-null    int64  
 1   date               769 non-null    int64  
 2   name               769 non-null    object 
 3   time               769 non-null    int64  
 4   status             769 non-null    object 
 5   latitude           769 non-null    object 
 6   longitude          769 non-null    object 
 7   max_wind           769 non-null    int64  
 8   air_pressure       769 non-null    int64  
 9   latitude_decimal   769 non-null    float64
 10  longitude_decimal  769 non-null    float64
 11  year               769 non-null    int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 72.2+ KB


### Merge main df with costliest df on name and keep hurricane_id as foreign key

### Mac (Irina) Connection

In [62]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

which: no chromedriver in (/c/Users/Clay/anaconda3/envs/PythonData:/c/Users/Clay/anaconda3/envs/PythonData/Library/mingw-w64/bin:/c/Users/Clay/anaconda3/envs/PythonData/Library/usr/bin:/c/Users/Clay/anaconda3/envs/PythonData/Library/bin:/c/Users/Clay/anaconda3/envs/PythonData/Scripts:/c/Users/Clay/anaconda3/envs/PythonData/bin:/c/Users/Clay/anaconda3/Scripts/condabin:/c/Users/Clay/bin:/mingw64/bin:/usr/local/bin:/usr/bin:/usr/bin:/mingw64/bin:/usr/bin:/c/Users/Clay/bin:/c/Program Files (x86)/Common Files/Oracle/Java/javapath:/c/Program Files (x86)/Intel/iCLS Client:/c/Program Files/Intel/iCLS Client:/c/Windows/system32:/c/Windows:/c/Windows/System32/Wbem:/c/Windows/System32/WindowsPowerShell/v1.0:/c/Program Files (x86)/Intel/Intel(R) Management Engine Components/DAL:/c/Program Files/Intel/Intel(R) Management Engine Components/DAL:/c/Program Files (x86)/Intel/Intel(R) Management Engine Components/IPT:/c/Program Files/Intel/Intel(R) Management Engine Components/IPT:/c/Program Files (x86)

In [52]:
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# browser = Browser('chrome', **executable_path, headless=False)

### Windows (Clay) Connection

In [63]:
executable_path = {'executable_path': 'C:/Users/Clay/chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

### Amy Connection

In [None]:
#

### Neil Connection

In [None]:
#

# * * * MAKE SURE TO COMMENT IN / OUT YOUR CONNECTION / PATH * * *

### Dataset 1 - Web Scrape Wikipedia Most Costliest hurricanes

In [64]:
url = 'https://en.wikipedia.org/wiki/List_of_costliest_Atlantic_hurricanes'
# Irina Path
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# Clay Path
executable_path = {'executable_path': 'C:/Users/Clay/chromedriver.exe'}
# Amy Path
#
# Neil Path
#
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_costly_hurricanes = tables[0]
print("Read and save table")
browser.quit()
most_costly_hurricanes.head()

Open web browser
Read and save table


Unnamed: 0,Name,Nominal damage(Billions USD),Normalized damage(Billions USD),Season,Storm classificationat peak intensity,Areas affected,References
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane,Louisiana Mississippi The Bahamas United State...,[4][5]
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane,Texas Louisiana South America Central America ...,[5][6]
2,Maria,$90.0,,2017,Category 5 hurricane,Puerto Rico Lesser Antilles Greater Antilles C...,[7]
3,Irma,$77.2,$31.0,2017,Category 5 hurricane,Lesser Antilles Greater Antilles Caribbean Sea...,[5]
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane,The Caribbean United States East Coast Eastern...,[5][8]


In [65]:
# Create a copy of the df
renamed_cost_df = most_costly_hurricanes

In [66]:
# # Rename columns
renamed_cost_df = renamed_cost_df.rename(columns={'Name': 'name','Nominal damage(Billions USD)': 'damage_usd', 'Normalized damage(Billions USD)': 'norm_damage_usd', 
                'Season': 'year', 'Storm classificationat peak intensity': 'category', 'Areas affected':'states'})

In [67]:
# Drop Reference column
renamed_cost_df = renamed_cost_df.drop(columns=['References','states'])

In [68]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


In [69]:
# Save to the csv file
renamed_cost_df.to_csv("Data/01_most_costly_hurricanes_wiki_web.csv")

In [70]:
# Read csv file into df
renamed_cost_df = pd.read_csv('Data/01_most_costly_hurricanes_wiki_web.csv', index_col=[0])

In [71]:
renamed_cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,$125.0,$116.9,2005,Category 5 hurricane
1,Harvey,$125.0,$62.2,2017,Category 4 hurricane
2,Maria,$90.0,,2017,Category 5 hurricane
3,Irma,$77.2,$31.0,2017,Category 5 hurricane
4,Sandy,$68.7,$73.5,2012,Category 3 hurricane


In [72]:
len(renamed_cost_df)

61

## Prepare Cost data

In [73]:
cost_df = renamed_cost_df

In [74]:
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('$','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('<','')
cost_df["norm_damage_usd"] = cost_df["norm_damage_usd"].str.replace('>','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('$','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('<','')
cost_df["damage_usd"] = cost_df["damage_usd"].str.replace('>','')
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].astype(float)
cost_df['norm_damage_usd'] = cost_df['norm_damage_usd'].dropna()
cost_df['damage_usd'] = cost_df['damage_usd'].astype(float)
cost_df['damage_usd'] = cost_df['damage_usd'].dropna()
cost_df = cost_df.dropna()
len(cost_df) # 54 count

54

In [75]:
len(cost_df)

54

In [76]:
len(cost_df['name'].unique())

54

In [77]:
cost_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category
0,Katrina,125.0,116.9,2005,Category 5 hurricane
1,Harvey,125.0,62.2,2017,Category 4 hurricane
3,Irma,77.2,31.0,2017,Category 5 hurricane
4,Sandy,68.7,73.5,2012,Category 3 hurricane
5,Ike,38.0,35.2,2008,Category 4 hurricane


In [78]:
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
764,765,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
765,766,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
766,767,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
767,768,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [79]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_df = pd.merge(cost_df, hurricanes_df,  
                  how='left', left_on=['name','year'], right_on = ['name','year'])

In [80]:
len(new_df)

54

In [81]:
new_df.head()

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,Katrina,125.0,116.9,2005,Category 5 hurricane,532,20050823,1800,TD,23.1N,75.1W,30,1008,23.02,-75.02
1,Harvey,125.0,62.2,2017,Category 4 hurricane,724,20170816,600,LO,13.7N,45.8W,25,1013,13.12,-45.13
2,Irma,77.2,31.0,2017,Category 5 hurricane,725,20170830,0,TD,16.1N,26.9W,30,1008,16.02,-26.15
3,Sandy,68.7,73.5,2012,Category 3 hurricane,663,20121021,1800,LO,14.3N,77.4W,25,1006,14.05,-77.07
4,Ike,38.0,35.2,2008,Category 4 hurricane,586,20080901,600,TD,17.2N,37.0W,30,1006,17.03,-37.0


In [82]:
new_df = new_df.dropna()

In [84]:
# Sort merged df by norm_damage_usd column in descending order 
sorted_new_df = new_df.sort_values(by=['norm_damage_usd'], ascending=False)
sorted_new_df.head(54)

Unnamed: 0,name,damage_usd,norm_damage_usd,year,category,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,Katrina,125.0,116.9,2005,Category 5 hurricane,532,20050823,1800,TD,23.1N,75.1W,30,1008,23.02,-75.02
6,Andrew,27.3,106.0,1992,Category 5 hurricane,363,19920816,1800,TD,10.8N,35.5W,25,1010,10.13,-35.08
3,Sandy,68.7,73.5,2012,Category 3 hurricane,663,20121021,1800,LO,14.3N,77.4W,25,1006,14.05,-77.07
1,Harvey,125.0,62.2,2017,Category 4 hurricane,724,20170816,600,LO,13.7N,45.8W,25,1013,13.12,-45.13
4,Ike,38.0,35.2,2008,Category 4 hurricane,586,20080901,600,TD,17.2N,37.0W,30,1006,17.03,-37.0
5,Wilma,27.4,31.9,2005,Category 5 hurricane,544,20051015,1800,TD,17.6N,78.5W,25,1004,17.1,-78.08
2,Irma,77.2,31.0,2017,Category 5 hurricane,725,20170830,0,TD,16.1N,26.9W,30,1008,16.02,-26.15
9,Charley,16.9,26.9,2004,Category 4 hurricane,508,20040809,1200,TD,11.4N,59.2W,30,1010,11.07,-59.03
47,Camille,1.42,26.4,1969,Category 5 hurricane,162,19690814,0,TD,18.3N,79.7W,30,-999,18.05,-79.12
33,Agnes,2.1,26.0,1972,Category 1 hurricane,192,19720614,1200,TD,20.0N,89.0W,25,-999,20.0,-89.0


In [85]:
cost_df = sorted_new_df[['hurricane_id','name','year','norm_damage_usd','damage_usd']]
cost_df.head()

Unnamed: 0,hurricane_id,name,year,norm_damage_usd,damage_usd
0,532,Katrina,2005,116.9,125.0
6,363,Andrew,1992,106.0,27.3
3,663,Sandy,2012,73.5,68.7
1,724,Harvey,2017,62.2,125.0
4,586,Ike,2008,35.2,38.0


In [86]:
# Re-save to the csv file
renamed_cost_df.to_csv("Data/01_most_costly_hurricanes_wiki_web.csv")

## Dataset 2 - Deadliest hurricanes in the atlantic
#### https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes

In [87]:
url = 'https://en.wikipedia.org/wiki/List_of_deadliest_Atlantic_hurricanes'
# Irina Path
# executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
# Clay Path
executable_path = {'executable_path': 'C:/Users/Clay/chromedriver.exe'}
# Amy Path
#
# Neil Path
#
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)
time.sleep(3)
print("Open web browser")
tables = pd.read_html(url)
most_deadliest_hurricanes = tables[2]
print("Read and save table")
browser.quit()
most_deadliest_hurricanes.head()

Open web browser
Read and save table


Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs
0,San Marcos,"October 5–14, 1870",Category 3 hurricane,115 mph (185 km/h),959 hPa (28.32 inHg),"Cuba, Florida, Bahamas",,"800–2,000",[1]
1,Sea Islands,"August 15 – September 2, 1893",Category 3 hurricane,120 mph (195 km/h),954 hPa (28.17 inHg),"Georgia, South Carolina",,"1,000–2,000",
2,Chenier Caminada,"September 27 – October 5, 1893",Category 4 hurricane,135 mph (215 km/h),948 hPa (27.99 inHg),"Yucatán Peninsula, Louisiana, Mississippi",,"1,800–2,000",
3,San Ciriaco,"August 3 – September 4, 1899",Category 4 hurricane,150 mph (240 km/h),930 hPa (27.46 inHg),"Lesser Antilles, Puerto Rico, Eastern United S...",,3855,
4,Galveston,"August 27 – September 15, 1900",Category 4 hurricane,145 mph (230 km/h),936 hPa (27.64 inHg),"The Caribbean, Texas",,"8,000–12,000",


In [88]:
most_fatal_hurricanes = most_deadliest_hurricanes.head(22)

In [89]:
most_fatal_hurricanes.tail()

Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs
17,Mitch,"October 22 – November 5, 1998",Category 5 hurricane,180 mph (285 km/h),905 hPa (26.72 inHg),"Central America, Yucatán Peninsula, South Florida",,"11,374–19,000",[7][8][9]
18,Jeanne,"September 13–28, 2004",Category 3 hurricane,120 mph (195 km/h),950 hPa (28.05 inHg),"The Caribbean, Eastern United States",,3037,[5][10][11][12]
19,Katrina,"August 23–30, 2005",Category 5 hurricane,175 mph (280 km/h),902 hPa (26.64 inHg),"Bahamas, United States Gulf Coast",,"1,245–1,836",[13]
20,Stan,"October 1–5, 2005",Category 1 hurricane,80 mph (130 km/h),977 hPa (28.85 inHg),"Mexico, Central America",,1668,[5][14]
21,Maria,"September 16 – October 2, 2017",Category 5 hurricane,175 mph (280 km/h),908 hPa (26.81 inHg),"Lesser Antilles (particularly Dominica), Puert...",,3059,[15]


In [90]:
len(most_fatal_hurricanes)

22

In [133]:
# Create a copy of the df
fatal_df = most_fatal_hurricanes
len(fatal_df)

22

In [134]:
dates_active = fatal_df['Dates active']
dates_active

0                  October 5–14, 1870
1       August 15 – September 2, 1893
2      September 27 – October 5, 1893
3        August 3 – September 4, 1899
4      August 27 – September 15, 1900
5                  August 20–28, 1909
6                September 6–20, 1928
7      August 29 – September 17, 1930
8                September 6–13, 1931
9      October 30 – November 13, 1932
10                    June 4–18, 1934
11                October 18–27, 1935
12              September 21–30, 1955
13    September 26 – October 12, 1963
14              September 14–24, 1974
15      August 25 – September 8, 1979
16                November 8–21, 1994
17      October 22 – November 5, 1998
18              September 13–28, 2004
19                 August 23–30, 2005
20                  October 1–5, 2005
21     September 16 – October 2, 2017
Name: Dates active, dtype: object

In [135]:
dates_list = []
for i in range(len(dates_active)):
    split_str = dates_active[i].split(",")[-1]
    print(split_str)
    dates_list.append(int(split_str))
print(len(dates_list))

 1870
 1893
 1893
 1899
 1900
 1909
 1928
 1930
 1931
 1932
 1934
 1935
 1955
 1963
 1974
 1979
 1994
 1998
 2004
 2005
 2005
 2017
22


In [136]:
fatal_df['year'] = dates_list
fatal_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fatal_df['year'] = dates_list


Unnamed: 0,Name,Dates active,Saffir-Simpson Category,Sustainedwind speeds,Pressure,Areas affected,Damage(US$),Deaths,Refs,year
0,San Marcos,"October 5–14, 1870",Category 3 hurricane,115 mph (185 km/h),959 hPa (28.32 inHg),"Cuba, Florida, Bahamas",,"800–2,000",[1],1870
1,Sea Islands,"August 15 – September 2, 1893",Category 3 hurricane,120 mph (195 km/h),954 hPa (28.17 inHg),"Georgia, South Carolina",,"1,000–2,000",,1893
2,Chenier Caminada,"September 27 – October 5, 1893",Category 4 hurricane,135 mph (215 km/h),948 hPa (27.99 inHg),"Yucatán Peninsula, Louisiana, Mississippi",,"1,800–2,000",,1893
3,San Ciriaco,"August 3 – September 4, 1899",Category 4 hurricane,150 mph (240 km/h),930 hPa (27.46 inHg),"Lesser Antilles, Puerto Rico, Eastern United S...",,3855,,1899
4,Galveston,"August 27 – September 15, 1900",Category 4 hurricane,145 mph (230 km/h),936 hPa (27.64 inHg),"The Caribbean, Texas",,"8,000–12,000",,1900


In [137]:
# # Drop columns
# fatal_df = fatal_df.drop(columns=['Dates active','Saffir-Simpson Category',
#                                           'Sustainedwind speeds','Pressure','Areas affected','Damage(US$)','Refs'])

In [138]:
# Select columns 
fatal_df = fatal_df[['Name','Deaths','year']]

In [139]:
fatal_df = fatal_df.rename(columns={'Name': 'name', 'Deaths': 'deaths'})
fatal_df.head(30)

Unnamed: 0,name,deaths,year
0,San Marcos,"800–2,000",1870
1,Sea Islands,"1,000–2,000",1893
2,Chenier Caminada,"1,800–2,000",1893
3,San Ciriaco,3855,1899
4,Galveston,"8,000–12,000",1900
5,Monterrey,4000,1909
6,Okeechobee,4075,1928
7,San Zenon,"2,000–8,000",1930
8,Belize,"1,500–2,500",1931
9,Cuba,"2,500–3,107",1932


In [140]:
fatal_df.head()

Unnamed: 0,name,deaths,year
0,San Marcos,"800–2,000",1870
1,Sea Islands,"1,000–2,000",1893
2,Chenier Caminada,"1,800–2,000",1893
3,San Ciriaco,3855,1899
4,Galveston,"8,000–12,000",1900


In [141]:
fatal_df["deaths"] = fatal_df["deaths"].str.replace(',','')
fatal_df["deaths"] = fatal_df["deaths"].str.replace('–',',')
fatal_df.head()

Unnamed: 0,name,deaths,year
0,San Marcos,8002000,1870
1,Sea Islands,10002000,1893
2,Chenier Caminada,18002000,1893
3,San Ciriaco,3855,1899
4,Galveston,800012000,1900


In [142]:
fatal_list = fatal_df["deaths"]

In [143]:
num_avg = []
for i in range(len(fatal_list)):
    if "," in fatal_list[i]:
        num0 = fatal_list[i].split(",")[0]
        num1 = fatal_list[i].split(",")[1]
        num = (int(num1) + int(num0))/2
        num_avg.append(num)
    else: num_avg.append(fatal_list[i])
print(num_avg)

[1400.0, 1500.0, 1900.0, '3855', 10000.0, '4000', '4075', 5000.0, 2000.0, 2803.5, 2500.0, '2150', '1023', '7193', '8210', '2068', '1152', 15187.0, '3037', 1540.5, '1668', '3059']


In [144]:
fatal_df = fatal_df.drop(columns="deaths")

In [145]:
fatal_df['deaths'] = num_avg

In [146]:
fatal_df = fatal_df.head(22)

In [147]:
fatal_df.head()

Unnamed: 0,name,year,deaths
0,San Marcos,1870,1400
1,Sea Islands,1893,1500
2,Chenier Caminada,1893,1900
3,San Ciriaco,1899,3855
4,Galveston,1900,10000


In [148]:
len(fatal_df)

22

In [149]:
# Merge two dataframes on both 'name' and 'year' columns to avoid duplicated entries
new_fatal_merged_df = pd.merge(fatal_df, hurricanes_df,  how='left', 
                               left_on=['name', 'year'], right_on = ['name','year'])
new_fatal_merged_df.head(60)

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
0,San Marcos,1870,1400.0,,,,,,,,,,
1,Sea Islands,1893,1500.0,,,,,,,,,,
2,Chenier Caminada,1893,1900.0,,,,,,,,,,
3,San Ciriaco,1899,3855.0,,,,,,,,,,
4,Galveston,1900,10000.0,,,,,,,,,,
5,Monterrey,1909,4000.0,,,,,,,,,,
6,Okeechobee,1928,4075.0,,,,,,,,,,
7,San Zenon,1930,5000.0,,,,,,,,,,
8,Belize,1931,2000.0,,,,,,,,,,
9,Cuba,1932,2803.5,,,,,,,,,,


In [152]:
# Filter out unnamed hurricanes before 1975
fatal_df = new_fatal_merged_df[new_fatal_merged_df['year'] > 1973]
fatal_df.head()

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
14,Fifi-Orlene,1974,8210,,,,,,,,,,
15,David,1979,2068,249.0,19790825.0,1200.0,TD,11.7N,36.1W,25.0,1008.0,11.12,-36.02
16,Gordon,1994,1152,383.0,19941108.0,1200.0,TD,11.9N,82.3W,25.0,1009.0,11.15,-82.05
17,Mitch,1998,15187,435.0,19981022.0,0.0,TD,11.6N,76.1W,30.0,1002.0,11.1,-76.02
18,Jeanne,2004,3037,515.0,20040913.0,1800.0,TD,15.9N,60.0W,25.0,1010.0,15.15,-60.0


In [155]:
# Filter out unnamed hurricanes named Fifi-Orlene
fatal_df = fatal_df.drop(index = 14)
fatal_df.head()

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
15,David,1979,2068.0,249.0,19790825.0,1200.0,TD,11.7N,36.1W,25.0,1008.0,11.12,-36.02
16,Gordon,1994,1152.0,383.0,19941108.0,1200.0,TD,11.9N,82.3W,25.0,1009.0,11.15,-82.05
17,Mitch,1998,15187.0,435.0,19981022.0,0.0,TD,11.6N,76.1W,30.0,1002.0,11.1,-76.02
18,Jeanne,2004,3037.0,515.0,20040913.0,1800.0,TD,15.9N,60.0W,25.0,1010.0,15.15,-60.0
19,Katrina,2005,1540.5,532.0,20050823.0,1800.0,TD,23.1N,75.1W,30.0,1008.0,23.02,-75.02


In [156]:
len(fatal_df)

7

In [157]:
fatal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 15 to 21
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               7 non-null      object 
 1   year               7 non-null      int64  
 2   deaths             7 non-null      object 
 3   hurricane_id       7 non-null      float64
 4   date               7 non-null      float64
 5   time               7 non-null      float64
 6   status             7 non-null      object 
 7   latitude           7 non-null      object 
 8   longitude          7 non-null      object 
 9   max_wind           7 non-null      float64
 10  air_pressure       7 non-null      float64
 11  latitude_decimal   7 non-null      float64
 12  longitude_decimal  7 non-null      float64
dtypes: float64(7), int64(1), object(5)
memory usage: 784.0+ bytes


In [158]:
# Convert hurricane_id back to integer
fatal_df['hurricane_id'] = fatal_df['hurricane_id'].astype(int)
fatal_df.head()

Unnamed: 0,name,year,deaths,hurricane_id,date,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal
15,David,1979,2068.0,249,19790825.0,1200.0,TD,11.7N,36.1W,25.0,1008.0,11.12,-36.02
16,Gordon,1994,1152.0,383,19941108.0,1200.0,TD,11.9N,82.3W,25.0,1009.0,11.15,-82.05
17,Mitch,1998,15187.0,435,19981022.0,0.0,TD,11.6N,76.1W,30.0,1002.0,11.1,-76.02
18,Jeanne,2004,3037.0,515,20040913.0,1800.0,TD,15.9N,60.0W,25.0,1010.0,15.15,-60.0
19,Katrina,2005,1540.5,532,20050823.0,1800.0,TD,23.1N,75.1W,30.0,1008.0,23.02,-75.02


In [99]:
fatal_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 15 to 21
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               7 non-null      object 
 1   year               7 non-null      int64  
 2   deaths             7 non-null      object 
 3   hurricane_id       7 non-null      int64  
 4   date               7 non-null      float64
 5   time               7 non-null      float64
 6   status             7 non-null      object 
 7   latitude           7 non-null      object 
 8   longitude          7 non-null      object 
 9   max_wind           7 non-null      float64
 10  air_pressure       7 non-null      float64
 11  latitude_decimal   7 non-null      float64
 12  longitude_decimal  7 non-null      float64
dtypes: float64(6), int64(2), object(5)
memory usage: 784.0+ bytes


In [159]:
# Drop columns
fatal_df = fatal_df[['hurricane_id','name','year','deaths']]

In [160]:
fatal_df

Unnamed: 0,hurricane_id,name,year,deaths
15,249,David,1979,2068.0
16,383,Gordon,1994,1152.0
17,435,Mitch,1998,15187.0
18,515,Jeanne,2004,3037.0
19,532,Katrina,2005,1540.5
20,540,Stan,2005,1668.0
21,729,Maria,2017,3059.0


## Prepare plotly data for maximum winds

In [161]:
# Use master df to plot maximum winds
master_df.head()

Unnamed: 0,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year,hurricane_id
0,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950,1
1,19500812,Able,600,TS,17.7N,56.3W,40,-999,17.12,-56.05,1950,1
2,19500812,Able,1200,TS,18.2N,57.4W,45,-999,18.03,-57.07,1950,1
3,19500812,Able,1800,TS,19.0N,58.6W,50,-999,19.0,-58.1,1950,1
4,19500813,Able,0,TS,20.0N,60.0W,50,-999,20.0,-60.0,1950,1


In [162]:
# Groupby based on name and year and aggregate to find max winds
test_grouped_df = master_df.groupby(['name','year']).agg(
    max_wind=pd.NamedAgg(column='max_wind', aggfunc=max))
test_grouped_df = test_grouped_df.reset_index()
test_grouped_df.head()

Unnamed: 0,name,year,max_wind
0,Abby,1960,70
1,Abby,1964,60
2,Abby,1968,65
3,Able,1950,110
4,Able,1951,80


In [163]:
# Add column with name and year
name_year = []
for i in range(len(test_grouped_df)):
    name_year.append(f"{test_grouped_df['name'][i]}_{test_grouped_df['year'][i]}")
test_grouped_df['name_year'] = name_year
test_grouped_df.head()

Unnamed: 0,name,year,max_wind,name_year
0,Abby,1960,70,Abby_1960
1,Abby,1964,60,Abby_1964
2,Abby,1968,65,Abby_1968
3,Able,1950,110,Able_1950
4,Able,1951,80,Able_1951


In [164]:
len(test_grouped_df)

769

In [165]:
# Create df for pushing to postgresql
max_wind_plotly_df = test_grouped_df[['name','max_wind','name_year']]

In [167]:
# Sort the df in descending order
max_wind_plotly_df = max_wind_plotly_df.sort_values(by=['max_wind'], ascending=False)
max_wind_plotly_df

Unnamed: 0,name,max_wind,name_year
24,Allen,165,Allen_1980
281,Dorian,160,Dorian_2019
766,Wilma,160,Wilma_2005
454,Gilbert,160,Gilbert_1988
561,Irma,155,Irma_2017
...,...,...,...
381,Fifteen,30,Fifteen_2019
764,Two,30,Two_2014
758,Ten,30,Ten_2011
752,Sixteen,25,Sixteen_2008


# Cost by State DAMAGES in millions (1980-2020) 

In [208]:
cost_by_state_df = pd.read_csv('https://www.ncdc.noaa.gov/billions/state-cost-data.csv', skiprows=1)

In [209]:
cost_by_state_df.head()

Unnamed: 0,state,drought,flooding,freeze,severe storm,tropical cyclone,wildfire,winter storm
0,AK,0.0,0.0,0.0,0.0,0.0,1887.9,0.0
1,AL,5632.4,116.6,124.7,10259.9,18565.6,614.9,1845.0
2,AR,5447.8,3897.7,244.6,5032.4,323.1,0.0,488.0
3,AZ,766.4,493.2,0.0,4522.0,0.0,803.6,0.0
4,CA,9828.2,10958.0,12694.0,2908.3,0.0,63781.0,0.0


In [210]:
hurricane_related_cost = cost_by_state_df[['state', 'flooding', 'severe storm', 'tropical cyclone']]
hurricane_related_cost.head()

Unnamed: 0,state,flooding,severe storm,tropical cyclone
0,AK,0.0,0.0,0.0
1,AL,116.6,10259.9,18565.6
2,AR,3897.7,5032.4,323.1
3,AZ,493.2,4522.0,0.0
4,CA,10958.0,2908.3,0.0


In [211]:
hurricane_related_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   state             53 non-null     object 
 1   flooding          53 non-null     float64
 2   severe storm      53 non-null     float64
 3   tropical cyclone  53 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.8+ KB


In [212]:
hurricane_related_cost['flooding'] = hurricane_related_cost['flooding'].astype(int)
hurricane_related_cost['severe storm'] = hurricane_related_cost['severe storm'].astype(int)
hurricane_related_cost['tropical cyclone'] = hurricane_related_cost['tropical cyclone'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurricane_related_cost['flooding'] = hurricane_related_cost['flooding'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurricane_related_cost['severe storm'] = hurricane_related_cost['severe storm'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurricane_related_cost['tropi

In [213]:
hurricane_related_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   state             53 non-null     object
 1   flooding          53 non-null     int32 
 2   severe storm      53 non-null     int32 
 3   tropical cyclone  53 non-null     int32 
dtypes: int32(3), object(1)
memory usage: 1.2+ KB


In [214]:
hurricane_related_cost['total_damage'] = hurricane_related_cost['flooding'] + hurricane_related_cost['severe storm'] + hurricane_related_cost['tropical cyclone']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hurricane_related_cost['total_damage'] = hurricane_related_cost['flooding'] + hurricane_related_cost['severe storm'] + hurricane_related_cost['tropical cyclone']


In [215]:
hurricane_related_cost.head()

Unnamed: 0,state,flooding,severe storm,tropical cyclone,total_damage
0,AK,0,0,0,0
1,AL,116,10259,18565,28940
2,AR,3897,5032,323,9252
3,AZ,493,4522,0,5015
4,CA,10958,2908,0,13866


In [216]:
hurricane_related_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   state             53 non-null     object
 1   flooding          53 non-null     int32 
 2   severe storm      53 non-null     int32 
 3   tropical cyclone  53 non-null     int32 
 4   total_damage      53 non-null     int32 
dtypes: int32(4), object(1)
memory usage: 1.4+ KB


In [217]:
hurricane_related_cost = hurricane_related_cost.sort_values(by='total_damage', ascending=False)
hurricane_related_cost.head(10)

Unnamed: 0,state,flooding,severe storm,tropical cyclone,total_damage
44,US,150556,269086,954069,1373711
43,TX,12091,48767,194110,254968
8,FL,1423,3074,205563,210060
17,LA,17048,8051,152872,177971
38,PR,0,0,101548,101548
26,NC,56,5494,54860,60410
24,MS,3138,4142,51528,58808
33,NY,1696,2642,47308,51646
30,NJ,968,2757,34801,38526
23,MO,12614,17916,432,30962


In [218]:
cost_by_state_in_millions_df = hurricane_related_cost[['state','total_damage']]
cost_by_state_in_millions_df.head()

Unnamed: 0,state,total_damage
44,US,1373711
43,TX,254968
8,FL,210060
17,LA,177971
38,PR,101548


In [None]:
# Prepare cost_by_state_in_millions_df to send to SQL, ***EDIT ETL*** (Note for Clay)

## Push table to PostgreSQL

### !!! Please import QuickDBD-export.sql in the repo root folder file in pgAdmin
### !!! Run SQL file to generate the tables and reference links
### UNCOMMENT cell below to run the rest of the code

In [168]:
# Connect to the local database
connection_string = f'{username}:{password}@localhost:5432/hurricanes_db'
engine = create_engine(f'postgresql://{connection_string}')

In [169]:
hurricanes_df

Unnamed: 0,hurricane_id,date,name,time,status,latitude,longitude,max_wind,air_pressure,latitude_decimal,longitude_decimal,year
0,1,19500812,Able,0,TS,17.1N,55.5W,35,-999,17.02,-55.08,1950
1,2,19500818,Baker,1200,TD,12.0N,54.0W,30,-999,12.00,-54.00,1950
2,3,19500821,Charlie,1200,TS,11.1N,26.4W,35,-999,11.02,-26.07,1950
3,4,19500830,Dog,1800,HU,15.2N,55.6W,80,-999,15.03,-55.10,1950
4,5,19500901,Easy,600,TS,19.7N,83.2W,40,-999,19.12,-83.03,1950
...,...,...,...,...,...,...,...,...,...,...,...,...
764,765,20191017,Nestor,1200,DB,22.2N,95.7W,35,1007,22.03,-95.12,2019
765,766,20191025,Olga,1200,TS,24.7N,94.8W,35,1004,24.12,-94.13,2019
766,767,20191023,Pablo,1800,EX,40.0N,38.3W,40,1000,40.00,-38.05,2019
767,768,20191027,Rebekah,0,EX,40.5N,51.5W,35,1004,40.08,-51.08,2019


In [170]:
hurricanes_df.to_sql(name="hurricanes", con=engine, if_exists='append', index=False)

In [171]:
master_df.to_sql(name="master", con=engine, if_exists='append', index=False)

In [172]:
cost_df.to_sql(name="cost", con=engine, if_exists='append', index=False)

In [173]:
fatal_df.to_sql(name="fatalities", con=engine, if_exists='append', index=False)

In [174]:
max_wind_plotly_df.to_sql(name="maxwinds", con=engine, if_exists='append', index=False)

In [None]:
# Send cost_by_state_in_millions_df to SQL (Note for Clay)