In [1]:
%load_ext autoreload
%autoreload 2

import functions
import pandas as pd

In [2]:
url = "https://www.sharkattackfile.net/spreadsheets/GSAF5.xls"

sharks_df = functions.load_dataframe(url)
sharks_df

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,2025-01-11 00:00:00,2025.0,Provoked,USA,Hawaii,Off Haleiwa Boat Harbour Oahu,Diving,Male not stated was a dive tour worker,M,23,...,Not stated,Kevin McMurray Trackingsharks.com,,,,,,,,
1,2025-01-02 00:00:00,2025.0,Unprovoked,New Caledonia,Grande Terre,Islet of Kendek near Koumac,Spearfishing,Robert Cuewapuru,M,40,...,Reportedly Tiger or Bull shark,Johannes Marchand Todd Smith,,,,,,,,
2,2025-01-02 00:00:00,2025.0,Unprovoked,Australia,South Australia,Granites Beach near Westall Streaky Bay,Surfing,Lance Appleby,M,28,...,Great White Shart estimated 4.8m 16 ft,Glen Folkard: Simon De Marchi News.com.au: The...,,,,,,,,
3,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Peppino Fappani,M,69,...,Reportedly a Tiger Shark,Todd Smith : Kevin McMurray Trackingsharks .co...,,,,,,,,
4,2024-12-29 00:00:00,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Gianluca Di Gioia,M,48,...,Reportedly a Tiger Shark,Todd Smith : Kevin McMurray Trackingsharks .co...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6985,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,...,,"H. Taunton; N. Bartlett, p. 234",ND-0005-RoebuckBay.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0005,ND.0005,6.0,,
6986,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,...,,"H. Taunton; N. Bartlett, pp. 233-234",ND-0004-Ahmun.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0004,ND.0004,5.0,,
6987,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,...,,"F. Schwartz, p.23; C. Creswell, GSAF",ND-0003-Ocracoke_1900-1905.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0003,ND.0003,4.0,,
6988,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,...,,"The Sun, 10/20/1938",ND-0002-JulesPatterson.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0002,ND.0002,3.0,,


In [3]:
# Cleaning General
# Dataframed based on the relevant columns
sharks_df = sharks_df.drop(columns=["Sex", "Age", "Name", "Injury", "Source", "pdf",'href',"href formula",'Case Number','Case Number.1','original order','Unnamed: 21','Unnamed: 22','Time'])

# Make columns lower case for readibility.
sharks_df.rename(columns={ col: col.rstrip().replace(" ", "_").lower() for col in sharks_df.columns}, inplace=True)


In [4]:

# Remove null values
column_list = ['date','type','country','state','location','activity','fatal_y/n','species']
functions.remove_null(sharks_df, column_list, "N/A")

# Check Amount of null values
sharks_df.isna().sum() #Output: no null values


Null values in column 'date' have been replaced with value: N/A.
Null values in column 'type' have been replaced with value: N/A.
Null values in column 'country' have been replaced with value: N/A.
Null values in column 'state' have been replaced with value: N/A.
Null values in column 'location' have been replaced with value: N/A.
Null values in column 'activity' have been replaced with value: N/A.
Null values in column 'fatal_y/n' have been replaced with value: N/A.
Null values in column 'species' have been replaced with value: N/A.


date         0
year         2
type         0
country      0
state        0
location     0
activity     0
fatal_y/n    0
species      0
dtype: int64

In [5]:
# Cleaning Fatal Y/N
# Removing invalid values for fatal_y/n column and setting it only for 'N', 'Y', or 'N/A'
sharks_df = functions.clean_y_n(sharks_df, "fatal_y/n")

# check_char(sharks_df['fatal_y/n'])
sharks_df['fatal_y/n'].value_counts()

fatal_y/n
N      4879
Y      1469
N/A     642
Name: count, dtype: int64

In [6]:
# Cleaning Dates
# Date column conversion

column_list = ['date']
# Converting 'date' column to datetime and converting invalid dates to NaT
sharks_df['date'] = pd.to_datetime(sharks_df['date'], errors='coerce')
# Setting a dataframe for invalid dates equal to N/A
invalid_dates = sharks_df[sharks_df['date'].isna()]
#s etting a variable and applying function to clean up NAs
cleaned_date_column = functions.remove_null(sharks_df, column_list,'N/A')
# Appling it to date column
sharks_df['date'] = cleaned_date_column
# Extracting the month in a new column
sharks_df['month'] = sharks_df['date'].apply(lambda x: x.month if isinstance(x, pd.Timestamp) else x)
# Determining the season of the year based on month column
sharks_df['season'] = sharks_df['month'].apply(
    lambda x: 'Spring' if isinstance(x, int) and 3 <= x <= 5 else
              ('Summer' if isinstance(x, int) and 6 <= x <= 8 else
               ('Fall' if isinstance(x, int) and 9 <= x <= 11 else
                'Winter' if isinstance(x, int) else 'N/A')))
# Formatting the date column to YYYY-MM-DD
sharks_df['date'] = sharks_df['date'].apply(lambda x: x.strftime('%Y-%m-%d') if isinstance(x, pd.Timestamp) else x)
sharks_df['season'].value_counts()

display(sharks_df)


Null values in column 'date' have been replaced with value: N/A.


Unnamed: 0,date,year,type,country,state,location,activity,fatal_y/n,species,month,season
0,2025-01-11,2025.0,Provoked,USA,Hawaii,Off Haleiwa Boat Harbour Oahu,Diving,N,Not stated,1,Winter
1,2025-01-02,2025.0,Unprovoked,New Caledonia,Grande Terre,Islet of Kendek near Koumac,Spearfishing,Y,Reportedly Tiger or Bull shark,1,Winter
2,2025-01-02,2025.0,Unprovoked,Australia,South Australia,Granites Beach near Westall Streaky Bay,Surfing,Y,Great White Shart estimated 4.8m 16 ft,1,Winter
3,2024-12-29,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,N,Reportedly a Tiger Shark,12,Winter
4,2024-12-29,2024.0,Unprovoked,Egypt,North of Marsa Alam,Red Sea,SCUBA Diving,Y,Reportedly a Tiger Shark,12,Winter
...,...,...,...,...,...,...,...,...,...,...,...
6985,,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,Y,,,
6986,,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Y,,,
6987,,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Y,,,
6988,,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Y,,,


In [7]:
# Cleaning the Year
sharks_df['year'] = sharks_df['year'].fillna(0)
sharks_df['year'] = sharks_df['year'].astype(int)

sharks_df = sharks_df.loc[1970 <= sharks_df["year"]]

# display(sharks_df.tail(50))

In [8]:
# Cleaning the Shark Species

species_mapping_regex = {
    r'.*white.*': 'White Shark',
    r'.*tiger.*': 'Tiger Shark',
    r'.*bull.*': 'Bull Shark',
    r'.*wobbegong.*': 'Wobbegon Shark',
    r'.*nurse.*': 'Nurse Shark',
    r'.*blacktip.*': 'Blacktip Shark',
    r'.*blue.*': 'Blue Shark',
    r'.*raggedtooth.*': 'Raggedtooth Shark',
    r'.*mako.*': 'Mako Shark',
    r'.*lemon.*': 'Lemon Shark',
    r'.*zambesi.*': 'Zambesi Shark',
    r'.*sandshark.*': 'Sandshark',
    r'.*bronze whaler.*': 'Bronze Whaler Shark',
    r'.*caribbean reef.*': 'Caribbean Reef Shark',
    r'.*grey reef.*': 'Grey Reef Shark',
    r'.*dusky.*': 'Dusky Shark',
    r'.*horn.*': 'Horn Shark',
    r'.*sevengill.*': 'Sevengill Shark',
    r'.*carpet.*': 'Carpet Shark',
    r'.*galapagos.*': 'Galapagos Shark',
    r'.*porbeagle.*': 'Porbeagle Shark',
    r'.*hammerhead.*': 'Hammerhead Shark',
    r'.*copper.*': 'Copper Shark',
    r'.*leopard.*': 'Leopard Shark',
    r'.*shovelnose.*': 'Shovelnose Shark',
    r'.*spinner.*': 'Spinner Shark',
    r'.*whaler.*': 'Whaler Shark',
    r'.*bonita.*': 'Bonita Shark',
    r'.*angel.*': 'Angel Shark',
    r'.*thresher.*': 'Thresher Shark',

    r'.*questionable.*': "N/A",
    r'.*unconfirmed.*': "N/A",
    r'.*not confirmed.*': "N/A",
    r'.*invalid.*': "N/A",
     r'.*unknown.*': "N/A",

    r".*\s[a-z0-9\'\"\)\.\?\[\]]+$": "N/A",
    r'na[\'\"\)\.\?\[\]]+': "N/A",
    r'.*n/a.*': "N/A",

    # Empty string and whitespace patterns
    r'^$': "N/A",               # Empty string
    r'^\s+$': "N/A",            # String containing only whitespace
    r'^["\']\s*["\']$': "N/A",  # Quoted empty string or whitespace
}

sharks_df = functions.clean_with_regex_dictionary(sharks_df, "species", species_mapping_regex)
sharks_df = sharks_df[sharks_df["species"].map((sharks_df["species"].value_counts() > 3))]


In [9]:
# Clean attack type (Provoked / Unprovoked)

def attack_type(x):
    x = x.rstrip()
    if x == "Provoked" or x == "Unprovoked":
        return x
    else: 
        return "N/A"

sharks_df["type"] = sharks_df["type"].apply(attack_type)
display(sharks_df["type"].value_counts())


type
Unprovoked    3204
N/A            552
Provoked       302
Name: count, dtype: int64

In [10]:
# #Location and County Clean-up

# #Check for commas in the 'location' column before splitting
# has_comma = sharks_df['location'].str.contains(',', na=False)

# # Split only rows with commas
# sharks_df.loc[has_comma, ['location', 'county']] = sharks_df.loc[has_comma, 'location'].str.split(',', n=1, expand=True)

# sharks_df

# # county_empty = ["county"]
# # functions.remove_null(sharks_df, county_empty, "N/A")

# # # Clean up by stripping extra spaces
# # sharks_df['location'] = sharks_df['location'].str.strip()
# # sharks_df['county'] = sharks_df['county'].str.strip()

# # # Fill NaN values in 'county' for rows without commas
# # sharks_df['county'] = sharks_df['county'].fillna('Unknown')
# # sharks_df['county'] = sharks_df['county'].replace('Unknown', 'N/A')

# # sharks_df[['location','county']].value_counts().head(50)

# # sharks_df['county'].unique()

# Cleaning county and location
# Check for commas in the 'location' column
has_comma = sharks_df['location'].str.contains(',', na=False)

# Split rows with commas into 'location' and 'county'
split_locations = sharks_df.loc[has_comma, 'location'].str.split(',', n=1, expand=True)

# Assign split results back to the DataFrame
sharks_df.loc[has_comma, 'location'] = split_locations[0].str.strip()  # Part before the comma
sharks_df.loc[has_comma, 'county'] = split_locations[1].str.strip()  # Part after the comma

# For rows without commas, set 'county' to 'N/A'
sharks_df['county'] = sharks_df['county'].fillna('N/A')

# Filter out rows where 'location' is empty, null, or 'N/A'
sharks_df = sharks_df[
    sharks_df['location'].notna() & (sharks_df['location'] != '') & (sharks_df['location'] != 'N/A')
]

# Display the first few rows of the cleaned DataFrame
print(sharks_df[['location', 'county']].head(50))

                                             location            county
0                       Off Haleiwa Boat Harbour Oahu               N/A
1                         Islet of Kendek near Koumac               N/A
2             Granites Beach near Westall Streaky Bay               N/A
3                                             Red Sea               N/A
4                                             Red Sea               N/A
5                    Humpy Island Great Keppel Island               N/A
6      Curtis Island Near Gladstone QLD central coast               N/A
7   Greenough River Mouth Beach Cape Burney 12km s...               N/A
8                                      Khao Lak Beach               N/A
9                                     Chatham Islands               N/A
10                                           Maafushi               N/A
11                       Sand Piles Waiehu Beach Park               N/A
12                                 Whitsunday Islands           

In [19]:
# Get the counts of each unique value in the 'location' column
location_counts = sharks_df['location'].value_counts()
location_counts

location
New Smyrna Beach      28
Nahoon                 8
Florida Keys           8
Myrtle Beach           7
Mossel Bay             7
                      ..
Pillar Point           1
Makenat                1
White Plains Beach     1
Pacific State          1
Shark tank             1
Name: count, Length: 1022, dtype: int64

In [11]:
# Country clean-up

# Strip whitespace and standardize capitalization
sharks_df['country'] = sharks_df['country'].str.strip().str.title()

# Replace common errors or inconsistencies
country_replacements = {
    'Usa': 'USA',
    'U.S.A.': 'USA',
    'United States Of America': 'USA',
    'Aus': 'Australia',
    'Uk': 'United Kingdom',
    'South Africa ': 'South Africa'
}

#sharks_df['country'] = sharks_df['country'].replace(country_replacements)

# Handle missing values
sharks_df['country'] = sharks_df['country'].fillna('Unknown')

sharks_df['country'].nunique()

114

In [12]:
# Filtering
# Filtering by Countries with fequency greater than 50
sharks_df = functions.filter_column_by_min_count(sharks_df, "country", 50)

# Filtering Species with value N/A
sharks_df = functions.filter_column_by_value(sharks_df, "species", "N/A")

sharks_df

Unnamed: 0,level_0,index,date,year,type,country,state,location,activity,fatal_y/n,species,month,season,county
0,1,1,2025-01-02,2025,Unprovoked,New Caledonia,Grande Terre,Islet of Kendek near Koumac,Spearfishing,Y,Tiger Shark,1,Winter,
1,2,2,2025-01-02,2025,Unprovoked,Australia,South Australia,Granites Beach near Westall Streaky Bay,Surfing,Y,White Shark,1,Winter,
2,6,9,2024-11-19,2024,Unprovoked,New Zealand,Bay of Waitangi,Chatham Islands,Diving,Y,White Shark,11,Fall,
3,7,12,2024-10-31,2024,Unprovoked,Australia,Queensland,Whitsunday Islands,Snorkeling,N,Tiger Shark,10,Fall,
4,8,13,2024-10-25,2024,Unprovoked,Usa,Florida,Bathtub Beach,Surfing,N,Tiger Shark,10,Fall,Martin County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,3307,4035,1971-04-11,1971,Unprovoked,South Africa,Western Cape Province,Buffels Bay,Swimming,Y,White Shark,4,Spring,
1297,3309,4040,1971-03-30,1971,Unprovoked,New Zealand,South Island,Dunedin,Surfing,N,White Shark,3,Spring,
1298,3310,4042,1971-01-01,1971,Unprovoked,Usa,California,Franklin Point,Free diving,N,White Shark,1,Winter,San Mateo County
1299,3311,4043,1970-01-01,1971,Unprovoked,Australia,South Australia,Southport,Surfing,N,White Shark,1,Winter,


In [13]:
# Exploring

# States sorted by most common species
most_common_species_by_state = sharks_df.groupby(['species'])['state'].agg(lambda x: x.value_counts().index[0]).sort_values().head(30)
most_common_species_by_state = sharks_df.groupby(['species'])['state'].value_counts().groupby(level=0).head(1).sort_values(ascending=False)

df_most_common = pd.DataFrame(most_common_species_by_state)
display(df_most_common)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
species,state,Unnamed: 2_level_1
White Shark,California,147
Tiger Shark,Hawaii,101
Blacktip Shark,Florida,62
Bull Shark,Florida,55
Nurse Shark,Florida,28
Spinner Shark,Florida,23
Raggedtooth Shark,Eastern Cape Province,23
Wobbegon Shark,New South Wales,19
Bronze Whaler Shark,New South Wales,17
Lemon Shark,Florida,14


In [14]:
# sharks_df = sharks_df.groupby(["species", 'fatal_y/n'])['fatal_y/n'].count().sort_values(ascending=False)

In [15]:
# sharks_df[sharks_df['fatal_y/n'] == "Y"].sort_values("species").groupby(["species"])['fatal_y/n'].value_counts().sort_values(ascending=False)

In [16]:
# sharks_df[sharks_df['fatal_y/n'] == "N"].sort_values("species").groupby(["species"])['fatal_y/n'].value_counts().sort_values(ascending=False)

In [17]:
# Order Shark Species by Fatality count

(sharks_df.groupby("species")['fatal_y/n']
.value_counts()
.unstack(fill_value=0)
.assign(Y_N_ratio=lambda x: (x['Y'] / x['N']).round(3))
.sort_values('N', ascending=False)
.assign(total=lambda x: x['Y'] + x['N'])
.sort_values('total', ascending=False)
.drop('N/A', axis=1))

fatal_y/n,N,Y,Y_N_ratio,total
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
White Shark,402,92,0.229,494
Tiger Shark,172,38,0.221,210
Bull Shark,133,16,0.12,149
Blacktip Shark,104,0,0.0,104
Bronze Whaler Shark,54,1,0.019,55
Nurse Shark,51,0,0.0,51
Raggedtooth Shark,40,0,0.0,40
Wobbegon Shark,33,0,0.0,33
Lemon Shark,26,0,0.0,26
Mako Shark,24,1,0.042,25


In [18]:
#Non fatal shark occurrences by season and country
non_fatal_df = sharks_df[sharks_df['fatal_y/n'] == 'N']  # Filter for non-fatal incidents
non_fatal_group = non_fatal_df.groupby(['month', 'country']).size().reset_index(name='non_fatal_count')
non_fatal_sorted = non_fatal_group.sort_values(by='non_fatal_count', ascending=False).head(30)
#fatal shark occurrences by season and country
fatal_df = sharks_df[sharks_df['fatal_y/n'] == 'Y']  # Filter for fatal incidents
fatal_group = fatal_df.groupby(['month', 'country']).size().reset_index(name='fatal_count')
fatal_sorted = fatal_group.sort_values(by='fatal_count', ascending=False).head(30)
#Fatal risk by month and country
fatal_risk_df = pd.merge(non_fatal_sorted, fatal_sorted, on=['month', 'country'], how='outer').fillna(0)
fatal_risk_df['total'] = fatal_risk_df['fatal_count'] + fatal_risk_df['non_fatal_count']
fatal_risk_df['fatal_ratio_%'] = round(fatal_risk_df['fatal_count'] / fatal_risk_df['non_fatal_count'],4) * 100
fatal_risk_df = fatal_risk_df.loc[fatal_risk_df['month'] != 'N/A']
fatal_risk_df = fatal_risk_df.loc[fatal_risk_df['fatal_ratio_%'] != float('inf')]
fatal_risk_df.sort_values(by=['fatal_ratio_%','month'], ascending=False)
#Average shark incidents per year in a given month
avg_shark_sight = sharks_df.groupby(['month', 'year','country']).size().reset_index(name='count')
avg_shark_attack_df = avg_shark_sight.groupby(['month','country'])['count'].mean().reset_index(name='average_incidents_per_year')
avg_shark_attack_df['average_incidents_per_year'] = avg_shark_attack_df['average_incidents_per_year'].round(2)
fatal_risk_df = pd.merge(fatal_risk_df, avg_shark_attack_df, on=['month', 'country'], how='left')
fatal_risk_df.sort_values(by=['total','month'], ascending=False)

Unnamed: 0,month,country,non_fatal_count,fatal_count,total,fatal_ratio_%,average_incidents_per_year
18,8,Usa,80.0,4.0,84.0,5.0,2.1
16,7,Usa,80.0,2.0,82.0,2.5,2.56
23,10,Usa,76.0,3.0,79.0,3.95,2.26
20,9,Usa,64.0,7.0,71.0,10.94,1.97
13,6,Usa,60.0,0.0,60.0,0.0,1.94
0,1,Australia,41.0,6.0,47.0,14.63,1.52
12,5,Usa,40.0,0.0,40.0,0.0,1.64
9,4,Usa,34.0,2.0,36.0,5.88,1.64
26,12,Australia,26.0,7.0,33.0,26.92,1.59
21,10,Australia,30.0,3.0,33.0,10.0,1.48
