In [1]:
# PropNa stands for 'Proportion of NaN'
# I like how it matches the in-built DropNa function name :)

def propna(df):
    
    """
    Quick check the proportion of entries in a given DataFrame that is NaN.
    Single argument: (dataframe)
    Returns: Dictionary {[Column Name] : [Percentage of Nan (in %)]}
    """
    
    total_nans = df.isna().sum().to_dict()
    propna = [round((nans / df.shape[0] * 100), 2) for nans in total_nans.values()]
    total_nans.update(dict(zip(total_nans.keys(), propna)))
    print('Percentage of data that is NaN (in %)')
    return total_nans

propna?

[1;31mSignature:[0m [0mpropna[0m[1;33m([0m[0mdf[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Quick check the proportion of entries in a given DataFrame that is NaN.
Single argument: (dataframe)
[1;31mFile:[0m      c:\users\lenovo\appdata\local\temp\ipykernel_6804\2986362422.py
[1;31mType:[0m      function

In [3]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_excel("https://www.sharkattackfile.net/spreadsheets/GSAF5.xls")

In [6]:
data.head(3)

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,2024-09-16 00:00:00,2024.0,Unprovoked,Morocco,Southern Morocco,West of Dakhla,Swimming - jumped off yacht,German Tourist,F,30,...,Reportedly a Great White,Andy Currie: Moroccan World News: Sky News,,,,,,,,
1,2024-08-26 00:00:00,2024.0,Unprovoked,Jamaica,Montego Bay,Falmouth,Spearfishing,Jahmari Reid,M,16,...,Reportedly Tiger Shark,Todd Smith: Daily Mail UK: Sky News: People .com,,,,,,,,
2,2024-08-06 00:00:00,2024.0,Unprovoked,Belize,Gulf of Honduras,Central America,Swimming - Diving,Annabelle Carlson,F,15,...,Reef shark 6ft (1.8m),Kevin McMurray Tracking Sharks.com New York Po...,,,,,,,,


In [7]:
data.shape

(6970, 23)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6970 entries, 0 to 6969
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            6970 non-null   object 
 1   Year            6968 non-null   float64
 2   Type            6952 non-null   object 
 3   Country         6920 non-null   object 
 4   State           6488 non-null   object 
 5   Location        6405 non-null   object 
 6   Activity        6384 non-null   object 
 7   Name            6750 non-null   object 
 8   Sex             6391 non-null   object 
 9   Age             3975 non-null   object 
 10  Injury          6935 non-null   object 
 11  Unnamed: 11     6408 non-null   object 
 12  Time            3443 non-null   object 
 13  Species         3838 non-null   object 
 14  Source          6951 non-null   object 
 15  pdf             6799 non-null   object 
 16  href formula    6794 non-null   object 
 17  href            6796 non-null   o

In [9]:
data.nunique()

Date              6021
Year               261
Type                12
Country            234
State              914
Location          4534
Activity          1596
Name              5708
Sex                  8
Age                245
Injury            4100
Unnamed: 11         12
Time               423
Species           1693
Source            5326
pdf               6789
href formula      6784
href              6776
Case Number       6777
Case Number.1     6775
original order    6797
Unnamed: 21          1
Unnamed: 22          2
dtype: int64

In [10]:
# Check proportions of NaN values in the data
propna(data)

Percentage of data that is NaN (in %)


{'Date': 0.0,
 'Year': 0.03,
 'Type': 0.26,
 'Country': 0.72,
 'State': 6.92,
 'Location': 8.11,
 'Activity': 8.41,
 'Name': 3.16,
 'Sex': 8.31,
 'Age': 42.97,
 'Injury': 0.5,
 'Unnamed: 11': 8.06,
 'Time': 50.6,
 'Species ': 44.94,
 'Source': 0.27,
 'pdf': 2.45,
 'href formula': 2.53,
 'href': 2.5,
 'Case Number': 2.47,
 'Case Number.1': 2.48,
 'original order': 2.45,
 'Unnamed: 21': 99.99,
 'Unnamed: 22': 99.97}

In [11]:
# hl_df stands for 'High Level DataFrame'
# We can use this DataFrame for analysis on a higher level, like checking countries, years, sex of the victims and their activies
# We can create another dataframe for more specific analysis, like months and states...

hl_df = data.drop(columns=[
    'Date',
    'State',
    'Location',
    'Name', 
    'Unnamed: 11',
    'Time',
    'Case Number',
    'Case Number.1', 
    'Source', 
    'pdf', 
    'href formula', 
    'href', 
    'original order', 
    'Unnamed: 21', 
    'Unnamed: 22'])

In [12]:
hl_df.rename(columns={
    'Year' : 'year',
    'Type' : 'type',
    'Country' : 'country',
    'Activity' : 'activity',
    'Sex' : 'sex',
    'Age' : 'age',
    'Injury' : 'injury',
    'Species ' : 'size',
    'Case Number' : 'date'
}, inplace=True)

In [13]:
col_order = [
    'year',
    'country',
    'sex',
    'age',
    'activity',
    'injury',
    'type',
    'size'
]

hl_df = hl_df[col_order]

In [14]:
propna(hl_df)
# Age and Species are the ones with more NaN
# Since age is numerical, maybe we can fill it in with the mean or median (or bfil/ffil)

Percentage of data that is NaN (in %)


{'year': 0.03,
 'country': 0.72,
 'sex': 8.31,
 'age': 42.97,
 'activity': 8.41,
 'injury': 0.5,
 'type': 0.26,
 'size': 44.94}

In [15]:
hl_df.sample(3)

Unnamed: 0,year,country,sex,age,activity,injury,type,size
3907,1974.0,HONG KONG,M,,Freedom swimming,FATAL,Unprovoked,
3149,1993.0,TONGA,M,,Sea disaster,Foot bitten,Sea Disaster,1.5 m [5'] shark
6293,1895.0,USA,M,,Fishing,Arm bitten,Unprovoked,


In [16]:
# List of species used as reference can be retrieved in the following url:
# https://www.floridamuseum.ufl.edu/shark-attacks/maps/world-interactive/
# I will use loc() to find all mentions of these species in the dataframe and standardize their entry

In [17]:
# I did A LOT of homework to address different names to the same species...
# Scientific names were retrieved through Google Search and Wikipedia

clean_species = {
    r'(?i)^.*white shark.*$': 'white shark', # Carcharodon carcharias
    r'(?i)^.*requiem.*$': 'requiem sharks', # Order Carcharhiniformes:
                                             # contains bull shark, lemon shark, blacktip shark and whitetip reef
    r'(?i)^.*wobbeg.*$': 'wobbegong', # Family Orectolobidae, contains 12 species. These are small (~1m), carpet sharks
    r'(?i)^.*carpet.*$' : 'wobbegong', 
    r'(?i)^.*blue pointer.*$': 'blue pointer', # Isurus oxyrinchus
    r'(?i)^.*shortfin.*$': 'blue pointer',
    r'(?i)^.*mako.*$' : 'blue pointer',
    r'(?i)^.*blue whaler.*$' : 'blue shark', # Prionace glauca
    r'(?i)^.*blue shark.*$' : 'blue shark',
    r'(?i)^.*blue nose.*$' : 'blue shark',
    r'(?i)^.*blue-nose.*$' : 'blue shark',
    r'(?i)^.*sand.*$': 'sand tiger', # Carcharias taurus
    r'(?i)^.*grey nurse.*$' : 'sand tiger',
    r'(?i)^.*ragged.*$' : 'sand tiger',
    r'(?i)^.*hammerhead.*$': 'hammerhead', # Family Sphyrnidae
    # r'(?i)^.*tiger.*$': 'tiger shark' #,
    r'(?i)^.*bull.*$': 'bull shark', # Carcharhinus leucas
    r'(?i)^.*leucas.*$' : 'sevengill shark',
    r'(?i)^.*zamb.*$' : 'bull shark',
    # Sometimes just zambi, or zambesi, or zambezi
    r'(?i)^.*blacktip reef.*$': 'blacktip reef', # Carcharhinus melanopterus
    r'(?i)^.*blacktip "reef".*$': 'blacktip reef',
    r'(?i)^.*blacktail reef.*$': 'blacktip reef',
    r'(?i)^.*blacktip shark.*$': 'blacktip shark', # Carcharhinus limbatus
    r'(?i)^.*blackfin.*$' : 'blacktip shark',
    r'(?i)^.*black-tipped.*$' : 'blacktip shark',
    r'(?i)^.*grey reef.*$': 'grey reef', # Carcharhinus amblyrhynchos
    r'(?i)^.*oceanic white.*$': 'whitetip shark', # Carcharhinus longimanus
    r'(?i)^.*whitetip shark.*$': 'whitetip shark',
    r'(?i)^.*lemon.*$': 'lemon shark', # Negaprion brevirostris
    r'(?i)^.*bronze.*$': 'copper shark', # Carcharhinus brachyurus
    r'(?i)^.*copper.*$' : 'copper shark',
    r'(?i)^.*narrowtooth.*$' : 'copper shark',
    r'(?i)^.*whaler shark.*$' : 'copper shark',
    r'(?i)^.*spinner.*$': 'spinner shark', # Carcharhinus brevipinna
    r'(?i)^.*cookie.*$': 'cookiecutter shark', # Isistius brasiliensis
    r'(?i)^.*nurse shark.*$': 'nurse shark', # Ginglymostoma cirratum
    r'(?i)^.*sickle.*$': 'sicklefin shark', # Negaprion acutidens
    r'(?i)^.*sharptooth.*$': 'sicklefin shark',
    r'(?i)^.*caribbean reef.*$': 'caribbean reef', # Carcharhinus perezi
    r'(?i)^.*seven.*$' : 'sevengill shark', # Notorynchus cepedianus
    r'(?i)^.*broadnose.*$' : 'sevengill shark',
    r'(?i)^.*7-gill.*$' : 'sevengill shark',
    r'(?i)^.*dog.*$' : 'dog shark',
    r'(?i)^.*bask.*$' : 'basking shark', # Cetorhinus maximus
    r'(?i)^.*whale shark.*$' : 'whale shark', # Rhyncodon typus
    r'(?i)^.*galapagos shark.*$' : 'galapagos shark', # Carcharhinus galapagensis
    r'(?i)^.*salmon.*$' : 'salmon shark', # Lamna ditropis
    r'(?i)^.*dusky.*$' : 'dusky shark', # Carcharhinus obscurus
    r'(?i)^.*obscur.*$' : 'dusky shark',
    r'(?i)^.*angel.*$' : 'angel shark',
    r'(?i)^.*questionable.*$' : 'questionable',
    r'(?i)^.*unconf.*$': 'shark involvement unconfirmed',
    r'(?i)^.*invalid.*$': 'invalid',
}

hl_df.replace(to_replace=clean_species, regex=True, inplace=True)

In [18]:
# We'll have tou courate tiger sharks separately because of sand tigers
tiger_shark = {
    r'(?i)^.*tiger shark.*$' : 'tiger shark'
}

hl_df['size'] = hl_df['size'].replace(tiger_shark, regex=True)

In [19]:
hl_df['size'].value_counts()

size
white shark                                                         699
tiger shark                                                         292
bull shark                                                          257
sand tiger                                                          144
Shark involvement prior to death was not confirmed                  105
                                                                   ... 
2.4 m to 3 m [8' to 10'] grey colored shark                           1
Shark involvement not confirmed, injury may be due to a stingray      1
Species unidentified                                                  1
60 cm [2'] captive shark                                              1
"A pack of sharks"                                                    1
Name: count, Length: 535, dtype: int64

In [20]:
species_size = {
    'white shark': 'big',
    'requiem sharks': 'medium',
    'wobbegong': 'small',
    'blue pointer': 'medium',
    'blue shark': 'medium',
    'sand tiger': 'big',
    'hammerhead': 'big',
    'bull shark': 'big',
    'blacktip reef': 'medium',
    'blacktip shark': 'medium',
    'grey reef': 'medium',
    'whitetip shark': 'medium',
    'lemon shark': 'big',
    'copper shark': 'medium',
    'spinner shark': 'medium',
    'cookiecutter shark': 'small',
    'nurse shark': 'medium',
    'sicklefin shark': 'medium',
    'caribbean reef': 'medium',
    'sevengill shark': 'medium',
    'dog shark': 'small',
    'basking shark': 'big',
    'whale shark': 'big',
    'galapagos shark': 'medium',
    'salmon shark': 'big',
    'dusky shark': 'big',
    'angel shark': 'medium',
}

hl_df['size'].replace(to_replace=species_size, regex=True, inplace=True)

In [21]:
hl_df['size'].value_counts()

size
big                                                                 1220
medium                                                               543
tiger shark                                                          292
Shark involvement prior to death was not confirmed                   105
invalid                                                              103
                                                                    ... 
3.5' to 4.5' shark                                                     1
2.4 m to 3 m [8' to 10'] grey colored shark                            1
Shark involvement not confirmed, injury may be due to a stingray       1
Species unidentified                                                   1
"A pack of sharks"                                                     1
Name: count, Length: 513, dtype: int64

In [22]:
shark_sizes_feet = {
    "1' shark" : 'small',
    "2' shark" : 'small',
    "3' shark" : 'small',
    "4' shark" : 'small',
    "5' shark" : 'small',
    "6' shark" : 'small',
    "7' shark" : 'medium',
    "8' shark" : 'medium',
    "9' shark" : 'medium',
    "10' shark" : 'medium',
    "11' shark" : 'medium',
    "12' shark" : 'medium',
    "13' shark" : 'medium',
    "14' shark" : 'medium',
    "15' shark" : 'medium',
    "16' shark" : 'medium',
}

hl_df['size'].replace(to_replace=shark_sizes_feet, regex=True, inplace=True)

In [23]:
hl_df['size'].value_counts()

size
big                                                                 1220
medium                                                               587
tiger shark                                                          292
small                                                                230
Shark involvement prior to death was not confirmed                   105
                                                                    ... 
3 m to 3.7 m [10' to 12'] shark                                        1
3.5' to 4.small                                                        1
2.4 m to 3 m [8' to 10'] grey colored shark                            1
Shark involvement not confirmed, injury may be due to a stingray       1
"A pack of sharks"                                                     1
Name: count, Length: 495, dtype: int64

In [24]:
shark_sizes_feet_2 = {
    r"(?i)^.*1' .*$" : 'small',
    r"(?i)^.*2' .*$" : 'small',
    r"(?i)^.*3' .*$" : 'small',
    r"(?i)^.*4' .*$" : 'small',
    r"(?i)^.*5' .*$" : 'small',
    r"(?i)^.*6' .*$" : 'small',
    r"(?i)^.*small.*$" : 'small',
    r"(?i)^.*7' .*$" : 'medium',
    r"(?i)^.*8' .*$" : 'medium',
    r"(?i)^.*9' .*$" : 'medium',
    r"(?i)^.*10' .*$" : 'medium',
    r"(?i)^.*11' .*$" : 'medium',
    r"(?i)^.*12' .*$" : 'medium',
    r"(?i)^.*13' .*$" : 'medium',
    r"(?i)^.*14' .*$" : 'medium',
    r"(?i)^.*15' .*$" : 'medium',
    r"(?i)^.*16' .*$" : 'medium',
     r"(?i)^.*16' .*$" : 'medium',
    r"(?i)^.*17' .*$" : 'big',
    r"(?i)^.*18' .*$" : 'big',
    r"(?i)^.*19' .*$" : 'big',
    r"(?i)^.*20' .*$" : 'big',
    r"(?i)^.*21' .*$" : 'big',
    r"(?i)^.*22' .*$" : 'big',
    r"(?i)^.*23' .*$" : 'big',
}

hl_df['size'].replace(to_replace=shark_sizes_feet_2, regex=True, inplace=True)

In [30]:
hl_df['size'].value_counts()

size
big                                                                 1226
medium                                                               607
small                                                                528
tiger shark                                                          292
Shark involvement prior to death was not confirmed                   105
                                                                    ... 
1 m  shark                                                             1
Shark involvement not confirmed, injury may be due to a stingray       1
Species unidentified                                                   1
60 cm [2'] captive shark                                               1
"A pack of sharks"                                                     1
Name: count, Length: 366, dtype: int64

In [38]:
shark_sizes_meters = {
    r"(?i)^.*1 m.*$" : 'small',
    r"(?i)^.*2 m.*$" : 'small',
    r"(?i)^.*3 m.*$" : 'medium',
    r"(?i)^.*4 m.*$" : 'medium',
    r"(?i)^.*5 m.*$" : 'medium',
    r"(?i)^.*6 m.*$" : 'big',
    r"(?i)^.*7 m.*$" : 'big',
    r"(?i)^.*8 m.*$" : 'big',
    r"(?i)^.*9 m.*$" : 'big',
    r"(?i)^.*10 m.*$" : 'big',
    r"(?i)^.*11 m.*$" : 'big',
    r"(?i)^.*12 m.*$" : 'big',
    r"(?i)^.*13 m.*$" : 'big',
    r"(?i)^.*14 m.*$" : 'big',
    r"(?i)^.*large.*$" : 'big'
    
}

hl_df['size'].replace(to_replace=shark_sizes_meters, regex=True, inplace=True)

In [39]:
hl_df['size'].value_counts()

size
big                                                                                          1363
medium                                                                                        795
small                                                                                         637
tiger shark                                                                                   292
Shark involvement prior to death was not confirmed                                            105
                                                                                             ... 
 reef shark, 1.8m                                                                               1
Starry smoothhound shark, 1m                                                                    1
Reported by media as shark attack, but shark involvement prior to death was not confirmed       1
Shark involvement not confirmed; thought to be a barracuda bite                                 1
"A pack of shar

In [40]:
shark_sizes_meters_2 = {
    r"(?i)^.*1m.*$" : 'small',
    r"(?i)^.*2m.*$" : 'small',
    r"(?i)^.*3m.*$" : 'medium',
    r"(?i)^.*4m.*$" : 'medium',
    r"(?i)^.*5m.*$" : 'medium',
    r"(?i)^.*6m.*$" : 'big',
    r"(?i)^.*7m.*$" : 'big',
    r"(?i)^.*8m.*$" : 'big',
    r"(?i)^.*9m.*$" : 'big',
    r"(?i)^.*10m.*$" : 'big',
    r"(?i)^.*11m.*$" : 'big',
    r"(?i)^.*12m.*$" : 'big',
    r"(?i)^.*13m.*$" : 'big',
    r"(?i)^.*14m.*$" : 'big',
    r"(?i)^.*large.*$" : 'big'
    
}

hl_df['size'].replace(to_replace=shark_sizes_meters_2, regex=True, inplace=True)

In [41]:
hl_df['size'].value_counts()

size
big                                                                           1365
medium                                                                         807
small                                                                          653
tiger shark                                                                    292
Shark involvement prior to death was not confirmed                             105
                                                                              ... 
Gray reef shark                                                                  1
24" to 30" shark                                                                 1
Shark involvement not confirmed, injury may have been caused by a bluefish       1
Whitetip reef shark                                                              1
"A pack of sharks"                                                               1
Name: count, Length: 189, dtype: int64

In [42]:
extras = {
    r"(?i)^.*unknown.*$" : 'size unknown',
    r"(?i)^.*small.*$" : 'small shark (<2 m)',
    r"(?i)^.*medium.*$" : 'medium shark (<5 m)',
    r"(?i)^.*big*$" : 'large shark (>5 m)',
    r"(?i)^.*involvement*$" : 'shark involvement not confirmed',
    r"(?i)^.*tiger shark.*$" : 'large shark (>5 m)',
    r"(?i)^.*unconfirmed.*$" : 'shark involvement not confirmed',
    r"(?i)^.*not confirmed.*$" : 'shark involvement not confirmed',
    r"(?i)^.*no shark.*$" : 'shark involvement not confirmed',
    r"(?i)^.*not a shark.*$" : 'shark involvement not confirmed',
    r"(?i)^.*not specified.*$" : 'size unknown',
    r"(?i)^.*unidentified.*$" : 'size unknown',
    r"(?i)^.*unknown.*$" : 'size unknown',
    r"(?i)^.*reef shark.*$" : 'medium shark (<5 m)'
}


hl_df['size'].replace(to_replace=extras, regex=True, inplace=True)

In [43]:
hl_df['size'].value_counts()

size
large shark (>5 m)                 1657
medium shark (<5 m)                 826
small shark (<2 m)                  653
shark involvement not confirmed     333
invalid                             103
                                   ... 
18" to 36" shark                      1
24" to 30" shark                      1
Goblin shark, 4.2'                    1
80 kg shark                           1
"A pack of sharks"                    1
Name: count, Length: 142, dtype: int64

In [44]:
overview = hl_df['size'].value_counts().to_dict()
overview

{'large shark (>5 m)': 1657,
 'medium shark (<5 m)': 826,
 'small shark (<2 m)': 653,
 'shark involvement not confirmed': 333,
 'invalid': 103,
 'questionable': 82,
 'size unknown': 24,
 ' ': 7,
 'juvenile shark': 3,
 '18" to 24" shark': 3,
 '3 sharks': 3,
 'Shark involvement not cofirmed': 2,
 '200-lb shark': 2,
 'Silvertip shark': 2,
 'Juvenile shark': 2,
 'Shark involvement probable': 2,
 'Shark involvement doubtful': 2,
 '\xa0 ': 2,
 "5'shark": 2,
 'Blacktip': 2,
 '2 sharks': 2,
 '20 to 30kg shark': 2,
 'A pack of 6 sharks': 2,
 '6 ft shark': 2,
 '"Attacked by a number of sharks"': 1,
 '4.5  m [14\'9"] shark': 1,
 'Bonita sharkk, 200-lb': 1,
 '1,100-lb shark': 1,
 '136-kg [300-lb] shark': 1,
 'Porbeagle shark': 1,
 '"grey shark"': 1,
 'Reportedly a Great White': 1,
 "3.7 [12'] shark": 1,
 '43" shark': 1,
 '8 sharks': 1,
 '193-lb shark': 1,
 '"A long thin brown-colored shark"': 1,
 '60 cm  shark ': 1,
 '36"  shark': 1,
 "Two shark's teeth recovered from canoe": 1,
 '80-lb hooked sha

In [45]:
further_cleaning = {
    'large shark (>5 m)': 1,
 'medium shark (<5 m)': 2,
 'small shark (<2 m)': 3,
 'shark involvement not confirmed': 5,
 'invalid': 6,
 'questionable': 5,
 'size unknown': 4,
 'juvenile shark': 4,
 '18" to 24" shark': 3,
 '3 sharks': 4,
 'Shark involvement not cofirmed': 5,
 '200-lb shark': 1,
 'Silvertip shark': 2,
 'Juvenile shark': 4,
 'Shark involvement probable': 5,
 'Shark involvement doubtful': 5,
 '\xa0 ': 4,
 "5'shark": 3,
 'Blacktip': 3,
 '2 sharks': 4,
 '20 to 30kg shark': 3,
 'A pack of 6 sharks': 4,
 '6 ft shark': 3,
 '"Attacked by a number of sharks"': 4,
 '4.5  m [14\'9"] shark': 2,
 'Bonita sharkk, 200-lb': 2,
 '1,100-lb shark': 1,
 '136-kg [300-lb] shark': 2,
 'Porbeagle shark': 2,
 '"grey shark"': 2,
 'Reportedly a Great White': 1,
 "3.7 [12'] shark": 2,
 '43" shark': 3,
 '8 sharks': 4,
 '193-lb shark': 2,
 '"A long thin brown-colored shark"': 1,
 '60 cm  shark ': 2,
 '36"  shark': 3,
 "Two shark's teeth recovered from canoe": 4,
 '80-lb hooked shark': 3,
 'Reported as  a shark bite but toothmarks appear to be those of a dolphin ': 5,
 '20 kg shark': 3,
 'Several sharks involved': 4,
 'Shark seen feeding on turtle scraps thrown overboard prior to incident.': 4,
 '8-lb shark': 3,
 '100-lb shark': 2,
 '"a little shark"': 3,
 'Tooth fragments recovered from hull': 4,
 '9-foot shark': 2,
 '"grey-colored shark"': 3,
 "Shovelnose shark, 5'": 2,
 'Shovelnose guitarfish, adult male ': 2,
 '"The fish was harpooned, dried, and presented to the sailor, who went round Europe exhibiting it  It was said to be 20 feet long.': 1,
 'Description of shark does not ring true': 5,
 '234-lb shark': 1,
 'Remains recovered from shark caught days later': 4,
 '"Shark caught later"': 4,
 'Identified as C. gangeticus by Dr. J. Fayrer': 2,
 '68" shark': 3,
 'Blue or porbeagle shark': 2,
 'a school of sharks': 4,
 'Said to involve 2 sharks': 4,
 'Shark was said to “have a very rough ½”-thick skin”': 4,
 '5\'7" shark': 3,
 '13\'10" shark': 2,
 '2 days later a 600-lb shark was caught 100 yards from the site': 1,
 '70 kg shark': 2,
 'Fishermen recovered partial remains from shark a week later': 4,
 'Remains recovered 5 days later': 4,
 'Allegedly a 33-foot shark': 1,
 'Remains recovered from 3 sharks': 4,
 '18-foot shark': 1,
 "15'": 2,
 '"A pack of 6 sharks"': 4,
 '"whiptail shark" (thresher shark?)': 1,
 'Dooley believed his Injury was caused by stingray (Dasyatidae family)': 5,
 "7 shark's teeth found embedded in the woodwork of the boat": 4,
 '"a school of sharks"': 4,
 '650-lb shark': 1,
 '500-lb shark': 2,
 'Reported as a shark attack, the story was a hoax': 5,
 '300-kg [662-lb] shark': 1,
 '2 sharks involved': 4,
 'Shark involvement prior to death still to be determined': 5,
 '3- to 4-foot shark': 3,
 "Silky shark, 6.5'": 2,
 '3+ m shark': 2,
 'Shark involvement highly doubtful': 5,
 'Reported as shark attacks but injuries caused by toadfish': 5,
 'Reported as shark bite but injury caused by stingray': 5,
 'shark pup': 3,
 '1+ m shark': 3,
 'Injuries not caused by a shark': 5,
 '8" shark': 3,
 '5.5 ft shark': 3,
 '8 ft shark': 2,
 "Tope shark, 6'": 3,
 'White xhark': 1,
 'Epaulette shark': 3,
 'While shark': 1,
 'Bu.ll': 1,
 'Tiger  shark?': 1,
 'Authorities report injury caused bya barracuda': 5,
 '18" - 23" Horn shark': 3,
 'Shark involvement not confirmes': 5,
 'Wfite shark': 1,
 'Broze whaler?': 2,
 'Great White': 1,
 '6ft shark': 3,
 'Goblin shark': 2,
 'Cow shark': 3,
 "Porbeagle shark, 7'": 2,
 '"a young shark"': 3,
 'Not authenticated': 5,
 'Considered a "Doubtful" incident': 5,
 '270 kg shark': 2,
 'C. maculpinnis or C. limbatus': 2,
 "13', 400-lb thresher shark": 2,
 '"gray shark"': 2,
 '30-kg [66-lb] shark': 3,
 "6', 100-lb shark": 3,
 '40 to 50 sharks attacked survivors in the water': 4,
 '150-lb shark': 3,
 'Shark involvement prior to death could not be determined': 5,
 'Leopard shark': 2,
 '"juvenile shark"': 3,
 'Thresher shark': 1,
 'Soupfin shark': 3,
 '200 to 300 kg shark': 2,
 'Miami, a 60 cm blacktip  shark and two 60 cm bamboo catsharks': 3,
 "60 cm [2'] captive shark": 3,
 '15 cm to 20 cm [6" to 8"] bite diameter just below left knee': 3,
 "106 cm [3.5']  shark": 3,
 '60 cm [23.6"] blind or brown shark': 3,
 '"black tipped" shark': 3,
 '18" to 36" shark': 3,
 '24" to 30" shark': 3,
 "Goblin shark, 4.2'": 2,
 '80 kg shark': 3,
 '"A pack of sharks"': 4
}

hl_df['size'].replace(further_cleaning, inplace=True)

In [46]:
categories = {
    1 : 'large shark (>5 m)',
    2 : 'medium shark (<5 m)',
    3 : 'small shark (<2 m)',
    4 : 'size unknown',
    5 : 'questionable',
    6 : 'invalid'
}

hl_df['size'].replace(categories, inplace=True)

In [47]:
hl_df['size'].value_counts()

size
large shark (>5 m)     1677
medium shark (<5 m)     859
small shark (<2 m)      698
questionable            435
invalid                 103
size unknown             58
                          7
                          1
Name: count, dtype: int64

In [51]:
hl_df['size'].fillna('size unknown', inplace=True)

In [56]:
hl_df['size'].value_counts()

size
size unknown           3190
large shark (>5 m)     1677
medium shark (<5 m)     859
small shark (<2 m)      698
questionable            435
invalid                 103
                          7
                          1
Name: count, dtype: int64

In [59]:
hl_df.eq(" ").sum()

year        0
country     0
sex         0
age         2
activity    1
injury      1
type        0
size        7
dtype: int64

In [60]:
final_touches = {
    ' ' : 'size unknown',
    '  ' : 'size unknown'
}

hl_df['size'].replace(final_touches, inplace=True)

In [67]:
hl_df['size'].value_counts()

size
size unknown           3198
large shark (>5 m)     1677
medium shark (<5 m)     859
small shark (<2 m)      698
questionable            435
invalid                 103
Name: count, dtype: int64

In [73]:
activities = hl_df['activity'].value_counts().to_dict()
activities

{'Surfing': 1124,
 'Swimming': 984,
 'Fishing': 489,
 'Spearfishing': 384,
 'Wading': 176,
 'Bathing': 163,
 'Diving': 142,
 'Snorkeling': 129,
 'Standing': 113,
 'Scuba diving': 84,
 'Body boarding': 63,
 'Body surfing': 50,
 'Swimming ': 47,
 'Boogie boarding': 42,
 'Kayaking': 39,
 'Treading water': 33,
 'Free diving': 33,
 'Pearl diving': 32,
 'Fell overboard': 32,
 'Windsurfing': 20,
 'Boogie Boarding': 17,
 'Walking': 17,
 'Floating': 16,
 'Canoeing': 16,
 'Fishing ': 16,
 'Shark fishing': 15,
 'Surf fishing': 14,
 'Surf-skiing': 13,
 'Playing': 13,
 'Surf skiing ': 12,
 'Freediving': 12,
 'Scuba Diving': 12,
 'Surf skiing': 12,
 'Rowing': 12,
 'sand tiger': 11,
 'Fishing for sharks': 11,
 'Paddle boarding': 11,
 'Sponge diving': 10,
 'Sailing': 9,
 'Kayak Fishing': 9,
 'Sitting on surfboard': 9,
 'Fell into the water': 9,
 'Diving for trochus': 9,
 'Stand-Up Paddleboarding': 9,
 'Spearfishing ': 8,
 'Sea disaster': 8,
 'Shipwreck': 7,
 'Kayak fishing': 7,
 'Floating on his back'

In [77]:
act_replace = {
    r"(?i)^.*surf.*$" : 'surfing',
    r"(?i)^.*swim.*$" : 'swimming',
    r"(?i)^.*div.*$" : 'diving',
    r"(?i)^.*fishing.*$" : 'fishing',
    r"(?i)^.*wading.*$" : 'bathing',
    r"(?i)^.*plane.*$" : 'watercraft',
    r"(?i)^.*air.*$" : 'watercraft',
    r"(?i)^.*boa.*$" : 'boat sports',
    r"(?i)^.*wreck.*$" : 'watercraft',
    r"(?i)^.*sink.*$" : 'watercraft',
    r"(?i)^.*sunk.*$" : 'watercraft',
    r"(?i)^.*sank.*$" : 'watercraft',
    r"(?i)^.*bath.*$" : 'bathing',
    r"(?i)^.*disaster.*$" : 'watercraft',
    r"(?i)^.*snorkel.*$" : 'diving',
    r"(?i)^.*stand.*$" : 'bathing',
    r"(?i)^.*walk.*$" : 'bathing',
    r"(?i)^.*kayak.*$" : 'boat sports',
    r"(?i)^.*canoe.*$" : 'boat sports',
    r"(?i)^.*sailing.*$" : 'boat sports',
    r"(?i)^.*fell.*$" : 'watercraft',
    r"(?i)^.*ding.*$" : 'watercraft',
    r"(?i)^.*net.*$" : 'fishing',
    r"(?i)^.*fish.*$" : 'fishing',
    r"(?i)^.*float.*$" : 'bathing',
    r"(?i)^.*playing.*$" : 'bathing',
    r"(?i)^.*splash.*$" : 'bathing',
    r"(?i)^.*jump.*$" : 'bathing',
    r"(?i)^.*watercraft.*$" : 'watercraft',
    r"(?i)^.*skii.*$" : 'boat sports',
    r"(?i)^.*rowi.*$" : 'boat sports',
    r"(?i)^.*film.*$" : 'diving',
    r"(?i)^.*clam.*$" : 'bathing',
    r"(?i)^.*sit.*$" : 'bathing',
    r"(?i)^.*shells.*$" : 'bathing',
    r"(?i)^.*lobster.*$" : 'bathing',
    r"(?i)^.*drift.*$" : 'watercraft',
    r"(?i)^.*sup.*$" : 'boat sports',
    r"(?i)^.*murder.*$" : 'others',
    r"(?i)^.*suicide.*$" : 'others',
    r"(?i)^.*pad.*$" : 'boat sports',
    r"(?i)^.*shark.*$" : 'teasing shark',
    r"(?i)^.*scull.*$" : 'boat sports',
    r"(?i)^.*founder.*$" : 'watercraft',
    r"(?i)^.*harp.*$" : 'watercraft',
    
}

hl_df['activity'].replace(to_replace=act_replace, regex=True, inplace=True)

In [80]:
hl_df['activity'].value_counts()

activity
surfing                                       1434
fishing                                       1244
swimming                                      1229
diving                                         752
bathing                                        649
                                              ... 
Arsinoe, a French tanker                         1
Collecting marine specimens                      1
Searching for remains of  Dr. Marais             1
Lying in 2 feet of water                         1
A group of survivors on a raft for 17-days       1
Name: count, Length: 180, dtype: int64

In [81]:
overview_2 = hl_df['activity'].value_counts().to_dict()
overview_2

{'surfing': 1434,
 'fishing': 1244,
 'swimming': 1229,
 'diving': 752,
 'bathing': 649,
 'boat sports': 492,
 'watercraft': 275,
 'teasing shark': 96,
 'others': 12,
 'sand tiger': 11,
 'Dangling feet in the water': 5,
 'Lifesaving drill': 4,
 'Unknown': 3,
 'Crabbing': 3,
 'Escaping from Alacatraz': 3,
 '.': 3,
 'Washing his feet': 2,
 'Ocean racing': 2,
 'Shrimping': 2,
 'Washing': 2,
 'Cruising': 2,
 'Washing clothes': 1,
 'Taking wife to beach & about 1 m from the shore': 1,
 'Cutter capsized': 1,
 'Knocked into the water': 1,
 'The coastwise steamer San Basilio capsized in a typhoon': 1,
 'No details': 1,
 'Reported swept away by waves while gathering opihi': 1,
 'Dismantling cable buoys of the cable ship All America': 1,
 'Dry shelling': 1,
 'Attempting to rescue drowning man': 1,
 'The steamer Tahiti collided with the ferry Greycliffe': 1,
 'Defecating in water beneath the docks': 1,
 'Retrieving meat from a cage in the water': 1,
 'On December 28, 1908, an earthquake, followed 

In [84]:
clean_activities = {
    'surfing': 1,
 'fishing': 5,
 'swimming': 3,
 'diving': 4,
 'watercraft': 7,
 'boat sports': 6,
 'bathing': 2,
 'teasing shark': 8,
 'others': 9,
 'Dangling feet in the water': 2,
 'Lifesaving drill': 3,
 'Escaping from Alacatraz': 7,
 'Unknown': 9,
 'Crabbing': 2,
 'Washing': 2,
 'Shrimping': 2,
 'Cruising': 2,
 'Washing his feet': 2,
 'Ocean racing': 6,
 'Dry shelling': 9,
 'Washing horses': 9,
 'Retrieving meat from a cage in the water': 9,
 'The steamer Tahiti collided with the ferry Greycliffe': 7,
 'Attempting to rescue drowning man': 9,
 'The British steamer Caribbee foundered': 7,
 'Dismantling cable buoys of the cable ship All America': 7,
 'The 168-ton Belmore foundered in heavy seas': 7,
 'Knocked into the water': 9,
 'Cutter capsized': 9,
 'Taking wife to beach & about 1 m from the shore': 2,
 'Washing clothes': 9,
 'The schooner Tahitienne foundered in a hurricane': 7,
 '3-masted steel barque Glenbank foundered during a cyclone': 7,
 'Reported swept away by waves while gathering opihi': 9,
 'On December 28, 1908, an earthquake, followed by tsunamis, destroyed coastal towns in Silcily and southern Italy, killing more than 100,000 people': 7,
 'The coastwise steamer San Basilio capsized in a typhoon': 7,
 'Yacht of Michael Howell capsized': 7,
 'Catching a turtle': 9,
 'Hurricane & Tidal Wave': 7,
 'Underwater photography': 4,
 '90 European civilians, many women & children, were placed on the deck of a Japanese submarine that submerged when it was well offshore': 7,
 'Parachuted into Pacific': 9,
 'The 6711-ton American freighter & troop transport Cape San Juan was torpedoed by the Japanese submarine I-21': 7,
 'B-24 crashed during a search mission. Survivors in raft for 47 days ': 7,
 'ship torpedoed 400 miles off the African coas. Man was clinging to hatch cover': 7,
 'Thrown from destroyer when shell hit': 7,
 'The 6015-ton British ship Empire Avocet was torpedoed by the German submarine U-125. ': 7,
 'Esso Bolivar was torpedoed & shelled by the German submarine U-126': 7,
 'Torpedoed & burning British  light cruiser with a crew of 450 men': 7,
 'SS Ethel Skakel foundered in Central America Hurricane of 1941': 7,
 'Washed off freighter Huncliff by a freak wave': 2,
 'Scooping prawns': 4,
 'She was on a ship that was torpedoes & was in the water awaiting rescue': 7,
 'The schooner Elizabeth, bound from Bluefields, Nicaragua to the river port of San Carlos foundered': 7,
 'Tzenny Chandris, a Greek freighter laden with scrap iron, foundered in heavy weather': 7,
 'Disappeared 11 days earlier, probable homicide victim': 9,
 'Catching crabs': 2,
 'Harpooning turtles': 5,
 '"Crossing the river"': 2,
 'Hooking into a whale': 5,
 'A junk foundered': 7,
 'Painting a ship': 9,
 'Coming ashore on a hawser': 6,
 'Reaching for life preserver': 7,
 'Trying to catch a wounded bird': 2,
 'Deserting the bark Nazarene': 3,
 'ship William Penn grounded & broke apart': 7,
 'Hilo': 9,
 'Unknown, but it was said to be the "First known attack in Sydney Harbour"': 7,
 'American schooner Orator capsized  ': 7,
 'The cutter Francis Adams foundered': 7,
 'Washing a dog': 2,
 'HBM Magpie foundered in a squall': 7,
 'Washing himself': 2,
 'Ship lay at anchor & man was working on its rudder': 2,
 'Swmming': 3,
 'Washing his pig in preparation for a religious ceremony': 2,
 'A dhow capsized': 7,
 'yachtsman in a zodiac': 7,
 'Crossing river on a raft': 6,
 'Burning of the S.S. Missouri': 7,
 'Washed off raft': 7,
 'The steamship Bonnie Dundee lost in collision': 7,
 'yachting accident': 7,
 'Trailing hand in the water': 2,
 'Hunting seals': 5,
 'Lifesaving exhibition': 9,
 'Abandoning burning steamship Don Juan': 7,
 'HMS Victoria collided with the HMS Camperdown': 7,
 'His balloon crashed in the harbor': 7,
 'Attempting to rescue shipmate': 9,
 'Fleeing across a river': 9,
 'Parachuted from balloon': 7,
 'Cleaning the side of a ship': 9,
 'The steamships Thingvalla and Geiser collided': 7,
 'Probabable drowning': 9,
 'The Dwarka foundered': 7,
 'Crossing the river mouth': 9,
 'The passenger ship Kapuna was run down the ore carrier Ada Melmore': 7,
 'British ship Macedon was thrown on her beam ends by a sudden squall': 7,
 'Oystering': 5,
 'Leicester abandoned in a hurricane': 7,
 'Climbing back on ship': 9,
 'No details': 9,
 'Defecating in water beneath the docks': 2,
 "Crouching in 2' of water": 2,
 'Swept out to sea by the tsunami, she clung  to a log for 24 hours': 7,
 'Collecting beche-de-mer': 5,
 'Hiking on the beach': 2,
 'Batin': 2,
 'Attempting to catch a crocodile': 5,
 'Conducting research': 9,
 'Attempting to illegally enter the USA': 3,
 "Lying prone in 2' of water": 2,
 "Scientific research (Dr. Sonny Gruber's student)": 9,
 'Watching seals': 9,
 'Attempting to attract dolphins': 9,
 'Boeing 757 enroute from Porta Plata plunged into the sea': 7,
 'Vessel caught fire & capsized, survivors in the water': 7,
 'Dropping anchor': 9,
 'NSB Meshing': 9,
 'Racing ski': 6,
 'The 426-ton cargo ship Mia, laden with cement, capsized in heavy seas ': 7,
 'Yacht race': 6,
 'Crawling': 2,
 'Fihing': 5,
 'Military ocean training': 2,
 'On a round-the-world expedition': 7,
 'Towing a dead whale out to sea': 9,
 'Watching the sardine run': 9,
 'Spearishing': 5,
 'Lifeguard Training Exercise': 3,
 'Lifeguard Exercises': 3,
 'Scalloping': 5,
 'Squatting in the water': 2,
 'Rescuing': 9,
 'Kite Foiling': 6,
 'Sightseeing': 3,
 'Picking opihi': 5,
 'Photo shoot': 3,
 'Kakaying': 6,
 'Washing hands': 2,
 'Kneeling in the water': 2,
 'Attempting to fix motor': 9,
 'Steinhart Aquarium': 8,
 'Ran into the water': 2,
 'In waist-deep water': 4,
 'In deep water about 100 yards from his ship': 4,
 'Collecting aquarium specimens': 8,
 'Pacific Seafarer of US Navy': 9,
 'Hand lining for shad': 5,
 'Pulling raft out to ride to shore': 5,
 'Testing classified underwater electronic gear for Raytheon Corporation, vessel torn apart by explosion': 7,
 'Gigging for flounder': 7,
 'On inflatable raft': 9,
 'Pulling anchor': 9,
 'Dragging banana seeds through the shallows': 4,
 'Attaching a line at sea': 5,
 'Cleaning hull of ship ': 9,
 '3 men & 2 boys picked up wearing life jackets and with inner tube': 9,
 'Attempting to set underwater endurance record': 4,
 'Hunting turtle': 5,
 'Crossing the bay at the ford': 6,
 'Crouching in the water': 2,
 'Body found on deserted luxury yacht, 38’ Christine': 7,
 'Swept off deck of S.S.Frontenac enroute from West Indies to US': 9,
 'Cleaning a tank': 9,
 'Rolled off raft': 9,
 'Yacht Trashman capsized in storm': 7,
 'Vehicle plunged over cliff into the water': 7,
 'Catching sardines': 5,
 'Foundering of the Israeli freighter Mezada': 7,
 'Exercising his dog in the shallows': 2,
 'Ferry capsized': 7,
 'Crabbing (spearing crabs)': 5,
 'Washing cooking pans': 2,
 'Sight-seeing': 3,
 'Leaving the water': 3,
 'Overturned skiff': 6,
 'Abandoning burning ship Captain George in raging seas': 3,
 'Arsinoe, a French tanker': 3,
 'Collecting marine specimens': 5,
 'Searching for remains of  Dr. Marais': 4,
 'Yacht Gooney Bird foundered, 4 survivors on raft': 7,
 'Lying in 2 feet of water': 2,
 'Greek steamship Lakonia caught fire, 98 of her 646 passengers, and 30 of her crew of 376 perished': 7,
 'Washed into sea while picking opihi': 5,
 'The 500-ton coastal trader Polurrian foundered ': 7,
 'A group of survivors on a raft for 17-days': 7}

hl_df['activity'].replace(clean_activities, inplace=True)

In [87]:
hl_df['activity'].value_counts()

activity
1                             1434
5                             1261
3                             1244
4                              759
2                              688
6                              502
7                              324
8                               98
9                               54
sand tiger                      11
.                                3
                                 1
pêcheur de bichiques             1
male                             1
                                 1
Loss of the schooner Nomad       1
Angling                          1
Name: count, dtype: int64

In [89]:
act_categories = {
    1 : 'surfing',
    2 : 'bathing',
    3 : 'swimming',
    4 : 'diving',
    5 : 'fishing',
    6 : 'boat sports',
    7 : 'watercraft',
    8 : 'teasing shark',
    9 : 'other'
}


hl_df['activity'].replace(act_categories, inplace=True)

In [91]:
hl_df['activity'].value_counts().to_dict()

{'surfing': 1434,
 'fishing': 1261,
 'swimming': 1244,
 'diving': 759,
 'bathing': 688,
 'boat sports': 502,
 'watercraft': 324,
 'teasing shark': 98,
 'other': 54,
 'sand tiger': 11,
 '.': 3,
 '   ': 1,
 'pêcheur de bichiques': 1,
 'male': 1,
 ' ': 1,
 'Loss of the schooner Nomad': 1,
 'Angling': 1}

In [93]:
final_final_touches = {
 '.': 9,
 '   ': 9,
 'pêcheur de bichiques': 5,
 'male': 9,
 ' ': 9,
 'Loss of the schooner Nomad': 7,
 'Angling': 5
}

hl_df['activity'].replace(final_final_touches, inplace=True)

In [95]:
hl_df['activity'].value_counts()

activity
surfing          1434
fishing          1261
swimming         1244
diving            759
bathing           688
boat sports       502
watercraft        324
teasing shark      98
other              54
sand tiger         11
9                   6
5                   2
7                   1
Name: count, dtype: int64

In [102]:
propna(hl_df)

Percentage of data that is NaN (in %)


{'year': 0.03,
 'country': 0.72,
 'sex': 8.31,
 'age': 42.97,
 'activity': 0.0,
 'injury': 0.5,
 'type': 0.26,
 'size': 0.0}

In [99]:
hl_df['activity'].fillna('other', inplace=True)