In [1]:
# This notebook shows the initial cleaning performed on the raw dataset.
# We removed unnecessary columns and incidents that were not shark bites.

In [2]:
# For output clarity 
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np 
import re

In [4]:
# Load the raw shark incident data.
# Sources:
#   Kaggle: https://www.kaggle.com/datasets/thedevastator/shark-attacks-the-risks-of-coastal-water-activit
#   Shark Research Institute's website: https://www.sharkattackfile.net/incidentlog.htm
file_path = "shark_bite_data_raw.csv"
raw_df = pd.read_csv(filepath_or_buffer=file_path)
# Show the column labels of the freshly loaded frame.
raw_df.columns


Index(['index', 'Case Number', 'Date', 'Year', 'Type', 'Country', 'Area',
       'Location', 'Activity', 'Name',
       ...
       'Unnamed: 246', 'Unnamed: 247', 'Unnamed: 248', 'Unnamed: 249',
       'Unnamed: 250', 'Unnamed: 251', 'Unnamed: 252', 'Unnamed: 253',
       'Unnamed: 254', 'Unnamed: 255'],
      dtype='object', length=257)

In [5]:
#list(raw_df.columns)

In [6]:
# Keep only the named columns; the many empty 'Unnamed: N' columns are left behind.
columns = [
    # incident identification and timing
    'index', 'Case Number', 'Date', 'Year', 'Type',
    # where it happened and what the person was doing
    'Country', 'Area', 'Location', 'Activity',
    # person and outcome
    'Name', 'Unnamed: 9', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
    # note the trailing space in the raw header
    'Species ',
    # source / bookkeeping columns
    'Investigator or Source', 'pdf', 'href formula', 'href',
    'Case Number.1', 'Case Number.2', 'original order',
]
selected_df = raw_df.loc[:, columns]

In [7]:
# Remove the repeated case-number columns.
selected_unique_df = selected_df.drop(columns=['Case Number.1', 'Case Number.2'])

In [8]:
# Remove the source / reference columns.
clean_species_df = selected_unique_df.drop(
    columns=['Investigator or Source', 'pdf', 'href formula', 'href', 'original order']
)
clean_species_df

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Unnamed: 9,Age,Injury,Fatal (Y/N),Time,Species
0,0,2020.02.05,5-Feb-20,2020.0,Unprovoked,USA,Maui,,Stand-Up Paddle boarding,,,,"No injury, but paddleboard bitten",N,09h40,Tiger shark
1,1,2020.01.30.R,Reported 30-Jan-2020,2020.0,Provoked,BAHAMAS,Exumas,,Floating,Ana Bruna Avila,F,24,PROVOKED INCIDENT Scratches to left wrist,N,,
2,2,2020.01.17,17-Jan-20,2020.0,Unprovoked,AUSTRALIA,New South Wales,Windang Beach,Surfing,Will Schroeter,M,59,Laceration ot left ankle and foot,N,08h00,"""A small shark"""
3,3,2020.01.16,16-Jan-20,2020.0,Unprovoked,NEW ZEALAND,Southland,Oreti Beach,Surfing,Jordan King,F,13,Minor injury to lower leg,N,20h30,Broadnose seven gill shark?
4,4,2020.01.13,13-Jan-20,2020.0,Unprovoked,USA,North Carolina,"Rodanthe, Dare County",Surfing,Samuel Horne,M,26,Lacerations to foot,N,14h33,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,6457,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,,,FATAL,Y,,
6458,6458,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,,,FATAL,Y,,
6459,6459,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,,,FATAL,Y,,
6460,6460,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,,,FATAL,Y,,


In [9]:
# The raw header is 'Species ' (trailing space); rename it, then list the distinct values.
clean_species_df.rename(columns={'Species ': 'Species'}, inplace=True)
clean_species_df['Species'].unique().tolist()

['Tiger shark',
 nan,
 '"A small shark"',
 'Broadnose seven gill shark?',
 'Lemon shark',
 'White shark',
 'Shovelnose "shark" which is a ray, not a shark)',
 '3.4 m tiger shark',
 "10'-12' tiger shark",
 'Juvenile blacktip shark',
 "5' blacktip shark",
 "4' to 5' blacktip shark",
 'White shark, 4 m',
 "10' tiger shark",
 "4' to 5' shark",
 'Oceanic whitetip shark',
 "7' shark",
 'White shark, 3.5 m',
 'Juvenile shark',
 '1+ m shark',
 "6' to 7' shark",
 "2' shark",
 "14' white shark",
 'Nurse shark',
 "6.5' shark",
 "10' to 12' shark",
 "Blacktip shark, 5'",
 'Blacktip or Spinner shark',
 'Sandtiger shark',
 'White shark, 3m',
 'shark pup',
 'White shark, juvenile',
 'Bull shark pup',
 "Bull shark, 4' to 5'",
 'Bull shark, 3.5 m',
 'Shark involvement unconfirmed but considered probable',
 "8' shark",
 'Grey nurse shark, 3 m female',
 'Cookiecutter shark',
 'Sand tiger shark',
 "12' shark",
 "Grey reef shark, 4'",
 "4' shark",
 'Spinner shark',
 '2.5 m shark',
 'Bull shark',
 'Reported

In [10]:
# Add 'Unknown' value to empty rows.
# The fill is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form operates on a temporary object
# and does not update the frame when pandas Copy-on-Write is active.
clean_species_df = clean_species_df.fillna({'Species': 'Unknown'})

# Species values that mark an incident as not involving a shark, or as
# unconfirmed / doubtful / invalid. The strings (including their typos) must
# match the raw data exactly, so do not "correct" them.
NON_SHARK_SPECIES = [
    'Shark involvement unconfirmed but considered probable',
    'Shovelnose "shark" which is a ray, not a shark)',
    'Reported as shark bite but injury caused by stingray',
    'Reported as shark attacks but injuries caused by toadfish',
    'Injury most likely caused by barracuda, not a shark',
    'Shark involvement not confirmed',
    'Shark invovlement not confirmed',
    'Shark involvement unconfirmed',
    'Injury believed caused by an eel, not a shark',
    'No shark invovlement',
    'Shark involvement prior to death not confirmed',
    'Shark involvement questionable',
    'Shark involvement highly doubtful',
    'Shark involvement not cofirmed',
    'No shark invovlement - it ws a publicity stunt',
    "Said to involve an 8' shark but more likely damage caused by debris",
    'Death may have been due to drowning',
    "Thought to involve a 3' to 4' shark, but shark involvement not confirmed",
    'Shark involvement prior to death still to be determined',
    'Shark involvement not confirmed & highly unlikely',
    'Salmon shark suspected, but unlikely',
    'Shark involvement prior to death was not confirmed',
    'Reported by media as shark attack, but shark involvement prior to death was not confirmed',
    'Shark involvement not confirmed; thought to be a barracuda bite',
    'Not a shark attack; it was a hoax',
    'Questionable Incident',
    'Questionable incident',
    'Questionable',
    "Mr. Burgess of ISAF announced the injury was the bite of a 1.8 m [6'], 2- to 3-year old white shark. Subsequent investigation revealed there was no shark involvement in this incident",
    'shark involvement not confirmed',
    'Shark involvement probable, but not confirmed',
    'Shark involvement not confirmed, injury may have been caused by a bluefish',
    'Shark involvement not confirmed, injury may be due to a stingray',
    'Shark involvement prior to death unconfired',
    'Questionable incident - shark bite may have precipitated drowning',
    'Shark involvement prior to death suspected but not confirmed',
    'Shark involvement prior to death could not be determined',
    'Shark involvement suspected but not confirmed',
    'Doubtful / Unconfirmed attack / Unable to verify in local records',
    'Shark involvement not confirmed; officials considered barracua',
    'Questionable incident; reported as shark attack but thought to involve a pinniped instead',
    'Reported as a shark attack, the story was a hoax',
    'Considered a "Doubtful" incident',
    'Shark involvement prior to death remains unconfirmed',
    'Shark involvement doubtful',
    'Not authenticated',
    'Invalid incident',
    'Invalid',
    "Questionable incident, said to involve a 6' shark",
    'Shark involvement prior to death unconfirmed',
    'Dooley believed his Injury was caused by stingray (Dasyatidae family)',
    'According to Benjamin, the injury was inflicted by a barracuda, not a shark',
    'Shark involvement prior to deaths was not confirmed',
    'No shark involvement',
    'Description of shark does not ring true',
    '2\' "banjo shark"',
]

# Remove rows with no shark involvement: one exact-match membership test
# replaces the long run of per-value `!=` filters.
clean_species_df = clean_species_df[~clean_species_df['Species'].isin(NON_SHARK_SPECIES)]

# The final exclusion is applied only to shark_attack_df; clean_species_df
# keeps those rows, exactly as this cell left it before.
shark_attack_df = clean_species_df[
    clean_species_df['Species'] != 'Said to involve a white shark but shark involvement not confirmed'
]
shark_attack_df

Unnamed: 0,index,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Unnamed: 9,Age,Injury,Fatal (Y/N),Time,Species
0,0,2020.02.05,5-Feb-20,2020.0,Unprovoked,USA,Maui,,Stand-Up Paddle boarding,,,,"No injury, but paddleboard bitten",N,09h40,Tiger shark
1,1,2020.01.30.R,Reported 30-Jan-2020,2020.0,Provoked,BAHAMAS,Exumas,,Floating,Ana Bruna Avila,F,24,PROVOKED INCIDENT Scratches to left wrist,N,,Unknown
2,2,2020.01.17,17-Jan-20,2020.0,Unprovoked,AUSTRALIA,New South Wales,Windang Beach,Surfing,Will Schroeter,M,59,Laceration ot left ankle and foot,N,08h00,"""A small shark"""
3,3,2020.01.16,16-Jan-20,2020.0,Unprovoked,NEW ZEALAND,Southland,Oreti Beach,Surfing,Jordan King,F,13,Minor injury to lower leg,N,20h30,Broadnose seven gill shark?
4,4,2020.01.13,13-Jan-20,2020.0,Unprovoked,USA,North Carolina,"Rodanthe, Dare County",Surfing,Samuel Horne,M,26,Lacerations to foot,N,14h33,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6457,6457,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,,,FATAL,Y,,Unknown
6458,6458,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,,,FATAL,Y,,Unknown
6459,6459,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,,,FATAL,Y,,Unknown
6460,6460,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,,,FATAL,Y,,Unknown


In [11]:
# Save the fully filtered frame. shark_attack_df carries one more exclusion
# ('Said to involve a white shark but shark involvement not confirmed') than
# clean_species_df, so writing clean_species_df would leave unconfirmed rows
# in the "confirmed" file.
shark_attack_df.to_csv("shark_bite_confirmed.csv", index=False)