# Importing the necessary tools

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import seaborn as sns
from IPython.display import display_html
from itertools import chain,cycle

In [2]:
# Importing the file.csv

sharks = pd.read_csv("../data/attacks.csv", header = 0, encoding= 'unicode_escape')

In [3]:
# This is a function to display multiple tables in the same row

def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

# First info about the dataset: shape, columns...

In [4]:
# First info

print(f"The shape is now {sharks.shape}")

The shape is now (25723, 24)


In [5]:
# Displaying Describe and Head tables
display_side_by_side(sharks.describe(), pd.DataFrame(sharks.columns),sharks.sample(6), titles=['Describe', "Column names", 'Head'])

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0

Unnamed: 0,0
0,Case Number
1,Date
2,Year
3,Type
4,Country
5,Area
6,Location
7,Activity
8,Name
9,Sex

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
18669,,,,,,,,,,,,,,,,,,,,,,,,
8026,0,,,,,,,,,,,,,,,,,,,,,,,
1570,2005.07.15,15-Jul-2005,2005.0,Unprovoked,USA,North Carolina,"Holden Beach, Brunswick County",Swimming,Chris Humphrey,M,22.0,Lacerations of left forearm,N,16h40,[4.5' to 5'] shark,"C. Creswell, GSAF",2005.07.15.a-Humphrey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2005.07.15.a-Humphrey.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2005.07.15.a-Humphrey.pdf,2005.07.15,2005.07.15,4733.0,,
23234,,,,,,,,,,,,,,,,,,,,,,,,
25510,,,,,,,,,,,,,,,,,,,,,,,,
12071,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
#Custom function to display errors

def display_errors(df):
    
    total_nas = pd.DataFrame(df.isna().sum())
    total_null = pd.DataFrame(df.isnull().sum())
    duplicated = pd.DataFrame(df.duplicated().value_counts())


    return display_side_by_side(total_nas, total_null, duplicated, titles=['Sum of NAs', "Sum of Nuls", "Number of Duplicates"])

In [7]:
# Display of the errors

display_errors(sharks)

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
True,19411
False,6312


# Data cleaning

In [8]:
# We have to get rid of all unnecessary columns

filtered_sharks = sharks.drop(["Case Number", "Area", "pdf", "href formula", "Name", "href", "Case Number.1", "Case Number.2", "original order", "Unnamed: 22", "Unnamed: 23", "Investigator or Source"], axis = 1, inplace=False)

list(filtered_sharks.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ']

In [9]:
# It is useful to rewrite all column names in the correct format (deleting spaces, etc.)

sharks_fil_ren = filtered_sharks.rename(columns = {"Sex ":"Sex", "Species ":"Species"}, inplace = False)

list(sharks_fil_ren.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species']

In [10]:
# It's time to delete all the Null and NaN values

sharks_nona = sharks_fil_ren.dropna(axis = 0, how="all", inplace=False)
print(f"The shape is now {sharks_nona.shape}")

display_errors(sharks_nona)
display_side_by_side(sharks_nona.sample(5))

The shape is now (6302, 12)


Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
False,6295
True,7


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
750,28-Aug-2012,2012.0,Unprovoked,AUSTRALIA,Red Bluff near Quobba Station,Surfing,M,34.0,Lacerations to torso and arm,N,15h30,
2203,25-Jan-1998,1998.0,Unprovoked,REUNION,,Bathing,M,,FATAL,Y,Mid afternoon,
5143,23-Oct-1926,1926.0,Sea Disaster,BERMUDA,18 miles southwest of Bermuda,British patrol boat 1250-ton HMS Valerian foundered in a hurricane,,,"Of 104 people in the water, only 20 survived. 84 people were lost, many to sharks. Sharks 2 pulled crew off life rafts",N,01h30,
3271,04-Apr-1974,1974.0,Unprovoked,SOUTH AFRICA,"Inyoni Rocks, Amanzimtoti",Surfing,M,17.0,Right foot lacerated,N,16h30,Juvenile dusky shark
2531,01-Oct-1992,1992.0,Unprovoked,AUSTRALIA,"North Point Beach, Moreton Island",Surfing,M,28.0,FATAL,Y,14h30,4.2 m white shark


In [11]:
# After that, let's implement a listwise deletion of the rest of the missing values.
# Own criteria: if NaN values > 90 % of the total rows, delete those rows.

sharks_nona_2 = sharks_nona.dropna(axis=0, thresh=int((90/100)*sharks_nona.shape[1] + 1))
      
display_errors(sharks_nona_2)
print(f"The shape is now {sharks_nona_2.shape}")
display(sharks_nona_2.sample(6))

"""Looking good. No duplicates and NaN are significantly reduced. It's obvious that I still have a lot of NaNs, but removing
them only leads to ignoring a lot of data, useful for further investigations"""


Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
False,2975


The shape is now (2975, 12)


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
3078,05-Jan-1980,1980.0,Unprovoked,CHILE,"Punta Negra, Pichidangui",Hookah Diving,M,,FATAL,Y,11h00,White shark
3598,28-Dec-1965,1965.0,Unprovoked,AUSTRALIA,"Bathurst Reef, Rottnest Island","Spearfishing, but standing in knee-deep water",M,40.0,Thigh & swim fin bitten,N,,1.8 m [6'] carpet shark
2154,02-Nov-1998,1998.0,Provoked,JAPAN,460 miles off Iwakuni,Fishing for tuna,M,52.0,PROVOKED INCIDENT Knee bitten by shark trap...,N,,6' shark
1341,25-Aug-2007,2007.0,Unprovoked,USA,"New Smyrna Beach, Volusia County",Surfing,M,27.0,6 lacerations to left hand,N,Morning,3' shark
1958,12-Apr-2001,2001.0,Unprovoked,USA,"New Smyrna Beach, Volusia County",Body boarding,M,12.0,Foot & ankle lacerated,N,12h34,Possibly a juvenile blacktip or spinner shark
3431,Jun-1969,1969.0,Unprovoked,USA,Sarasota County,Wading,M,8.0,No injury,N,20h00,


"Looking good. No duplicates and NaN are significantly reduced. It's obvious that I still have a lot of NaNs, but removing\nthem only leads to ignoring a lot of data, useful for further investigations"

In [12]:
#Cleaning a bit more Year column (we don't want them to be a float).

sharks_nona_2["Year"] = sharks_nona_2["Year"].astype(int)
#sharks_nona_2.loc[:, "Year"].astype(int)

display(sharks_nona_2.sample())

"""Nice"""

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2840,12-Oct-1985,1985,Unprovoked,USA,"Barbers Point, O'ahu",Floating on inner tube after diving for lobster,M,24,Left arm lacerated,N,,1.8 m to 2.4 m [6' to 8'] shark


'Nice'

In [13]:
# We don't need the day and year in the "Date" column

sharks_nona_2["Date"] = sharks_nona_2["Date"].str.extract(r"-(\w{3})-")

sharks_nona_2 = sharks_nona_2.rename(columns = {"Date": "Month"}, inplace=False)

sharks_nona_2 = sharks_nona_2.dropna(subset=["Month"], how="any", axis=0, inplace=False)

sharks_nona_2.sample()

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
3672,Jul,1964,Unprovoked,SOUTH AFRICA,Island Rock,Searching for remains of Dr. Marais,M,30,"No injury, shark charged & impaled itself on s...",N,,Zambesi shark


In [14]:
# We can filter the species by the proper name of the shark, everything else is redundant

sharks_nona_2["Species"] = sharks_nona_2["Species"].str.extract(r"([A-Z|a-z]{2,}\sshark)")
display_errors(sharks_nona_2)
sharks_nona_2.sample(5)

Unnamed: 0,0
Month,0
Year,0
Type,0
Country,0
Location,12
Activity,16
Sex,8
Age,221
Injury,0
Fatal (Y/N),78

Unnamed: 0,0
Month,0
Year,0
Type,0
Country,0
Location,12
Activity,16
Sex,8
Age,221
Injury,0
Fatal (Y/N),78

Unnamed: 0,0
False,2874


Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
436,Apr,2015,Unprovoked,USA,"3 miles off Jupiter, Palm Beach County",Spearfishing,M,70.0,Injuries to head & torso,N,Afternoon,Bull shark
1847,Jun,2002,Unprovoked,USA,"New Smyrna Beach, Volusia County",Swimming,F,11.0,Small lacerations on right lower leg,N,14h25,
1774,May,2003,Unprovoked,USA,Between Magic Sands Beach and Kahaluu Beach on...,Swimming,M,20.0,Right calf & heel bitten,N,11h45,reef shark
1824,Sep,2002,Provoked,USA,"Key Largo, Monroe County",Fishing,M,,Left thumb lacerated PROVOKED INCIDENT,N,Afternoon,blacktip shark
2666,Sep,1989,Unprovoked,USA,"Salvo, Dare County",Surfing,M,31.0,Non-fatal,N,,Sandbar shark


In [15]:
#Renaming

sharks_cl1 = sharks_nona_2

In [16]:
# We can capitalize the shark types
sharks_cl1["Species"] = sharks_cl1["Species"].str.capitalize()

In [17]:
sharks_cl1.sample(3)

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
1900,Sep,2001,Invalid,USA,"2 miles off Pompano Beach, Broward County",Wreck / Technical diving,M,42,FATAL or drowning & scavenging,Y,13h20,
3094,Jun,1979,Sea Disaster,USA,10 miles off Cape Canaveral,Floating with life preserver after his boat fo...,M,56,Lacerations to leg,N,Morning,
938,Mar,2011,Unprovoked,FIJI,Nukudamu,Diving / fishing,M,30,"Left forearm severely bitten, surgically amput...",N,16h00,


In [18]:
sharks_cl1.reset_index()

sharks_cl1.head()

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,Jun,2018,Boating,USA,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,Jun,2018,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,Jun,2018,Invalid,USA,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,
6,Jun,2018,Unprovoked,BRAZIL,"Piedade Beach, Recife",Swimming,M,18,FATAL,Y,Late afternoon,Tiger shark
7,May,2018,Unprovoked,USA,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,Lemon shark


# Cleaned dataset

At this point, I consider the table ready to be analyzed properly be taking the right columns. Data extraction & visualization will be taken place in a separated notebook

In [21]:
sharks_cl1.to_csv("../data/attacks_cleaned.csv", index = False)