# Importing the necessary tools

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import seaborn as sns
from IPython.display import display_html
from itertools import chain,cycle

In [2]:
# Load the raw shark-attack records from the CSV file into a DataFrame.
# NOTE(review): the "unicode_escape" codec presumably works around bytes that
# are not valid UTF-8 in this file - confirm against the data source.

sharks = pd.read_csv(
    "../data/attacks.csv",
    header=0,
    encoding="unicode_escape",
)

In [3]:
# Helper that renders several DataFrames next to each other in one output.

def display_side_by_side(*args,titles=cycle([''])):
    """Render the given DataFrames side by side as a single HTML output.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, in left-to-right order.
    titles : iterable of str, optional
        Headings paired with the frames in order. If it is exhausted before
        the frames are, every remaining frame gets '</br>' as its heading.
        The default is an endless supply of empty headings.

    Returns
    -------
    None
        The markup is passed to ``display_html(..., raw=True)``.
    """
    html_parts = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        # Style only the opening <table ...> tag. Replacing every occurrence
        # of the word "table" would also rewrite the closing tag into
        # '</table style="display:inline">' and alter any cell text that
        # happens to contain "table".
        table_html = df.to_html().replace('<table', '<table style="display:inline"', 1)
        html_parts.append('<th style="text-align:center"><td style="vertical-align:top">')
        html_parts.append(f'<h2 style="text-align: center;">{title}</h2>')
        html_parts.append(table_html)
        html_parts.append('</td></th>')
    display_html(''.join(html_parts), raw=True)

# First info about the dataset: shape, columns...

In [4]:
# First look at the raw data: how many rows and columns were loaded?

raw_shape = sharks.shape
print(f"The shape is now {raw_shape}")

The shape is now (25723, 24)


In [5]:
# Overview of the raw data: summary statistics, the column names and a few
# randomly drawn rows. The third table is a random sample (it changes on every
# run), so it is labelled as such rather than as the head of the frame.
display_side_by_side(
    sharks.describe(),
    pd.DataFrame(sharks.columns),
    sharks.sample(6),
    titles=['Describe', "Column names", 'Random sample'],
)

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0

Unnamed: 0,0
0,Case Number
1,Date
2,Year
3,Type
4,Country
5,Area
6,Location
7,Activity
8,Name
9,Sex

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
19502,,,,,,,,,,,,,,,,,,,,,,,,
14747,,,,,,,,,,,,,,,,,,,,,,,,
5719,1889.01.31.R,Reported 31-Jan-1889,1889.0,Provoked,USA,Florida,Hillsborough Bay,Fishing,"rowboat,",,,No injury to occupants. Gaffed shark capsized boat PROVOKED INCIDENT,N,,,"Timaru Herald, 1/31/1889",1889.01.31.R-Florida.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1889.01.31.R-Florida.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1889.01.31.R-Florida.pdf,1889.01.31.R,1889.01.31.R,584.0,,
16409,,,,,,,,,,,,,,,,,,,,,,,,
23531,,,,,,,,,,,,,,,,,,,,,,,,
13352,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# Custom function to summarise data-quality problems of a DataFrame

def display_errors(df):
    """Show missing-value and duplicate-row counts of ``df`` side by side.

    Three tables are handed to ``display_side_by_side``:

    * per-column count of ``df.isna()``,
    * per-column count of ``df.isnull()``,
    * value counts of ``df.duplicated()`` (True = row repeats an earlier one).

    ``DataFrame.isnull`` is an alias of ``DataFrame.isna`` in pandas, so the
    first two tables always show the same numbers; both are kept so the
    rendered layout stays as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    Whatever ``display_side_by_side`` returns.
    """
    total_nas = pd.DataFrame(df.isna().sum())
    total_null = pd.DataFrame(df.isnull().sum())
    duplicated = pd.DataFrame(df.duplicated().value_counts())

    return display_side_by_side(
        total_nas,
        total_null,
        duplicated,
        titles=['Sum of NAs', "Sum of Nulls", "Number of Duplicates"],
    )

In [7]:
# Inspect the raw data: missing values per column and duplicated rows.

display_errors(df=sharks)

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
True,19411
False,6312


# Data cleaning

In [8]:
# Remove the columns that are not needed for the analysis; the original
# frame `sharks` is left untouched.

columns_to_drop = [
    "Case Number",
    "Area",
    "pdf",
    "href formula",
    "Name",
    "href",
    "Case Number.1",
    "Case Number.2",
    "original order",
    "Unnamed: 22",
    "Unnamed: 23",
    "Investigator or Source",
]
filtered_sharks = sharks.drop(columns=columns_to_drop)

filtered_sharks.columns.tolist()

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ']

In [9]:
# Two headers carry a trailing space ("Sex " and "Species "); rename them so
# the columns can be addressed by their clean names.

clean_names = {"Sex ": "Sex", "Species ": "Species"}
sharks_fil_ren = filtered_sharks.rename(columns=clean_names)

sharks_fil_ren.columns.tolist()

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species']

In [10]:
# Drop the rows in which every remaining column is missing (completely empty
# records). Rows with only some missing values are kept at this stage.

sharks_nona = sharks_fil_ren.dropna(how="all")
print(f"The shape is now {sharks_nona.shape}")

display_errors(sharks_nona)
display_side_by_side(sharks_nona.sample(5))

The shape is now (6302, 12)


Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
False,6295
True,7


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
3945,01-Dec-1960,1960.0,Unprovoked,AUSTRALIA,"Colledge's Crossing, Brisbane River",Fishing,M,,Bitten & survived,N,,"Bull shark, 1m"
2913,21-Dec-1983,1983.0,Unprovoked,SOUTH AFRICA,Nahoon,Swimming,M,47.0,Shin lacerated,N,08h15,"Raggedtooth shark, >1 m"
819,06-Feb-2012,2012.0,Unprovoked,AUSTRALIA,Wurtulla,Surfing,M,29.0,"No injury, but fin lost from surfboard",N,12h00,
3365,6-Apr-1971,1971.0,Unprovoked,MEXICO,"Copacabana Beach, Acapulco",Surfing,M,19.0,"FATAL, left thigh bitten",Y,11h00,
3904,17-Apr-1961,1961.0,Provoked,SOUTH AFRICA,Port Elizabeth Oceanarium,Diving,M,,Shark bit swimfin after diver kicked shark PROVOKED INCIDENT,N,,"2 m ""yellow belly"" captive shark. Shark destroyed by aquarium staff next day"


In [11]:
# Listwise deletion of the rows that still have too many gaps.
# Own criterion: a row is kept only when more than 90 % of its columns are
# filled. `thresh` is the minimum number of non-missing values a row must
# have; with the 12 remaining columns int(0.9 * 12 + 1) = 11, so a row may
# have at most one missing value.

min_filled = int((90/100)*sharks_nona.shape[1] + 1)
sharks_nona_2 = sharks_nona.dropna(axis=0, thresh=min_filled)

display_errors(sharks_nona_2)
print(f"The shape is now {sharks_nona_2.shape}")
display(sharks_nona_2.sample(6))

# Looking good: no duplicates are left and the NaNs are significantly reduced.
# Some NaNs remain, but removing them as well would discard a lot of data
# that is useful for further investigation.


Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
False,2975


The shape is now (2975, 12)


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
3649,25-Dec-1964,1964.0,Unprovoked,SOUTH AFRICA,Port Alfred,Swimming,M,19,"Hand, ankle & calf lacerated",N,17h00,Raggedtooth shark
848,Reported 28-Oct-2011,2011.0,Unprovoked,SCOTLAND,Spey Bay,Surfing,M,26,"No injury, shark bumped leg & board.",N,,8' to 10' shark
1206,07-Sep-2008,2008.0,Unprovoked,AUSTRALIA,"Clarks Beach, Byron Bay",Surfing,M,51,Shark became tangled in his surfboard leash. T...,N,12h00,3 m shark
526,27-Jul-2014,2014.0,Unprovoked,USA,"Sunset Beach, Brunswick County",Swimming,M,Teen,Left foot bitten,N,,Possibly juvenile tiger shark
1487,19-Apr-2006,2006.0,Unprovoked,USA,"Daytona Beach, Volusia County",Standing,F,13,3 tiny punctures & small lacerations on right ...,N,15h00,
3526,13 or 30-May-1967,1967.0,Provoked,AUSTRALIA,Outer Harbor,,M,30,Thigh abraded & lacerated Recorded as PROVOKED...,N,Morning,Bronze whaler shark


"Looking good. No duplicates and NaN are significantly reduced. It's obvious that I still have a lot of NaNs, but removing\nthem only leads to ignoring a lot of data, useful for further investigations"

In [12]:
# "Year" was read as float; convert it to int. This is safe here because the
# threshold filter above left no missing values in "Year".
# `assign` builds a new frame instead of writing into a frame derived from
# another one, which is the pattern pandas' chained-assignment warning is about.

sharks_nona_2 = sharks_nona_2.assign(Year=sharks_nona_2["Year"].astype(int))

display(sharks_nona_2.sample())

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2137,03-Feb-1999,1999,Unprovoked,USA,"Hobe Sound, Martin County",Surfing,M,25,Left hand bitten,N,15h30,Spinner shark


'Nice'

In [13]:
# Only the month is of interest: take the three-character token between the
# two hyphens of "Date" (e.g. "Dec" from "01-Dec-1960"). Dates that do not
# contain that pattern become NaN. The column is then renamed to "Month".

month_only = sharks_nona_2["Date"].str.extract(r"-(\w{3})-", expand=False)
sharks_nona_2 = (
    sharks_nona_2
    .assign(Date=month_only)
    .rename(columns={"Date": "Month"})
)

sharks_nona_2.sample(10)

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
5183,Apr,1924,Unprovoked,AUSTRALIA,Kiama,"Fishing, fell in water & swimming strongly to ...",M,20,"FATAL, partial remains recovered",Y,Afternoon,
66,Oct,2017,Unprovoked,AUSTRALIA,Birubi Point,Surfing,M,31,Minor injury to foot,N,09h00,"Wobbegong shark, 1 m"
1113,Jul,2009,Unprovoked,SOUTH AFRICA,"Jogensfontein, Stilbaai",Surfing,M,37,Leg bitten,N,11h15,
5406,Feb,1910,Unprovoked,AUSTRALIA,Bunbury,Surf bathing,M,,"Shoulder, back & leg bitten",N,Night,5.5' to 6' shark
1731,Oct,2003,Invalid,USA,Palm Beach?,Swimming to shore from boat or kayak,M,,"Fatal, drowning or scavenging. Two hours late...",Y,--,Shark involvement prior to death was not confi...
1447,,2006,Unprovoked,USA,"Masonboro Island, New Hanover County",Wading,F,59,Lacerations & punctures to left foot,N,12h00,
2254,May,1997,Unprovoked,USA,"Flagler Beach, Flagler County",Surfing,M,12,Left ankle bitten,N,11h00,Possibly a sand shark
1552,Sep,2005,Unprovoked,AUSTRALIA,"Fishery Bay, Eyre Peninsula",Surfing,M,40,Lacerations to right arm & thigh,N,15h00,4 m [13'] white shark
3463,Aug,1968,Unprovoked,COLUMBIA,"Makuaka Caño, Taganga",Dynamite fishing,M,18,Lacerations to head,N,10h30,2 m shark
568,Apr,2014,Unprovoked,USA,"New Smyrna Beach, Volusia County",Surfing,M,teen,Lacerations to foot,N,13h30,


In [14]:
#sharks_nona_2.to_csv("../data/attacks_cleaned.csv")