In [127]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from IPython.display import display_html
from itertools import chain,cycle
import random

In [2]:
# Importing the file.csv

sharks = pd.read_csv("../data/attacks.csv", header = 0, encoding= 'unicode_escape')

In [3]:
# This is a function to display multiple tables in the same row

def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [4]:
# First info

sharks.shape

(25723, 24)

In [33]:
# Displaying Describe and Head tables
display_side_by_side(sharks.describe(), pd.DataFrame(sharks.columns),sharks.sample(6), titles=['Describe', "Column names", 'Head'])

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0

Unnamed: 0,0
0,Case Number
1,Date
2,Year
3,Type
4,Country
5,Area
6,Location
7,Activity
8,Name
9,Sex

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
811,2012.03.06.a,06-Mar-2012,2012.0,Provoked,AUSTRALIA,Victoria,"Shipwreck Cove, Melbourne Aquarium","Diving, feeding sharks",female,F,34.0,Superficial lacerations to right side of face PROVOKED ACCIDENT,N,11h30,"Tawny nurse shark, 40cm","The Age, 3/6/2012",2012.03.06.a-MelbourneAquarium.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2012.03.06.a-MelbourneAquarium.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2012.03.06.a-MelbourneAquarium.pdf,2012.03.06.a,2012.03.06.a,5492.0,,
5256,1920.01.24.R.b,Reported 24-Jan-1920,1920.0,Unprovoked,AUSTRALIA,Torres Strait,,Diving,male,M,,Lacerations to foot,N,,,"The Argus, 1/24/1920",1920.01.24.R.b-TorresStrait-diver.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1920.01.24.R.b-TorresStrait-diver.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1920.01.24.R.b-TorresStrait-diver.pdf,1920.01.24.R.b,1920.01.24.R.b,1047.0,,
13527,,,,,,,,,,,,,,,,,,,,,,,,
11495,,,,,,,,,,,,,,,,,,,,,,,,
13992,,,,,,,,,,,,,,,,,,,,,,,,
11784,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
#Custom function to display errors

def display_errors(df):
    
    total_nas = pd.DataFrame(df.isna().sum())
    total_null = pd.DataFrame(df.isnull().sum())
    duplicated = pd.DataFrame(df.duplicated().value_counts())


    return display_side_by_side(total_nas, total_null, duplicated, titles=['Sum of NAs', "Sum of Nuls", "Number of Duplicates"])

In [22]:
# Display of the errors

display_errors(sharks)

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
True,19411
False,6312


In [27]:
# We have to get rid of all unnecessary columns

filtered_sharks = sharks.drop(["Case Number", "Area", "pdf", "href formula", "href", "Case Number.1", "Case Number.2", "original order", "Unnamed: 22", "Unnamed: 23", "Investigator or Source"], axis = 1, inplace=False)

list(filtered_sharks.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Name',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ']

In [39]:
# It is useful to rewrite all column names in the correct format (deleting spaces, etc.)

sharks_fil_ren = filtered_sharks.rename(columns = {"Sex ":"Sex", "Species ":"Species"}, inplace = False)

list(sharks_fil_ren.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Name',
 'Sex',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species']

In [114]:
# It's time to delete all the Null and NaN values

sharks_nona = sharks_fil_ren.dropna(axis = 0, how="all", inplace=False)
print(sharks_nona.shape)

display_errors(sharks_nona)
display_side_by_side(sharks_nona.sample(5))

(6302, 13)


Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Name,210
Sex,565
Age,2831
Injury,28

Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Name,210
Sex,565
Age,2831
Injury,28

Unnamed: 0,0
False,6300
True,2


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
1990,17-Nov-2000,2000.0,Unprovoked,USA,"Bonita Springs, Lee County",Swimming,Colin Shadforth,M,73.0,Right calf lacerated,N,12h00,1.2 m to 1.8 m [4' to 6'] shark
112,23-Jul-2017,2017.0,Unprovoked,USA,Ventnor,,Isabella Smith,F,,Minor injury to hand,N,,Sandtiger shark 2'
2976,01-Jul-1982,1982.0,Unprovoked,USA,"Daytona Beach, Volusia County",Swimming,Janet Babb,F,19.0,Laceration to left leg,N,14h30,
5271,17-Jan-1919,1919.0,Unprovoked,AUSTRALIA,Newcastle Beach,Swimming,Douglas Arkell,M,,"Multiple injuries, left leg surgically amputated at knee",N,17h15,3.7 m to 4.3 m [12' to 14'] shark
5730,24-Jun-1888,1888.0,Boating,AUSTRALIA,Port Melbourne,Fishing,2 men,,,"No injury to occupants, shark holed boat",N,Evening,


In [131]:
# After that, let's implement a listwise deletion of the rest of the missing values.
# Own criteria: if NaN values > 25 % of the total rows, delete the rest of them randomly.

for i in list(sharks_nona.columns):
    if sharks_nona[i].isna().sum()/sharks_nona.shape[0] >= 0.25:
        sharks_nona_2 = sharks_nona.dropna(subset=[i], axis = 0, inplace=False)

display_errors(sharks_nona_2)
display_side_by_side(sharks_nona_2.sample(5))

Unnamed: 0,0
Date,0
Year,1
Type,0
Country,12
Location,176
Activity,229
Name,97
Sex,319
Age,1271
Injury,11

Unnamed: 0,0
Date,0
Year,1
Type,0
Country,12
Location,176
Activity,229
Name,97
Sex,319
Age,1271
Injury,11

Unnamed: 0,0
False,3463
True,1


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
5817,25-Jul-1880,1880.0,Boating,USA,The Narrows,Sailing,Captain Aleck Robertson,M,,"Shark bit stern, no injury to occupant",N,15h00,10' shark
6078,Ca. 1837,1837.0,Invalid,USA,Southern Wharf,,"adult male, a sailor",M,,7.6 m [25'] shark caught contained human remains,,,Shark involvement prior to death unconfirmed
4897,26-Aug-1935,1935.0,Unprovoked,AUSTRALIA,"At Flat Top, near Mackay","Fell overboard, hanging onto lifebuoy",Patrick Quinn,M,38.0,"FATAL. His body was not recovered, but about 3 weeks later 2 sharks were caught with human remains, thought to be those of Quinn",Y,Night,3.7 m [12'] shark
3829,25-Mar-1962,1962.0,Provoked,NEW ZEALAND,Alderman Islands,,W.T. Luxton,M,,Left foot bitten by hooked shark PROVOKED INCIDENT,N,,1.8 m [6'] shark
3889,07-Jul-1961,1961.0,Provoked,AUSTRALIA,Cape Moreton,Shark fishing,"35' motor launch, occupants: Bill Fulham & T. Fanning",,,"No injury to occupant, hooked shark bit boat's rudder PROVOKED INCIDENT",N,,"White shark, 5.2 m [17'], 2500-lb"


In [121]:
type(list(sharks_nona.columns))
print(list(sharks_nona.columns))

['Date', 'Year', 'Type', 'Country', 'Location', 'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species']


In [None]:
# Let's implement a listwise deletion of the rest of the missing values