In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
from IPython.display import display_html
from itertools import chain,cycle
import random

In [2]:
# Importing the file.csv

sharks = pd.read_csv("../data/attacks.csv", header = 0, encoding= 'unicode_escape')

In [3]:
# This is a function to display multiple tables in the same row

def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

In [4]:
# First info

sharks.shape

(25723, 24)

In [5]:
# Displaying Describe and Head tables
display_side_by_side(sharks.describe(), pd.DataFrame(sharks.columns),sharks.sample(6), titles=['Describe', "Column names", 'Head'])

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0

Unnamed: 0,0
0,Case Number
1,Date
2,Year
3,Type
4,Country
5,Area
6,Location
7,Activity
8,Name
9,Sex

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
16582,,,,,,,,,,,,,,,,,,,,,,,,
1755,2003.07.15,15-Jul-2003,2003.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Wading,C.K.,F,15.0,Heel & sole of left foot,N,14h37,,"S. Petersohn, GSAF",2003.07.15-CK.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.07.15-CK.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.07.15-CK.pdf,2003.07.15,2003.07.15,4548.0,,
16023,,,,,,,,,,,,,,,,,,,,,,,,
9710,,,,,,,,,,,,,,,,,,,,,,,,
1788,2003.04.18,18-Apr-2003,2003.0,Unprovoked,USA,Florida,"North Beach, Patrick AFB, Brevard County",Surfing,male,M,12.0,2 lacerations on left thigh,N,08h50,Unidentified species,"Local 6 News; Tallahassee Democrat (FL) 4/19/2003; J. Waymer, Florida Today (Melbourne), 4/19/2003",2003.04.18-PatrickAFB.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.04.18-PatrickAFB.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/2003.04.18-PatrickAFB.pdf,2003.04.18,2003.04.18,4515.0,,
16490,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
#Custom function to display errors

def display_errors(df):
    
    total_nas = pd.DataFrame(df.isna().sum())
    total_null = pd.DataFrame(df.isnull().sum())
    duplicated = pd.DataFrame(df.duplicated().value_counts())


    return display_side_by_side(total_nas, total_null, duplicated, titles=['Sum of NAs', "Sum of Nuls", "Number of Duplicates"])

In [7]:
# Display of the errors

display_errors(sharks)

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
True,19411
False,6312


In [8]:
# We have to get rid of all unnecessary columns

filtered_sharks = sharks.drop(["Case Number", "Area", "pdf", "href formula", "href", "Case Number.1", "Case Number.2", "original order", "Unnamed: 22", "Unnamed: 23", "Investigator or Source"], axis = 1, inplace=False)

list(filtered_sharks.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Name',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ']

In [9]:
# It is useful to rewrite all column names in the correct format (deleting spaces, etc.)

sharks_fil_ren = filtered_sharks.rename(columns = {"Sex ":"Sex", "Species ":"Species"}, inplace = False)

list(sharks_fil_ren.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Name',
 'Sex',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species']

In [10]:
# It's time to delete all the Null and NaN values

sharks_nona = sharks_fil_ren.dropna(axis = 0, how="all", inplace=False)
print(sharks_nona.shape)

display_errors(sharks_nona)
display_side_by_side(sharks_nona.sample(5))

(6302, 13)


Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Name,210
Sex,565
Age,2831
Injury,28

Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Name,210
Sex,565
Age,2831
Injury,28

Unnamed: 0,0
False,6300
True,2


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species
2417,17-Oct-1994,1994.0,Unprovoked,BRAZIL,Piedade,Surfing,Ednaldo Jose da Silva,M,,Survived,N,,
1413,11-Dec-2006,2006.0,Unprovoked,NEW ZEALAND,"Raglan, Manu Bay",Surfing,Elliot Paerata-Reid,M,10.0,Foot bitten,N,11h00,2 to 3 m shark
3491,1968,1968.0,Invalid,USA,"Jensen Beach, Martin County",Surfing,,,,,,17h00,Questionable incident
3026,24-May-1981,1981.0,Unprovoked,USA,"Ha'ena Beach Park, Kaua'i","Scuba diving, reportedly also spearfishing",Roger B. Garletts,M,,"FATAL, disappeared, dive gear & shredded tooth-marked wetsuit were recovered",Y,,
426,07-May-2015,2015.0,Unprovoked,USA,"Cocoa Beach, Brevard County",Swimming,Josh Green,M,,"Lacerations to lower left leg, ankle & foot",N,15h00,


In [126]:
# After that, let's implement a listwise deletion of the rest of the missing values.
# Own criteria: if NaN values > 90 % of the total rows, delete those rows.

sharks_nona_2 = sharks_nona.dropna(axis=0, thresh=int((90/100)*sharks_nona.shape[1] + 1))
      
display_errors(sharks_nona_2)
sharks_nona_2.shape
"""Looking good. No duplicates and NaN are significantly reduced"""

Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Name,1
Sex,8
Age,230
Injury,0

Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Name,1
Sex,8
Age,230
Injury,0

Unnamed: 0,0
False,2971


'Looking good. No duplicates and NaN are significantly reduced'