# Importing the necessary tools

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import seaborn as sns
from IPython.display import display_html
from itertools import chain,cycle

In [2]:
# Importing the file.csv

sharks = pd.read_csv("../data/attacks.csv", header = 0, encoding= 'unicode_escape')

In [3]:
# This is a function to display multiple tables in the same row

def display_side_by_side(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2 style="text-align: center;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)

# First info about the dataset: shape, columns...

In [4]:
# First info

print(f"The shape is now {sharks.shape}")

The shape is now (25723, 24)


In [5]:
# Displaying Describe and Head tables
display_side_by_side(sharks.describe(), pd.DataFrame(sharks.columns),sharks.sample(6), titles=['Describe', "Column names", 'Head'])

Unnamed: 0,Year,original order
count,6300.0,6309.0
mean,1927.272381,3155.999683
std,281.116308,1821.396206
min,0.0,2.0
25%,1942.0,1579.0
50%,1977.0,3156.0
75%,2005.0,4733.0
max,2018.0,6310.0

Unnamed: 0,0
0,Case Number
1,Date
2,Year
3,Type
4,Country
5,Area
6,Location
7,Activity
8,Name
9,Sex

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
19461,,,,,,,,,,,,,,,,,,,,,,,,
11373,,,,,,,,,,,,,,,,,,,,,,,,
17722,,,,,,,,,,,,,,,,,,,,,,,,
4643,1944.08.20,20-Aug-1944,1944.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Margate,Swimming,Dennis Nissen,M,19.0,"FATAL, body not recovered",Y,14h00,,"T. Blake, M. Levine, GSAF",1944.08.20-Nissen.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1944.08.20-Nissen.pdf,http://sharkattackfile.net/spreadsheets/pdf_directory/1944.08.20-Nissen.pdf,1944.08.20,1944.08.20,1660.0,,
15266,,,,,,,,,,,,,,,,,,,,,,,,
23378,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
#Custom function to display errors

def display_errors(df):
    
    total_nas = pd.DataFrame(df.isna().sum())
    total_null = pd.DataFrame(df.isnull().sum())
    duplicated = pd.DataFrame(df.duplicated().value_counts())


    return display_side_by_side(total_nas, total_null, duplicated, titles=['Sum of NAs', "Sum of Nuls", "Number of Duplicates"])

In [7]:
# Display of the errors

display_errors(sharks)

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
Case Number,17021
Date,19421
Year,19423
Type,19425
Country,19471
Area,19876
Location,19961
Activity,19965
Name,19631
Sex,19986

Unnamed: 0,0
True,19411
False,6312


# Data cleaning

## Remove unnecessary columns

In [8]:
# We have to get rid of all unnecessary columns

filtered_sharks = sharks.drop(["Case Number", "Area", "pdf", "href formula", "Name", "href", "Case Number.1", "Case Number.2", "original order", "Unnamed: 22", "Unnamed: 23", "Investigator or Source"], axis = 1, inplace=False)

list(filtered_sharks.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ']

In [9]:
# It is useful to rewrite all column names in the correct format (deleting spaces, etc.)

sharks_fil_ren = filtered_sharks.rename(columns = {"Sex ":"Sex", "Species ":"Species"}, inplace = False)

list(sharks_fil_ren.columns)

['Date',
 'Year',
 'Type',
 'Country',
 'Location',
 'Activity',
 'Sex',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species']

## Deleting Null and NaNs

In [10]:
# It's time to delete all the Null and NaN values

sharks_nona = sharks_fil_ren.dropna(axis = 0, how="all", inplace=False)
print(f"The shape is now {sharks_nona.shape}")

display_errors(sharks_nona)
display_side_by_side(sharks_nona.sample(5))

The shape is now (6302, 12)


Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
Date,0
Year,2
Type,4
Country,50
Location,540
Activity,544
Sex,565
Age,2831
Injury,28
Fatal (Y/N),539

Unnamed: 0,0
False,6295
True,7


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
450,05-Feb-2015,2015.0,Unprovoked,AUSTRALIA,Mereweather Beach,Bodysurfing,M,,5 minor puncture wounds to lower left leg,N,,1.8 m shark
4921,24-Jan-1935,1935.0,Unprovoked,AUSTRALIA,"Off Ben Buckler, near Sydney",Fishing,M,,"No injury, sleeve ripped",N,,"Tiger shark, 12'"
1483,04-May-2006,2006.0,Unprovoked,USA,"Hutchinson Island, St. Lucie County",Swimming,F,20s,2 lacerations on lower right leg,N,16h30,
6241,1960s,0.0,Unprovoked,IRAQ,Shatt-al-Arab River,Fishing from a small boat & put his hand in the water while holding a dead fish,M,25,Right hand severed,N,Afternoon,
3133,31-Dec-1977,1977.0,Sea Disaster,USA,2 miles off Keahole Airport,"Swimming, after single-engine aircraft went down in the sea",M,49,Feet lacerated,N,Night,


In [11]:
# After that, let's implement a listwise deletion of the rest of the missing values.
# Own criteria: if NaN values > 90 % of the total rows, delete those rows.

sharks_nona_2 = sharks_nona.dropna(axis=0, thresh=int((90/100)*sharks_nona.shape[1] + 1))
      
display_errors(sharks_nona_2)
print(f"The shape is now {sharks_nona_2.shape}")
display(sharks_nona_2.sample(6))

"""Looking good. No duplicates and NaN are significantly reduced. It's obvious that I still have a lot of NaNs, but removing
them only leads to ignoring a lot of data, useful for further investigations"""


Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
Date,0
Year,0
Type,0
Country,0
Location,13
Activity,16
Sex,8
Age,231
Injury,0
Fatal (Y/N),84

Unnamed: 0,0
False,2975


The shape is now (2975, 12)


Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
959,11-Dec-2010,2010.0,Unprovoked,USA,"Tavares Bay, Maui",Surfing,M,46,Lacerations to right foot,N,13h51,
2175,01-Aug-1998,1998.0,Unprovoked,SOUTH AFRICA,Pringle Bay,Spearfishing,M,24,Leg bitten,N,13h00,"White shark, 4.9 m [16']"
1101,01-Aug-2009,2009.0,Unprovoked,USA,"Curlew Island, Breton Sound",Wade Fishing,M,56,Right ankle & foot bitten,N,10h00,Bull shark?
44,11-Feb-2018,2018.0,Invalid,BRAZIL,Espirito Santo,Cleaning fish,F,46,Lacerations to 4 toes of right foot,N,,"Injury believed caused by an eel, not a shark"
3458,15-Sep-1968,1968.0,Unprovoked,NEW ZEALAND,Otago Harbor,Spearfishing,M,24,"FATAL, left leg bitten, femoral artery severed",Y,10h15,"White shark, 4.3 m [14'], (tooth fragment reco..."
757,31-Jul-2012,2012.0,Unprovoked,USA,"Topanga Beach, Los Angeles County",Surfing,M,17,"No injury, surfer knocked off board when shark...",N,06h45,


"Looking good. No duplicates and NaN are significantly reduced. It's obvious that I still have a lot of NaNs, but removing\nthem only leads to ignoring a lot of data, useful for further investigations"

## Cleaning the Year column

In [12]:
#Cleaning a bit more Year column (we don't want them to be a float).

sharks_nona_2["Year"] = sharks_nona_2["Year"].astype(int)
#sharks_nona_2.loc[:, "Year"].astype(int)

display(sharks_nona_2.sample())

"""Nice"""

Unnamed: 0,Date,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
2165,14-Sep-1998,1998,Unprovoked,USA,"Stuart Beach, Martin County",Surfing,M,28,Right foot bitten,N,,1.8 m [6'] shark


'Nice'

In [13]:
# We don't need the day and year in the "Date" column

sharks_nona_2["Date"] = sharks_nona_2["Date"].str.extract(r"-(\w{3})-")

sharks_nona_2 = sharks_nona_2.rename(columns = {"Date": "Month"}, inplace=False)

sharks_nona_2 = sharks_nona_2.dropna(subset=["Month"], how="any", axis=0, inplace=False)

sharks_nona_2.sample()

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
80,Sep,2017,Unprovoked,USA,Fort Worth,Surfing,M,,Laceration to upper arm,N,14h00,4' to 5' shark


## Cleaning the Species column

In [14]:
list(sharks_nona_2["Species"])

['White shark',
 nan,
 nan,
 'Tiger shark',
 "Lemon shark, 3'",
 "Bull shark, 6'",
 nan,
 'Grey reef shark',
 nan,
 'Invalid incident',
 'Shark involvement not confirmed',
 'Tiger shark',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'Tiger shark',
 'White shark, 2.5 m',
 "6' shark",
 'Juvenile bull shark',
 nan,
 'Tiger shark',
 'Shark involvement not confirmed',
 'Bull shark',
 'White shark',
 "Tiger shark, 12'",
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 nan,
 'Bull shark, 2 m',
 'Possibly a wobbegong',
 'Injury believed caused by an eel, not a shark',
 'Galapagos shark?',
 '2m shark',
 'Bull shark',
 'Bull shark, 3 m ',
 'Grey reef shark. 2 m',
 'White shark, 3.5 m',
 'small shark',
 'Tiger shark',
 nan,
 'Juvenile nurse shark',
 'Tiger shark, female',
 'Tiger shark, female',
 'White shark, 4.6 m',
 nan,
 '2 m shark',
 'Tiger shark',
 'Cookiecutter shark',
 'Wobbegong shark, 1 m',
 nan,
 nan,
 'White shar

In [15]:
# We can filter the species by the proper name of the shark, everything else is redundant

sharks_nona_2["Species"] = sharks_nona_2["Species"].str.extract(r"([A-Z|a-z]{4,}\sshark)")
display_errors(sharks_nona_2)
sharks_nona_2.sample(5)

Unnamed: 0,0
Month,0
Year,0
Type,0
Country,0
Location,12
Activity,16
Sex,8
Age,221
Injury,0
Fatal (Y/N),78

Unnamed: 0,0
Month,0
Year,0
Type,0
Country,0
Location,12
Activity,16
Sex,8
Age,221
Injury,0
Fatal (Y/N),78

Unnamed: 0,0
False,2874


Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
4043,Nov,1959,Unprovoked,AUSTRALIA,"Fairhaven Beach, Lorne",Body surfing,M,19,"Thighs bitten, right hand lacerated",N,14h45,
5573,Aug,1899,Unprovoked,EGYPT,Port Said,Bathing,M,19,"Forearm, wrist & hand bitten",N,09h30,
196,Dec,2016,Provoked,AUSTRALIA,Merimbula,Surf fishing,M,20,Lacerations to both hands while attempting to ...,N,02h00,Wobbegong shark
4284,Aug,1955,Provoked,USA,"Zuma Beach, Santa Monica, Los Angeles County",Surfing,M,25,"Surfer grabbed shark, which turned & bit him a...",N,,blue shark
268,Jun,2016,Unprovoked,EGYPT,Ain Sokhna,Swimming,M,23,"Leg severely bitten, surgically amputated",N,Morning,Mako shark


In [16]:
# We can capitalize the shark types
sharks_nona_2["Species"] = sharks_nona_2["Species"].str.capitalize()

In [17]:
check = sharks_nona_2["Species"].value_counts()

print(check.to_string())

White shark           475
Tiger shark           186
Bull shark            135
Nurse shark            64
Blacktip shark         54
Whaler shark           52
Reef shark             43
Small shark            40
Spinner shark          40
Raggedtooth shark      35
Wobbegong shark        30
Mako shark             25
Hammerhead shark       23
Zambesi shark          22
Blue shark             21
Lemon shark            21
Whitetip shark         14
Sand shark             13
Sandtiger shark        13
Dusky shark            10
Sevengill shark         7
Galapagos shark         6
Copper shark            5
Porbeagle shark         5
Sandbar shark           5
Juvenile shark          5
Gill shark              5
Angel shark             4
Silky shark             4
Colored shark           4
Zambezi shark           3
Carpet shark            3
Brown shark             2
Foot shark              1
Banjo shark             1
Hooked shark            1
Unidentified shark      1
Larger shark            1
Another shar

In [18]:
sharks_nona_2.sample(3)

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
1538,Oct,2005,Provoked,USA,"Ponce Inlet, Volusia County",Wading,M,15,Minor cuts to dorsum & sole of left foot when ...,N,17h55,
1023,Jun,2010,Unprovoked,AUSTRALIA,"Conspicuous Beach, near Walpole",Surfing,M,40,Severe laceration to right knee,N,12h00,
1592,Apr,2005,Provoked,AUSTRALIA,Bermagui,Fishing,M,25,Laceration on left thigh PROVOKED INCIDENT,N,23h00,Mako shark


## Cleaning the Age column

In [19]:
list(sharks_nona_2["Age"])

['57',
 '11',
 '48',
 '18',
 '52',
 '15',
 '12',
 '32',
 '10',
 '21',
 nan,
 '30',
 nan,
 '60',
 nan,
 '32',
 '33',
 '29',
 '54',
 '37',
 nan,
 '56',
 '12',
 '19',
 '25',
 '10',
 '69',
 '18',
 '38',
 '55',
 '34',
 '35',
 nan,
 '46',
 '45',
 '14',
 '18',
 '40s',
 nan,
 '28',
 '20',
 '54',
 '35',
 '24',
 '26',
 '49',
 '25',
 '14',
 '25',
 '22',
 '7',
 '31',
 '17',
 '60',
 '15',
 nan,
 nan,
 '54',
 '40',
 nan,
 nan,
 '33',
 nan,
 '13',
 '28',
 '18',
 '35',
 '42',
 '17',
 '37',
 '3',
 '11',
 '69',
 '13',
 '20',
 '34',
 nan,
 '15',
 '10',
 '37',
 nan,
 '40',
 '40',
 '50',
 nan,
 nan,
 '46',
 '32',
 '13',
 '14',
 '34',
 '82',
 '48',
 nan,
 '19',
 '30',
 '32',
 '20',
 nan,
 '41',
 '29',
 '33',
 '18',
 '14',
 '33',
 '28',
 '20s',
 '40',
 '17',
 '28',
 '17',
 '35',
 '10',
 '21',
 '51',
 '39',
 '17',
 '58',
 '30',
 nan,
 '25',
 nan,
 '26',
 '22',
 nan,
 '21',
 '55',
 '57',
 '48',
 '47',
 '20',
 '16',
 nan,
 '55',
 '61',
 '19',
 '20',
 '65',
 '73',
 '58',
 '41',
 '42',
 '36',
 '36',
 '66',
 '25',

In [20]:
sharks_nona_2["Age"] = sharks_nona_2["Age"].str.extract(r"^(\d{1,2})")
list(sharks_nona_2["Age"])

['57',
 '11',
 '48',
 '18',
 '52',
 '15',
 '12',
 '32',
 '10',
 '21',
 nan,
 '30',
 nan,
 '60',
 nan,
 '32',
 '33',
 '29',
 '54',
 '37',
 nan,
 '56',
 '12',
 '19',
 '25',
 '10',
 '69',
 '18',
 '38',
 '55',
 '34',
 '35',
 nan,
 '46',
 '45',
 '14',
 '18',
 '40',
 nan,
 '28',
 '20',
 '54',
 '35',
 '24',
 '26',
 '49',
 '25',
 '14',
 '25',
 '22',
 '7',
 '31',
 '17',
 '60',
 '15',
 nan,
 nan,
 '54',
 '40',
 nan,
 nan,
 '33',
 nan,
 '13',
 '28',
 '18',
 '35',
 '42',
 '17',
 '37',
 '3',
 '11',
 '69',
 '13',
 '20',
 '34',
 nan,
 '15',
 '10',
 '37',
 nan,
 '40',
 '40',
 '50',
 nan,
 nan,
 '46',
 '32',
 '13',
 '14',
 '34',
 '82',
 '48',
 nan,
 '19',
 '30',
 '32',
 '20',
 nan,
 '41',
 '29',
 '33',
 '18',
 '14',
 '33',
 '28',
 '20',
 '40',
 '17',
 '28',
 '17',
 '35',
 '10',
 '21',
 '51',
 '39',
 '17',
 '58',
 '30',
 nan,
 '25',
 nan,
 '26',
 '22',
 nan,
 '21',
 '55',
 '57',
 '48',
 '47',
 '20',
 '16',
 nan,
 '55',
 '61',
 '19',
 '20',
 '65',
 '73',
 '58',
 '41',
 '42',
 '36',
 '36',
 '66',
 '25',
 

## Cleaning the Country column

In [21]:
check2 = sharks_nona_2["Country"].value_counts()
print(check2.to_string())

USA                                      1359
AUSTRALIA                                 626
SOUTH AFRICA                              342
BAHAMAS                                    47
NEW ZEALAND                                44
BRAZIL                                     42
REUNION                                    35
MEXICO                                     26
PAPUA NEW GUINEA                           22
MOZAMBIQUE                                 20
NEW CALEDONIA                              19
EGYPT                                      16
FIJI                                       15
SPAIN                                      14
ITALY                                      13
JAPAN                                      12
FRENCH POLYNESIA                           11
CROATIA                                     9
CUBA                                        7
PHILIPPINES                                 6
TAIWAN                                      6
PANAMA                            

In [22]:
sharks_nona_2["Country"] = sharks_nona_2["Country"].str.title().replace("Islandas", "Islands").replace("St Helena, British Overseas Territory", "St Helena").replace("Usa", "USA").replace("United Arab Emirates (Uae)", "UAE").replace(" Tonga", "Tonga")

check3 = sharks_nona_2["Country"].value_counts()
print(check3.to_string())

USA                               1359
Australia                          626
South Africa                       342
Bahamas                             47
New Zealand                         44
Brazil                              42
Reunion                             35
Mexico                              26
Papua New Guinea                    22
Mozambique                          20
New Caledonia                       19
Egypt                               16
Fiji                                15
Spain                               14
Italy                               13
Japan                               12
French Polynesia                    11
Croatia                              9
Cuba                                 7
Panama                               6
Ecuador                              6
Tonga                                6
Taiwan                               6
Philippines                          6
Chile                                5
Indonesia                

## Cleaning the Sex column

In [23]:
display(sharks_nona_2["Sex"].unique())

sharks_nona_2["Sex"] = sharks_nona_2["Sex"].str.replace("M ", "M")

display(sharks_nona_2["Sex"].unique())

array(['F', 'M', 'M ', 'lli', nan], dtype=object)

array(['F', 'M', 'lli', nan], dtype=object)

## Cleaning the Fatality column

In [24]:
display(sharks_nona_2["Fatal (Y/N)"].unique())

sharks_nona_2["Fatal (Y/N)"] = sharks_nona_2["Fatal (Y/N)"].str.replace(" N", "N")
sharks_nona_2["Fatal (Y/N)"] = sharks_nona_2["Fatal (Y/N)"].str.extract(r"(N$|Y$)")

display(sharks_nona_2["Fatal (Y/N)"].unique())

array(['N', 'Y', nan, 'M', '2017', 'UNKNOWN', ' N'], dtype=object)

array(['N', 'Y', nan], dtype=object)

## Cleaning the Type column

In [25]:
display(sharks_nona_2["Type"].unique())
display(sharks_nona_2["Type"].value_counts())

sharks_nona_2["Type"] = sharks_nona_2["Type"].str.replace("Boatinging", "Boating").replace("Boat", "Boating").replace("Sea Disaster", "Unprovoked")
sharks_nona_2["Type"] = sharks_nona_2["Type"].str.extract(r"(Unprovoked|Provoked)")
display(sharks_nona_2["Type"].value_counts())

array(['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable',
       'Boat', 'Sea Disaster'], dtype=object)

Unprovoked      2462
Provoked         239
Invalid           95
Boating           45
Sea Disaster      21
Boat              10
Questionable       2
Name: Type, dtype: int64

Unprovoked    2483
Provoked       239
Name: Type, dtype: int64

## Renaming

In [26]:
sharks_clean = sharks_nona_2

## Resetting indexes

In [27]:
sharks_clean.reset_index()

Unnamed: 0,index,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,0,Jun,2018,,USA,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,1,Jun,2018,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,2,Jun,2018,,USA,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,
3,6,Jun,2018,Unprovoked,Brazil,"Piedade Beach, Recife",Swimming,M,18,FATAL,Y,Late afternoon,Tiger shark
4,7,May,2018,Unprovoked,USA,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,Lemon shark
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2869,5991,Mar,1858,Unprovoked,Australia,Hobson Bay,Bathing,M,22,FATAL,Y,15h00,
2870,6021,Feb,1852,Unprovoked,South Africa,Danger Point,Wreck of the steamship Birkenhead,M,,FATAL. All of the women & children on board su...,Y,01h50,White shark
2871,6077,Jan,1837,Unprovoked,Australia,Macleay River,Washing his feet,M,12,"FATAL Injured by shark, died of tetanus",Y,Evening,
2872,6114,May,1817,Unprovoked,Sri Lanka,Colombo,Swimming,M,22,FATAL,Y,Evening,


# Reviewing the final dataset

In [28]:
display(sharks_clean.head())

display_errors(sharks_clean)

sharks_clean.shape

Unnamed: 0,Month,Year,Type,Country,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,Jun,2018,,USA,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,Jun,2018,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,Jun,2018,,USA,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,
6,Jun,2018,Unprovoked,Brazil,"Piedade Beach, Recife",Swimming,M,18,FATAL,Y,Late afternoon,Tiger shark
7,May,2018,Unprovoked,USA,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,Lemon shark


Unnamed: 0,0
Month,0
Year,0
Type,152
Country,0
Location,12
Activity,16
Sex,8
Age,244
Injury,0
Fatal (Y/N),80

Unnamed: 0,0
Month,0
Year,0
Type,152
Country,0
Location,12
Activity,16
Sex,8
Age,244
Injury,0
Fatal (Y/N),80

Unnamed: 0,0
False,2874


(2874, 12)

By doing all these extractions, the non-extracted values remain as NaN. We can confirm it because the rows haven't changed. This is what I wanted, because deleting the non-extracted may lead to deleting information in other cells containing valuable information.

# Cleaned dataset

At this point, I consider the table ready to be analyzed properly be taking the right columns. Data extraction & visualization will be taken place in a separated notebook

In [29]:
sharks_clean.to_csv("../data/attacks_cleaned.csv", index = False)