In [1042]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [1043]:
# Read the raw attacks CSV into the working DataFrame
attacks_raw = pd.read_csv(
    "../data/attacks.csv",
    encoding="unicode_escape",
)

In [1044]:
# Normalise column headers: remove the trailing blank from 'Sex ' and 'Species ',
# and shorten 'Fatal (Y/N)' to 'Fatal'. errors='raise' makes a missing header
# fail loudly instead of being skipped.
column_renames = {
    'Sex ': 'Sex',
    'Species ': 'Species',
    'Fatal (Y/N)': 'Fatal',
}
attacks_raw = attacks_raw.rename(columns=column_renames, errors='raise')

In [1045]:
# Remove columns that carry no information for this study
columns_to_drop = [
    "pdf",
    "Investigator or Source",
    "href formula",
    "href",
    "Case Number.1",
    "Case Number.2",
    "original order",
    "Unnamed: 22",
    "Unnamed: 23",
]
attacks_raw = attacks_raw.drop(columns=columns_to_drop)

In [1046]:
# Keep only rows that have at least 6 non-missing values: dropna's `thresh`
# counts the non-NA cells a row needs in order to be kept. Rows that are
# entirely NA have zero such cells, so this single call removes them as well
# and no separate how="all" pass is needed.
# NOTE(review): the previous comment spoke of "at least three values as NA",
# which is not what thresh=6 does -- confirm which rule is intended.
attacks_raw = attacks_raw.dropna(thresh=6)

In [1047]:
# Keep only rows whose Year is 1900 or later. A comparison against NaN is
# False, so rows without a Year are discarded by the same mask.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise pandas' SettingWithCopyWarning.
attacks_raw = attacks_raw[attacks_raw['Year'] >= 1900].copy()

In [1048]:
# Upper-case every Country value so differently-cased spellings compare equal
country_upper = attacks_raw['Country'].str.upper()
attacks_raw = attacks_raw.assign(Country=country_upper)

In [1049]:
# Build a proper `datetime` column from the free-text Date column, keep only
# rows where a date could be found, and drop the other date-related columns.
#
# The pattern is a raw string so that \d and \w reach the regex engine
# unchanged (in a normal string they are invalid escape sequences).
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame.
date_text = attacks_raw['Date'].str.extract(r'(\d{2}-\w{3}-\d{4})', expand=False)
has_date = date_text.notna()

# .copy() detaches the filtered frame so adding a column below does not raise
# SettingWithCopyWarning.
attacks_raw = attacks_raw[has_date].copy()

# to_datetime raises if an extracted token is not a valid %d-%b-%Y date.
# The new column is placed first in the frame.
attacks_raw.insert(0, "datetime", pd.to_datetime(date_text[has_date], format='%d-%b-%Y'))

# These columns are dropped now that `datetime` exists.
date_columns = ["Case Number", "Date", "Year", "Time"]
attacks_raw = attacks_raw.drop(columns=date_columns)

In [1050]:
# Refactor Type column: fold the 'Boatomg' / 'Boating' variants into 'Boat'
# and remove rows marked 'Invalid'.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `attacks_raw.Type`: an in-place call on a
# selected column is chained assignment, which warns on recent pandas and does
# not modify the frame when Copy-on-Write is enabled.
attacks_raw['Type'] = attacks_raw['Type'].replace({'Boatomg': 'Boat', 'Boating': 'Boat'})

# NaN != 'Invalid' is True, so rows with a missing Type are kept.
# .copy() keeps later column assignments free of SettingWithCopyWarning.
attacks_raw = attacks_raw[attacks_raw['Type'] != 'Invalid'].copy()

In [1051]:
# Refactor Sex column to: M(male), F(female), nan(other).
# Surrounding whitespace is stripped first; then 'lli' is mapped to 'M' and the
# placeholder values 'N' and '.' become NaN. Series.replace matches whole
# values here, so '.' is the literal string, not a regex wildcard.
#
# The cleaned Series is assigned back to the column; replace(..., inplace=True)
# on `attacks_raw.Sex` is chained assignment and is not guaranteed to update
# the frame (it is a no-op under pandas Copy-on-Write).
sex_clean = attacks_raw['Sex'].str.strip()
attacks_raw['Sex'] = sex_clean.replace({'lli': 'M', 'N': np.nan, '.': np.nan})

In [1052]:
# Refactor Fatal column to: Y(yes), N(no), nan(unknown).
# Values are stripped and upper-cased first so that padded or lower-case
# variants such as ' N' collapse onto 'N' / 'Y' instead of being thrown away
# (the previous version turned ' N' into NaN, losing a valid "no").
# 'UNKNOWN', '2017' and 'M' are then treated as missing.
# NOTE(review): 'M' may be a typo for 'N'; it is kept as NaN here, as before.
#
# The cleaned Series is assigned back to the column; replace(..., inplace=True)
# on `attacks_raw.Fatal` is chained assignment and does not update the frame
# under pandas Copy-on-Write.
fatal_clean = attacks_raw['Fatal'].str.strip().str.upper()
attacks_raw['Fatal'] = fatal_clean.replace(['UNKNOWN', '2017', 'M'], np.nan)

In [1053]:
# Cleaning Age: trim surrounding whitespace and report how many distinct
# values (NaN counted once) exist before and after, then list what remains.
age_values_before = attacks_raw["Age"].unique()
print("Before: ", len(age_values_before))

attacks_raw["Age"] = attacks_raw["Age"].str.strip()

age_values_after = list(attacks_raw["Age"].unique())
print("After: ", len(age_values_after))
print(age_values_after)


Before:  138
After:  130
['57', '11', nan, '18', '52', '15', '12', '32', '10', '30', '60', '33', '29', '54', '34', '41', '37', '19', '25', '38', '55', '35', '45', '40s', '28', '20', '24', '26', '49', '14', '22', '7', '31', '17', '40', '13', '42', '3', '69', '50', '46', '16', '82', '48', '20s', '21', '51', '39', '58', 'Teen', '47', '61', '65', '73', '36', '66', '43', '60s', '9', '72', '59', '6', '64', '23', '71', '44', '27', '62', '68', '63', '70', '18 months', '53', '30s', '50s', '8', 'teen', '77', '74', '56', '28 & 26', '5', '86', '18 or 20', '12 or 13', '46 & 34', '28, 23 & 30', 'Teens', '36 & 26', '84', '', '30 or 36', '6½', '21 & ?', '33 or 37', 'mid-30s', '7      &    31', '20?', '32 & 30', '87', 'Elderly', '75', '21 or 26', '>50', '18 to 22', 'adult', '9 & 12', '(adult)', '33 & 37', '25 or 28', '30 & 32', '50 & 30', '17 & 35', 'X', '13 or 18', '33 & 26', 'MAKE LINE GREEN', '81', '"young"', '17 & 16', 'F', 'Both 11', '9 or 10', 'young', '36 & 23', '78', 'A.M.', '?    &   14', '10 

In [1054]:
# Cleaning Activity: lower-case and strip the text, collapse every value that
# contains a known stem into one label, then keep the 20 most frequent labels
# and file everything else (including missing values) under "other".
activity = attacks_raw['Activity'].str.lower().str.strip()

# (stem, label) pairs, applied in this order. A value containing the stem is
# replaced as a whole by the label.
# NOTE(review): stems are matched anywhere in the text, so e.g. 'row' also
# matches inside 'thrown' and 'board' inside 'overboard' -- confirm intended.
activities_to_clean = [
    ('surf', 'surfing'),
    ('board', 'surfing'),
    ('fish', 'fishing'),
    ('swim', 'swimming'),
    ('diving', 'diving'),
    ('boat', 'boat'),
    ('float', 'floating'),
    ('bath', 'bathing'),
    ('snork', 'snorkeling'),
    ('jump', 'jumping'),
    ('sail', 'boat'),
    ('row', 'rowing sports'),
    ('kayak', 'rowing sports'),
    ('canoeing', 'rowing sports'),
    ('padd', 'rowing sports'),
    ('stand', 'standing'),
    ('drift', 'drifting'),
    ('tread', 'treading'),
    ('disaster', 'shipwreck'),
    ('feed', 'feeding sharks'),
    ('sank', 'shipwreck'),
    ('sit', 'sitting'),
    ('walk', 'walking'),
    ('sink', 'shipwreck'),
    ('film', 'filming'),
]
for match, new in activities_to_clean:
    activity = activity.str.replace(f'(^.*{match}.*$)', new, regex=True)

popular_activities = activity.value_counts().head(20).index.tolist()
attacks_raw['Activity'] = activity.where(activity.isin(popular_activities), "other")


In [1056]:
# Cleaning Species: collapse the free-text Species column into short labels.
#
# Each rule is a (substring, label) pair. Rules are tested in order against the
# lower-cased, stripped original text, and the FIRST rule whose substring
# occurs decides the label. Already-labelled rows are never re-examined, so a
# label such as 'grey reef' can no longer be overwritten by the later, more
# general 'reef' rule (which is what happened when every rule was applied to
# the output of the previous one). For the same reason 'caribbean reef' is
# listed before 'reef'; placed after it, the rule could never fire.
#
# Text that matches no rule is left as it is; missing values become 'unkown'.
# NOTE(review): 'unkown' is a misspelling of 'unknown'. It is kept unchanged
# because other cells may filter on this exact label -- rename everywhere at once.
# NOTE(review): 'zambesi' and 'zambezi' look like two spellings of one name
# but produce two different labels -- confirm whether they should be merged.
species_text = attacks_raw['Species'].str.lower().str.strip()

species_to_clean = [
    ('white', 'white'), ('tiger', 'tiger'), ('lemon', 'lemon'), ('bull', 'bull'),
    ('grey reef', 'grey reef'), ('caribbean reef', 'caribbean reef'), ('reef', 'reef'),
    ('wobbegong', 'wobbegong'), ('black', 'blacktip'), ('galapagos', 'galapagos'),
    ('nurse', 'nurse'), ('catshark', 'cat'), ('cookie', 'cookiecutter'),
    ('spinner', 'spinner'), ('blue', 'blue'), ('smooth hound', 'smooth hound'),
    ('sevengill', 'sevengill'), ('seven-gill', 'sevengill'), ('angel', 'angel'),
    ('copper', 'copper'), ('dogfish', 'dogfish'), ('mako', 'mako'),
    ('bronze whale', 'copper'), ('hammerhead', 'hammerhead'),
    ('raggedtooth', 'raggedtooth'), ('goblin', 'goblin'), ('silky', 'silky'),
    ('sandbar', 'sandbar'), ('sand shark', 'raggedtooth'), ('porbeagle', 'porbeagle'),
    ('7-gill', 'sevengill'), ('salmon', 'salmon'), ('zambesi', 'zambesi'),
    ('thresher', 'thresher'), ('spurdog', 'spurdog'), ('dusky', 'dusky'),
    ('basking', 'basking'), ('whale', 'whale'), ('soupfin', 'soupfin'),
    ('zambezi', 'zambezi'), ('carpet', 'carpet'), ('bonita', 'bonita'),
    ('leopard', 'leopard'), ('shovelnose', 'shovelnose'), ('leucas', 'bull'),
    # Catch-all rules: anything that still mentions a shark, or is marked
    # unidentified, has no usable species.
    ('shark', 'unkown'), ('unident', 'unkown'),
]

# Labels start out empty (NaN) and are filled rule by rule.
species_label = pd.Series(np.nan, index=species_text.index, dtype=object)
for match, new in species_to_clean:
    # Plain substring test (regex=False); na=False treats missing text as "no match".
    hits = species_label.isna() & species_text.str.contains(match, regex=False, na=False)
    species_label.loc[hits] = new

# Unlabelled rows fall back to their original text, then to 'unkown' if that is missing too.
attacks_raw['Species'] = species_label.fillna(species_text).fillna('unkown')