In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import time
import datetime

In [3]:
path = 'attacks.csv'

In [4]:
# Reading attacks dataset. Source: https://stackoverflow.com/questions/42339876/error-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-position-0-in
attacks_raw_df = pd.read_csv(path, encoding='cp1252')


In [5]:
attacks_raw_df.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Name,Sex,Age,Injury,Fatal_Y_N,Time,Species,Investigator_ or_ Source,pdf,href
0,USA,2018.06.25,6/25/18,2018.0,Boating,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,USA,2018.06.09,6/9/18,2018.0,Invalid,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,MEXICO,2018.06.04,6/4/18,2018.0,Provoked,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...


In [6]:
# Drop irrelevant columns
attacks_df = attacks_raw_df.drop(
    ['Name', 'Investigator_ or_ Source', 'pdf', 'href', 'Time'], axis=1)

In [7]:
attacks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6302 entries, 0 to 6301
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      6252 non-null   object 
 1   Case_Number  6302 non-null   object 
 2   Date         6302 non-null   object 
 3   Year         6300 non-null   float64
 4   Type         6298 non-null   object 
 5   Area         5847 non-null   object 
 6   Location     5762 non-null   object 
 7   Activity     5758 non-null   object 
 8   Sex          5771 non-null   object 
 9   Age          3471 non-null   object 
 10  Injury       6274 non-null   object 
 11  Fatal_Y_N    5766 non-null   object 
 12  Species      3464 non-null   object 
dtypes: float64(1), object(12)
memory usage: 640.2+ KB


In [8]:
# Keep only unprovoked attacks
unprovoked_attacks_df = attacks_df[attacks_df['Type'] == "Unprovoked"] 

In [9]:
unprovoked_attacks_df.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark
5,AUSTRALIA,2018.06.03.b,6/3/18,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,
6,BRAZIL,2018.06.03.a,6/3/18,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Tiger shark
7,USA,2018.05.27,5/27/18,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'"


In [10]:
# Add Month column to extract the month of the attack
unprovoked_attacks_df['Month'] = ''

In [11]:
unprovoked_attacks_df.head(5)

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,,
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,
5,AUSTRALIA,2018.06.03.b,6/3/18,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,
6,BRAZIL,2018.06.03.a,6/3/18,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Tiger shark,
7,USA,2018.05.27,5/27/18,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'",


## Populating Month column

In [12]:
# Drop rows with a year less than 2000
unprovoked_attacks_2000 = unprovoked_attacks_df[unprovoked_attacks_df["Year"]>=2000]
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,,
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,
5,AUSTRALIA,2018.06.03.b,6/3/18,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,
6,BRAZIL,2018.06.03.a,6/3/18,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Tiger shark,
7,USA,2018.05.27,5/27/18,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'",


In [13]:
unprovoked_attacks_2000['Month'] = unprovoked_attacks_2000['Case_Number'].str[5:7]

In [14]:
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,,6
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,6
5,AUSTRALIA,2018.06.03.b,6/3/18,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,6
6,BRAZIL,2018.06.03.a,6/3/18,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Tiger shark,6
7,USA,2018.05.27,5/27/18,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'",5


In [15]:
# Convert Month column to integer
unprovoked_attacks_2000['Month'] = unprovoked_attacks_2000['Month'].astype('int')

In [16]:
# Convert Year column to integer
unprovoked_attacks_2000.Year = unprovoked_attacks_2000.Year.round(decimals=0)
unprovoked_attacks_2000['Year'].astype('int')

1       2018
3       2018
5       2018
6       2018
7       2018
        ... 
2071    2000
2072    2000
2074    2000
2075    2000
2077    2000
Name: Year, Length: 1617, dtype: int64

In [17]:
unprovoked_attacks_2000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617 entries, 1 to 2077
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      1616 non-null   object 
 1   Case_Number  1617 non-null   object 
 2   Date         1617 non-null   object 
 3   Year         1617 non-null   float64
 4   Type         1617 non-null   object 
 5   Area         1565 non-null   object 
 6   Location     1565 non-null   object 
 7   Activity     1557 non-null   object 
 8   Sex          1617 non-null   object 
 9   Age          1299 non-null   object 
 10  Injury       1616 non-null   object 
 11  Fatal_Y_N    1617 non-null   object 
 12  Species      980 non-null    object 
 13  Month        1617 non-null   int64  
dtypes: float64(1), int64(1), object(12)
memory usage: 189.5+ KB


In [18]:
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
1,USA,2018.06.18,6/18/18,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,,6
3,AUSTRALIA,2018.06.08,6/8/18,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,2 m shark,6
5,AUSTRALIA,2018.06.03.b,6/3/18,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,6
6,BRAZIL,2018.06.03.a,6/3/18,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Tiger shark,6
7,USA,2018.05.27,5/27/18,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'",5


In [19]:
# Drop Case_Number, Date columns
unprovoked_attacks_2000 = unprovoked_attacks_2000.drop(['Case_Number', "Date", 'Type'], axis=1)

In [20]:
unprovoked_attacks_2000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617 entries, 1 to 2077
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    1616 non-null   object 
 1   Year       1617 non-null   float64
 2   Area       1565 non-null   object 
 3   Location   1565 non-null   object 
 4   Activity   1557 non-null   object 
 5   Sex        1617 non-null   object 
 6   Age        1299 non-null   object 
 7   Injury     1616 non-null   object 
 8   Fatal_Y_N  1617 non-null   object 
 9   Species    980 non-null    object 
 10  Month      1617 non-null   int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 151.6+ KB


In [21]:
# Fill 'Age' NaN with 0
unprovoked_attacks_2000['Age']= unprovoked_attacks_2000['Age'].fillna(0)

In [22]:
unprovoked_attacks_2000

Unnamed: 0,Country,Year,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
1,USA,2018.0,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,,6
3,AUSTRALIA,2018.0,New South Wales,Arrawarra Headland,Surfing,M,0,Minor injury to lower leg,N,2 m shark,6
5,AUSTRALIA,2018.0,New South Wales,"Flat Rock, Ballina",Kite surfing,M,0,"No injury, board bitten",N,,6
6,BRAZIL,2018.0,Pernambuco,"Piedade Beach, Recife",Swimming,M,18,FATAL,Y,Tiger shark,6
7,USA,2018.0,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52,Minor injury to foot. PROVOKED INCIDENT,N,"Lemon shark, 3'",5
...,...,...,...,...,...,...,...,...,...,...,...
2071,USA,2000.0,Florida,"Riviera Beach, Palm Beach County",,M,27,Right calf bitten,N,,2
2072,SOUTH AFRICA,2000.0,Western Cape Province,Struis Bay,Body surfing,M,0,Foot bitten,N,"White shark, 2.5 m",2
2074,NEW ZEALAND,2000.0,South Island,Oreti Beach (reported as the 4th person bitten...,Surfing,M,12,"No injury, wetsuit punctured",N,,2
2075,AUSTRALIA,2000.0,South Australia,"Point Sinclair, Cactus Beach near Penong",Surfing,M,26,Hand bitten,N,3 m [10'] shark,2


In [23]:
# Fill Nan with string 'other'
unprovoked_attacks_2000[['Area', 'Country', 'Location', 'Activity', 'Injury', 'Species']] = unprovoked_attacks_2000[['Area', 'Country', 'Location', 'Activity', 'Injury', 'Species']].fillna('other')


In [24]:
# Fill Nan with string 'U'
unprovoked_attacks_2000[['Sex']] = unprovoked_attacks_2000[['Sex']].fillna('U')


In [25]:
unprovoked_attacks_2000.tail(50)

Unnamed: 0,Country,Year,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Species,Month
2016,REUNION,2000.0,Saint-Pierre,Pic du Diable,Surfing,M,27,Left arm bitten,N,"Tiger shark, 3 m [10']",9
2017,TANZANIA,2000.0,other,"Coco Beach, Dar-es-Salaam",Swimming,UNKNOWN,0,FATAL,Y,Thought to involve a Zambesi shark,9
2018,USA,2000.0,Florida,"New Smyrna Beach, Volusia County",Swimming,M,47,Punctures & lacerations on right foot,N,A 2' shark was seen in the area by witnesses,8
2019,USA,2000.0,Florida,"Boca Ciega Bay, Tampa, Pinellas County",Jumped into the water,M,69,FATAL,Y,"Thought to involve a 2.7 m [9'], 400-lb bull s...",8
2021,USA,2000.0,North Carolina,"Bouges Bank, Emerald Isle, Carteret County",Swimming out to porpoises,M,0,"Severe gash to left hand above wrist, almost s...",N,other,8
2022,USA,2000.0,Hawaii,"Kanaha Beach, Maui","Windsurfing, but sitting on his board",M,53,Left calf lacerated,N,"Tiger shark, 3.7 m to 4.5 m [12' to 14'9""]",8
2023,USA,2000.0,Florida,"South Jacksonville Beach, Duval County",Surfing / Wading,M,27,Minor lacerations to the dorsum of the right foot,N,juvenile shark,8
2024,USA,2000.0,Florida,"St. Augustine, St. Johns County",Standing,F,44,Severely bitten on lower leg,N,"Blacktip shark, 2.4 m to 3 m [8' to 10']",8
2027,TANZANIA,2000.0,other,"Coco Beach, Dar-es-Salaam",Swimming,UNKNOWN,0,FATAL,Y,Thought to involve a Zambesi shark,8
2028,USA,2000.0,Florida,"New Smyrna Beach, Volusia County",other,M,5,Minor laceration on left leg,N,other,7


In [26]:
# count fatal attacks
unprovoked_attacks_2000[unprovoked_attacks_2000['Fatal_Y_N']=='Y'].count()

Country      156
Year         156
Area         156
Location     156
Activity     156
Sex          156
Age          156
Injury       156
Fatal_Y_N    156
Species      156
Month        156
dtype: int64

## Countries Table


In [27]:
#create "countries" df
countries_df = unprovoked_attacks_2000[['Country', 'Month']]

In [28]:
countries_df.head()

Unnamed: 0,Country,Month
1,USA,6
3,AUSTRALIA,6
5,AUSTRALIA,6
6,BRAZIL,6
7,USA,5


## Export dataframes to csv

In [29]:
# Export 'unprovoked_attacks_2000' as csv
unprovoked_attacks_2000 = unprovoked_attacks_2000.to_csv('Resources/unprovoked_attacks_2000.csv', index = False)

# Export 'countries_df' as csv
countries_df = countries_df.to_csv('Resources/countries_df.csv', index = False)