In [1]:
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
path = 'attacks.csv'

In [3]:
# Reading attacks dataset. Source: https://stackoverflow.com/questions/42339876/error-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-position-0-in
attacks_raw_df = pd.read_csv(path, encoding='cp1252')


In [4]:
attacks_raw_df.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Name,Sex,Age,Injury,Fatal_Y_N,Time,Species,Investigator_ or_ Source,pdf,href
0,USA,2018.06.25,6/25/2018,2018.0,Boating,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,USA,2018.06.18,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,USA,2018.06.09,6/9/2018,2018.0,Invalid,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,AUSTRALIA,2018.06.08,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,MEXICO,2018.06.04,6/4/2018,2018.0,Provoked,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...


In [5]:
# Drop irrelevant columns
attacks_df = attacks_raw_df.drop(
    ['Name', 'Investigator_ or_ Source', 'pdf', 'href'], axis=1)

In [6]:
attacks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6326 entries, 0 to 6325
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      6252 non-null   object 
 1   Case_Number  6302 non-null   object 
 2   Date         6302 non-null   object 
 3   Year         6300 non-null   float64
 4   Type         6298 non-null   object 
 5   Area         5847 non-null   object 
 6   Location     5762 non-null   object 
 7   Activity     5758 non-null   object 
 8   Sex          5771 non-null   object 
 9   Age          3471 non-null   object 
 10  Injury       6274 non-null   object 
 11  Fatal_Y_N    5766 non-null   object 
 12  Time         2948 non-null   object 
 13  Species      3464 non-null   object 
dtypes: float64(1), object(13)
memory usage: 692.0+ KB


In [7]:
# Keep only unprovoked attacks
unprovoked_attacks_df = attacks_df[attacks_df['Type'] == "Unprovoked"] 

In [8]:
unprovoked_attacks_df.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species
1,USA,2018.06.18,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,
3,AUSTRALIA,2018.06.08,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark
5,AUSTRALIA,2018.06.03.b,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,
6,BRAZIL,2018.06.03.a,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Late afternoon,Tiger shark
7,USA,2018.05.27,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'"


In [9]:
# Add Month column to extract the month of the attack
unprovoked_attacks_df['Month'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
unprovoked_attacks_df.head(5)

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species,Month
1,USA,2018.06.18,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,
3,AUSTRALIA,2018.06.08,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,
5,AUSTRALIA,2018.06.03.b,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,,
6,BRAZIL,2018.06.03.a,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Late afternoon,Tiger shark,
7,USA,2018.05.27,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'",


## Cleaning 'Date' column

In [11]:
# Drop rows with a year less than 2000
unprovoked_attacks_2000 = unprovoked_attacks_df[unprovoked_attacks_df["Year"]>=2000]
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species,Month
1,USA,2018.06.18,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,
3,AUSTRALIA,2018.06.08,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,
5,AUSTRALIA,2018.06.03.b,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,,
6,BRAZIL,2018.06.03.a,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Late afternoon,Tiger shark,
7,USA,2018.05.27,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'",


In [12]:
unprovoked_attacks_2000['Month'] = unprovoked_attacks_2000['Case_Number'].str[5:7]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Case_Number,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species,Month
1,USA,2018.06.18,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,6
3,AUSTRALIA,2018.06.08,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,6
5,AUSTRALIA,2018.06.03.b,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,,6
6,BRAZIL,2018.06.03.a,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Late afternoon,Tiger shark,6
7,USA,2018.05.27,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'",5


In [14]:
# Convert Month column to integer
unprovoked_attacks_2000['Month'].astype(int)

1       6
3       6
5       6
6       6
7       5
       ..
2071    2
2072    2
2074    2
2075    2
2077    1
Name: Month, Length: 1617, dtype: int64

In [15]:
# Convert Year column to integer
# unprovoked_attacks_2000.Year = unprovoked_attacks_2000.Year.round(decimals=0)
# unprovoked_attacks_2000['Year'].astype(int)

In [16]:
# unprovoked_attacks_2000.info()

In [17]:
# Drop Case Number column
unprovoked_attacks_2000 = unprovoked_attacks_2000.drop(['Case_Number'], axis=1)

In [18]:
unprovoked_attacks_2000.head()

Unnamed: 0,Country,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species,Month
1,USA,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,6
3,AUSTRALIA,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,,Minor injury to lower leg,N,,2 m shark,6
5,AUSTRALIA,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,,"No injury, board bitten",N,,,6
6,BRAZIL,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18.0,FATAL,Y,Late afternoon,Tiger shark,6
7,USA,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'",5


In [19]:
unprovoked_attacks_2000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1617 entries, 1 to 2077
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    1616 non-null   object 
 1   Date       1617 non-null   object 
 2   Year       1617 non-null   float64
 3   Type       1617 non-null   object 
 4   Area       1565 non-null   object 
 5   Location   1565 non-null   object 
 6   Activity   1557 non-null   object 
 7   Sex        1617 non-null   object 
 8   Age        1299 non-null   object 
 9   Injury     1616 non-null   object 
 10  Fatal_Y_N  1617 non-null   object 
 11  Time       1251 non-null   object 
 12  Species    980 non-null    object 
 13  Month      1617 non-null   object 
dtypes: float64(1), object(13)
memory usage: 189.5+ KB


In [20]:
# Fill 'Age' NaN with 0
unprovoked_attacks_2000['Age']= unprovoked_attacks_2000['Age'].fillna(0)

In [21]:
unprovoked_attacks_2000

Unnamed: 0,Country,Date,Year,Type,Area,Location,Activity,Sex,Age,Injury,Fatal_Y_N,Time,Species,Month
1,USA,6/18/2018,2018.0,Unprovoked,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,,06
3,AUSTRALIA,6/8/2018,2018.0,Unprovoked,New South Wales,Arrawarra Headland,Surfing,M,0,Minor injury to lower leg,N,,2 m shark,06
5,AUSTRALIA,6/3/2018,2018.0,Unprovoked,New South Wales,"Flat Rock, Ballina",Kite surfing,M,0,"No injury, board bitten",N,,,06
6,BRAZIL,6/3/2018,2018.0,Unprovoked,Pernambuco,"Piedade Beach, Recife",Swimming,M,18,FATAL,Y,Late afternoon,Tiger shark,06
7,USA,5/27/2018,2018.0,Unprovoked,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,M,52,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'",05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2071,USA,2/21/2000,2000.0,Unprovoked,Florida,"Riviera Beach, Palm Beach County",,M,27,Right calf bitten,N,Afternoon,,02
2072,SOUTH AFRICA,2/19/2000,2000.0,Unprovoked,Western Cape Province,Struis Bay,Body surfing,M,0,Foot bitten,N,14h00,"White shark, 2.5 m",02
2074,NEW ZEALAND,2/3/2000,2000.0,Unprovoked,South Island,Oreti Beach (reported as the 4th person bitten...,Surfing,M,12,"No injury, wetsuit punctured",N,,,02
2075,AUSTRALIA,2/1/2000,2000.0,Unprovoked,South Australia,"Point Sinclair, Cactus Beach near Penong",Surfing,M,26,Hand bitten,N,,3 m [10'] shark,02


In [22]:
# Fill Nan with string 'other'
unprovoked_attacks_2000[['Area', 'Country', 'Location', 'Activity', 'Injury']] = unprovoked_attacks_2000[['Area', 'Country', 'Location', 'Activity', 'Injury']].fillna('other')


In [24]:
# Fill Nan with string 'U'
# unprovoked_attacks_2000[['Sex']] = unprovoked_attacks_2000[['Sex']].fillna('U', inplace=True)


In [None]:
unprovoked_attacks_2000.head()

In [None]:
# count fatal attacks
unprovoked_attacks_2000[unprovoked_attacks_2000['Fatal_Y_N']=='Y'].count()

In [None]:
# Export df as csv
unprovoked_attacks_2000 = unprovoked_attacks_2000.to_csv('Resources/unprovoked_attacks_2000.csv')