In [1]:
import pandas as pd
import requests

Set up a path variable to access the data

In [2]:
shark_url = "http://api.fish.wa.gov.au/webapi/v1/RawData"
csv_path = "data/sharks/shark_sightings_raw.csv"

Download the data from the API

In [3]:
response = requests.get(shark_url).json()


Load the data into a Pandas DF

In [4]:
shark_df = pd.DataFrame(response)

Save the data into a CSV file

In [5]:
shark_df.to_csv("data/sharks/shark_sightings_raw.csv")

In [6]:
sharks_df = pd.read_csv(csv_path)
sharks_df

Unnamed: 0.1,Unnamed: 0,RawDataId,ObjectId,Distance,DistanceUnit,InteractionValue,InteractionId,TownProximity,LocationValue,SightingNumbervalue,ReportDateTime,SightingDateTime,SightingSizeValue,SightingSpeciesValue,OwnerValue,LocationDetail,LocationX,LocationY
0,0,63755,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:13:00,,white,Fisheries Advise,,117.94592,-35.08678
1,1,63754,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:03:00,,white,Fisheries Advise,,117.94592,-35.08678
2,2,63753,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:40:00,,white,Fisheries Advise,,117.94592,-35.08678
3,3,63752,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:31:00,,white,Fisheries Advise,,117.94592,-35.08678
4,4,63751,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:11:00,,white,Fisheries Advise,,117.94592,-35.08678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17325,17325,54674,WA00004TSE,10.0,m,Sighted,1,Dunsborough,"Bunker Bay, near Dunsborough",1,,2014-10-14T03:40:00,2m,white,Public report,,115.04110,-33.54306
17326,17326,54675,WA00003TSE,,,Sighted,1,Yallingup,"Yallingup Beach, Yallingup",1,,2014-10-14T02:08:00,,unknown sp.,Public report,,115.02817,-33.63339
17327,17327,54676,WA00002TSE,,,Sighted,1,Dunsborough,"Dunsborough Boat Ramp, Dunsborough",1,2014-10-13T04:00:00,2014-10-13T03:55:00,4.5m,white,Public report,Old Dunsborough,115.10530,-33.60111
17328,17328,46307,1035,,,Detected,4,,Garden Island (north end),,,2014-10-13T02:28:00,,tiger,Fisheries Advise,,115.64283,-32.14411


# Data Cleaning

## Species

Here the species present in the data are reviewed. The are several cases of duplicate values that need to be combined, and species values that are not sharks need to be removed.

In [7]:
sharks_df['SightingSpeciesValue'].value_counts()

bronze whaler                 5590
unknown sp.                   4422
tiger                         3879
white                         2554
hammerhead                     317
whale carcass                  191
whaler                         119
bull                            91
mako                            49
sonar object                    45
other                           30
dusky                           11
grey nurse                      10
detection event - possible       4
blacktip                         4
whale carcass                    4
blue                             4
Whale Carcass                    1
white                            1
shortfin mako                    1
Name: SightingSpeciesValue, dtype: int64

Replace duplicate entry values with a single common value.

In [8]:
clean_sharks = sharks_df.copy()
clean_sharks['SightingSpeciesValue'] = clean_sharks['SightingSpeciesValue'].replace({
    'Whale Carcass':'whale carcass',
    'whale carcass ':'whale carcass',
    'whaler':'bronze whaler',
    'shortfin mako':'mako', 
    'white ': 'white'})

Drop rows where the species value is not a shark.

In [9]:
# create a list of species values to filter out
drop_species = ['whale carcass',
                'detection event - possible',
                'other',
                'sonar object'] 

# use the list to create a filter
species_filter = ~clean_sharks.SightingSpeciesValue.isin(drop_species) 

# apply the filter to the dataset
clean_sharks = clean_sharks[species_filter] 

# display the cleaned species values
clean_sharks['SightingSpeciesValue'].value_counts()

bronze whaler    5709
unknown sp.      4422
tiger            3879
white            2555
hammerhead        317
bull               91
mako               50
dusky              11
grey nurse         10
blue                4
blacktip            4
Name: SightingSpeciesValue, dtype: int64

## Report Owner

There are several cases of duplicate entrys that need to be consolidated

In [10]:
clean_sharks['OwnerValue'].value_counts()

Fisheries Advise           10179
Public report               3068
SLS Westpac Heli report     2421
SLS Lifesavers report        629
Fisheries advise             301
SMART drumline trial         300
UPDATE to prior report       105
DBCA officers report          33
DPAW report                   10
Public Report                  3
Fisheries advise               2
SLS Lifesavers report          1
Clever Buoy trial              1
SLS Westpac Heli               1
Name: OwnerValue, dtype: int64

In [11]:
clean_sharks['OwnerValue'] = clean_sharks['OwnerValue'].replace({
    'Fisheries advise': 'Fisheries Advise',
    'SLS Lifesavers report ': 'SLS Lifesavers report',
    'SLS Westpac Heli': 'SLS Westpac Heli report',
    'Public Report': 'Public report'
})

# display the cleaned owner values
clean_sharks['OwnerValue'].value_counts()

Fisheries Advise           10480
Public report               3071
SLS Westpac Heli report     2422
SLS Lifesavers report        630
SMART drumline trial         300
UPDATE to prior report       105
DBCA officers report          33
DPAW report                   10
Fisheries advise               2
Clever Buoy trial              1
Name: OwnerValue, dtype: int64

## Interaction Type

text here

In [12]:
clean_sharks['InteractionId'].value_counts()

4    10178
1     6437
3      283
2      121
6       34
5        2
Name: InteractionId, dtype: int64

In [13]:
clean_sharks

Unnamed: 0.1,Unnamed: 0,RawDataId,ObjectId,Distance,DistanceUnit,InteractionValue,InteractionId,TownProximity,LocationValue,SightingNumbervalue,ReportDateTime,SightingDateTime,SightingSizeValue,SightingSpeciesValue,OwnerValue,LocationDetail,LocationX,LocationY
0,0,63755,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:13:00,,white,Fisheries Advise,,117.94592,-35.08678
1,1,63754,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:03:00,,white,Fisheries Advise,,117.94592,-35.08678
2,2,63753,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:40:00,,white,Fisheries Advise,,117.94592,-35.08678
3,3,63752,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:31:00,,white,Fisheries Advise,,117.94592,-35.08678
4,4,63751,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:11:00,,white,Fisheries Advise,,117.94592,-35.08678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17325,17325,54674,WA00004TSE,10.0,m,Sighted,1,Dunsborough,"Bunker Bay, near Dunsborough",1,,2014-10-14T03:40:00,2m,white,Public report,,115.04110,-33.54306
17326,17326,54675,WA00003TSE,,,Sighted,1,Yallingup,"Yallingup Beach, Yallingup",1,,2014-10-14T02:08:00,,unknown sp.,Public report,,115.02817,-33.63339
17327,17327,54676,WA00002TSE,,,Sighted,1,Dunsborough,"Dunsborough Boat Ramp, Dunsborough",1,2014-10-13T04:00:00,2014-10-13T03:55:00,4.5m,white,Public report,Old Dunsborough,115.10530,-33.60111
17328,17328,46307,1035,,,Detected,4,,Garden Island (north end),,,2014-10-13T02:28:00,,tiger,Fisheries Advise,,115.64283,-32.14411


## Filtering

Select sightings between a specified date range

In [14]:
datefiltered_df = clean_sharks.copy().loc[
    (clean_sharks["SightingDateTime"] > '2021-01-01') & (clean_sharks["SightingDateTime"] < '2021-07-01')]

datefiltered_df.head()

Unnamed: 0.1,Unnamed: 0,RawDataId,ObjectId,Distance,DistanceUnit,InteractionValue,InteractionId,TownProximity,LocationValue,SightingNumbervalue,ReportDateTime,SightingDateTime,SightingSizeValue,SightingSpeciesValue,OwnerValue,LocationDetail,LocationX,LocationY
52,52,63703,WA53997TSE,30.0,m offshore,sighted,1,Mandurah,"MH055 Fourth Groyne Surf Break BEN Sign, north...",,,2021-06-30T16:10:47,3.0m,unknown sp.,Public report,,115.736,-32.505
53,53,63702,WA94783TSE,50.0,m offshore,sighted,1,Augusta,"Flinders Bay, Augusta",,,2021-06-30T08:57:04,3.0m,white,Public report,Flinders Bay Beach,115.186,-34.3263
55,55,63699,WA17863TSE,10.0,m offshore,sighted,1,Dunsborough,"Rocky Point Surfing Spot, north of Dunsborough",,2021-06-28T12:00:30,2021-06-28T10:30:54,large,unknown sp.,Public report,,115.059,-33.5457
56,56,63700,WA36733TSE,50.0,m offshore,sighted,1,Bunbury,"Back Beach, Bunbury",,2021-06-28T14:00:05,2021-06-27T14:00:20,,unknown sp.,Public report,,115.629,-33.3272
57,57,63698,WA99425TSE,50.0,m offshore,sighted,1,Denmark,"Parry Beach, west of Denmark",,2021-06-26T11:30:14,2021-06-26T10:00:10,large,white,Public report,,117.16,-35.0381


In [15]:
# save data at this stage for group use

export_data = datefiltered_df[['InteractionValue','InteractionId','SightingSpeciesValue','SightingDateTime','OwnerValue','LocationX','LocationY']]
export_data.to_csv('data/sharks/sharks_cleaned.csv')
export_data

Unnamed: 0,InteractionValue,InteractionId,SightingSpeciesValue,SightingDateTime,OwnerValue,LocationX,LocationY
52,sighted,1,unknown sp.,2021-06-30T16:10:47,Public report,115.736000,-32.50500
53,sighted,1,white,2021-06-30T08:57:04,Public report,115.186000,-34.32630
55,sighted,1,unknown sp.,2021-06-28T10:30:54,Public report,115.059000,-33.54570
56,sighted,1,unknown sp.,2021-06-27T14:00:20,Public report,115.629000,-33.32720
57,sighted,1,white,2021-06-26T10:00:10,Public report,117.160000,-35.03810
...,...,...,...,...,...,...,...
1078,sighted,1,unknown sp.,2021-01-01T10:18:02,Public report,122.146000,-34.01190
1079,sighted,1,unknown sp.,2021-01-01T10:16:38,Public report,115.346000,-33.64410
1080,Detected,4,white,2021-01-01T00:49:00,Fisheries Advise,115.093091,-33.57247
1081,Detected,4,white,2021-01-01T00:43:00,Fisheries Advise,115.093091,-33.57247


Round GPS coordinates for grouping 

In [16]:
datefiltered_df['LocationX'] = round(sharks_df['LocationX'],2)
datefiltered_df['LocationY'] = round(sharks_df['LocationY'],2)
sharks_df

Unnamed: 0.1,Unnamed: 0,RawDataId,ObjectId,Distance,DistanceUnit,InteractionValue,InteractionId,TownProximity,LocationValue,SightingNumbervalue,ReportDateTime,SightingDateTime,SightingSizeValue,SightingSpeciesValue,OwnerValue,LocationDetail,LocationX,LocationY
0,0,63755,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:13:00,,white,Fisheries Advise,,117.94592,-35.08678
1,1,63754,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-31T00:03:00,,white,Fisheries Advise,,117.94592,-35.08678
2,2,63753,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:40:00,,white,Fisheries Advise,,117.94592,-35.08678
3,3,63752,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:31:00,,white,Fisheries Advise,,117.94592,-35.08678
4,4,63751,2980,,,Detected,4,,"Frenchman Bay, Albany",,,2021-07-30T23:11:00,,white,Fisheries Advise,,117.94592,-35.08678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17325,17325,54674,WA00004TSE,10.0,m,Sighted,1,Dunsborough,"Bunker Bay, near Dunsborough",1,,2014-10-14T03:40:00,2m,white,Public report,,115.04110,-33.54306
17326,17326,54675,WA00003TSE,,,Sighted,1,Yallingup,"Yallingup Beach, Yallingup",1,,2014-10-14T02:08:00,,unknown sp.,Public report,,115.02817,-33.63339
17327,17327,54676,WA00002TSE,,,Sighted,1,Dunsborough,"Dunsborough Boat Ramp, Dunsborough",1,2014-10-13T04:00:00,2014-10-13T03:55:00,4.5m,white,Public report,Old Dunsborough,115.10530,-33.60111
17328,17328,46307,1035,,,Detected,4,,Garden Island (north end),,,2014-10-13T02:28:00,,tiger,Fisheries Advise,,115.64283,-32.14411


In [17]:
sharks_grouped = sharks_df.copy().groupby(['LocationX','LocationY','SightingSpeciesValue']).agg({
    'SightingSpeciesValue':'first',
    'LocationX':"first",
    'LocationY':'first',
    'SightingDateTime':'count'})

In [18]:
sharks_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SightingSpeciesValue,LocationX,LocationY,SightingDateTime
LocationX,LocationY,SightingSpeciesValue,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
113.05700,-25.84240,whale carcass,whale carcass,113.05700,-25.84240,1
113.07810,-25.26041,white,white,113.07810,-25.26041,1
113.14952,-24.75295,unknown sp.,unknown sp.,113.14952,-24.75295,1
113.15400,-26.13550,white,white,113.15400,-26.13550,1
113.53400,-25.92860,tiger,tiger,113.53400,-25.92860,2
...,...,...,...,...,...,...
123.56369,-34.36934,white,white,123.56369,-34.36934,2
123.56400,-34.36930,white,white,123.56400,-34.36930,15
124.86570,-14.43664,unknown sp.,unknown sp.,124.86570,-14.43664,1
126.04600,-32.27240,bronze whaler,bronze whaler,126.04600,-32.27240,1


In [19]:
sharks_spotted = sharks_df.copy().groupby(['LocationX','LocationY']).agg({
    'LocationX':"first",
    'LocationY':'first',
    'SightingDateTime':'count'}).rename(columns=
{"SightingDateTime":"Sightings"})

In [20]:
sharks_spotted

Unnamed: 0_level_0,Unnamed: 1_level_0,LocationX,LocationY,Sightings
LocationX,LocationY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
113.05700,-25.84240,113.05700,-25.84240,1
113.07810,-25.26041,113.07810,-25.26041,1
113.14952,-24.75295,113.14952,-24.75295,1
113.15400,-26.13550,113.15400,-26.13550,1
113.53400,-25.92860,113.53400,-25.92860,2
...,...,...,...,...
123.55200,-34.35980,123.55200,-34.35980,5
123.56369,-34.36934,123.56369,-34.36934,2
123.56400,-34.36930,123.56400,-34.36930,15
124.86570,-14.43664,124.86570,-14.43664,1


In [21]:
shark_test = sharks_spotted.reset_index(drop=True)
shark_test

Unnamed: 0,LocationX,LocationY,Sightings
0,113.05700,-25.84240,1
1,113.07810,-25.26041,1
2,113.14952,-24.75295,1
3,113.15400,-26.13550,1
4,113.53400,-25.92860,2
...,...,...,...
1943,123.55200,-34.35980,5
1944,123.56369,-34.36934,2
1945,123.56400,-34.36930,15
1946,124.86570,-14.43664,1


In [22]:
shark_test.to_csv("data/sharks/shark_locations.csv")