In [1]:
# Import the packages you're going to need. If you have activated the correct virtual environment when you launched jupyter
# notebook, you should not have to install these in the browser.

# Package with functionality for plotting data and graphs.
import matplotlib.pyplot as plt
# Package with optimized array structures and linear algebra operations for fast matrix and vector operations.
import numpy as np
# Package to handle datasets as "dataframes" efficiently. It automates a lot of the operations of reading in data
# from excel files, cleaning up messy or missing data, and finding things like cross-correlation of data, which is
# important in determining how dependent your variables are on each other.
import pandas as pd

In [2]:
# Read in the dataset. The engine is specified explicitly because, depending on the installed pandas/xlrd versions, the
# default Excel reader may not be able to open .xlsx files; openpyxl is the reader that supports the .xlsx format.
attacks = pd.read_excel('../datasets/GSAF5.xlsx', engine='openpyxl')

In [3]:
# The .head() command is a good way to see the form of your dataset. Note that attacks.head() returns a new dataframe
# holding the first rows, which jupyter displays automatically when it is the last expression in a cell. In a plain
# script (e.g. run from PyCharm), you'd need to write print(attacks.head()).
attacks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Unnamed: 246,Unnamed: 247,Unnamed: 248,Unnamed: 249,Unnamed: 250,Unnamed: 251,Unnamed: 252,Unnamed: 253,Unnamed: 254,Unnamed: 255
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,...,,,,,,,,,,
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,...,,,,,,,,,,
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,...,,,,,,,,,,
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,...,,,,,,,,,,
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,...,,,,,,,,,,


In [4]:
# Check the shape of your data; it is reported as (rows, columns).
# As a note, you don't want to have more parameters (columns) than you do examples (rows). This
# is more of a consideration if, for example, you break up the Country column into many columns for "AUS", "US", "NEW ZEALAND",
# etc., which all have either a 1 or 0 depending on which country the attack occurred in. This is called one-hot encoding and
# is common when you need to feed your algorithm the values for a country as numbers (like US is number 3 and Australia is
# number 7), but don't want your data to be ordinal (this would suggest US is less than Australia).
attacks.shape

(65535, 256)

In [5]:
# List the column labels. Look closely at the output: some labels (e.g. 'Sex ') carry trailing whitespace.
attacks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ',
       ...
       'Unnamed: 246', 'Unnamed: 247', 'Unnamed: 248', 'Unnamed: 249',
       'Unnamed: 250', 'Unnamed: 251', 'Unnamed: 252', 'Unnamed: 253',
       'Unnamed: 254', 'Unnamed: 255'],
      dtype='object', length=256)

In [6]:
# Let's say we want to see how many of our columns have an "Unnamed" header, and are therefore useless. Run the following
# command. It will find the column array of attacks (given by attacks.columns, shown first), and then apply the
# str.contains method to that array. Since this method is applied to an array, you will get back an equal-sized array in
# which each value is a boolean representing whether that header contained "Unnamed". Note that case=False means we don't
# care about the case of the letters, and na=False means a header that is not a string (for which the match result would
# otherwise be missing/NaN) counts as "not unnamed", so the result is always a clean boolean mask we can index with.
unnamed_columns = attacks.columns.str.contains('unnamed', case=False, na=False)
print("Our columns list: ", attacks.columns)
print("Which of them contain 'Unnamed':", unnamed_columns)
print("Length of this array is still", len(unnamed_columns))
# Summing a boolean array counts its True entries, which answers the "how many" question directly.
print("Number of columns containing 'Unnamed':", int(unnamed_columns.sum()))

Our columns list:  Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ',
       ...
       'Unnamed: 246', 'Unnamed: 247', 'Unnamed: 248', 'Unnamed: 249',
       'Unnamed: 250', 'Unnamed: 251', 'Unnamed: 252', 'Unnamed: 253',
       'Unnamed: 254', 'Unnamed: 255'],
      dtype='object', length=256)
Which of them contain 'Unnamed': [False False False False False False False False False False False False
 False False False False False False False False False False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True 

In [7]:
# Obviously, we want to drop these columns, since they give us no valuable information. Do so with the following command:
# attacks.drop(attacks.columns[unnamed_columns], axis=1, inplace=True). There is a lot going on here. attacks.drop will
# drop the columns from the dataframe that are included in the list you give it. Which means we expect that
# attacks.columns[unnamed_columns] is a list of the labels we want to drop. Remember unnamed_columns is an array of booleans
# that says whether each column has "Unnamed" in it. Its length is the number of columns in attacks. So, indexing
# attacks.columns with this array of booleans returns the column labels that align with the "True" entries in
# unnamed_columns. Thus, the full command removes the unnamed columns. Note that the axis parameter indicates that the
# entries in the list will be removed along the column axis (axis=1 rather than 0). Inplace means the command will modify
# the attacks dataframe itself rather than creating a modified copy.
print("The array of unnamed columns is:", attacks.columns[unnamed_columns])

The array of unnamed columns is: Index(['Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30', 'Unnamed: 31',
       ...
       'Unnamed: 246', 'Unnamed: 247', 'Unnamed: 248', 'Unnamed: 249',
       'Unnamed: 250', 'Unnamed: 251', 'Unnamed: 252', 'Unnamed: 253',
       'Unnamed: 254', 'Unnamed: 255'],
      dtype='object', length=234)


In [8]:
# Recompute the mask from the dataframe's current columns before dropping. Reusing the mask built in the earlier cell
# makes this cell fail when it is run a second time: the old mask still has one entry per original column, so it no
# longer lines up with the already-trimmed attacks.columns. With a fresh mask, a re-run simply finds nothing to drop.
unnamed_columns = attacks.columns.str.contains('unnamed', case=False, na=False)
attacks.drop(attacks.columns[unnamed_columns], axis=1, inplace=True)

In [9]:
# Show the first rows of the resulting dataframe to confirm the "Unnamed" columns are gone.
attacks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,...,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com",2021.01.13-Wrathall,,,2021.01.13,2021.01.13,6607.0
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,...,N,17h40,,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com",2021.01.11-Australia.pdf,,,2021.01.11,2021.01.11,6606.0
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,...,N,09h00-10h00,,"K. McMurray, TrackingSharks.com",2021.01.09-StKitts-Nevis.pdf,,,2021.01.09,2021.01.09,6605.0
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,...,Y,17h10,,"C. Black, GSAF",2021.01.07-Marlow.pdf,,,2021.01.07,2021.01.07,6604.0
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,...,N,08h00,,"K. McMurray, TrackingSharks.com",2021.01.02-Hawaii.pdf,,,2021.01.02,2021.01.02,6603.0


In [10]:
# Now you'll see we've gotten rid of all the unnamed columns. There are still some columns that we probably won't need. We
# can clean those later. Let's check which values of our dataframe are still nan. isna() returns a dataframe of the same
# shape holding True wherever the original value is missing.
attacks.isna()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65530,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
65531,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
65532,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
65533,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [11]:
# Note that we have 65535 rows. It looks like we have a bunch of empty rows we want to get rid of. We'll do that with
# dropna(). axis=0 means rows are the things being dropped. "how" can be "any" to drop a row if any nan appears in it, or
# "all" to drop it only if every value in the row is nan.
attacks.dropna(axis=0, how="all", inplace=True)
attacks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,...,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com",2021.01.13-Wrathall,,,2021.01.13,2021.01.13,6607.0
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,...,N,17h40,,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com",2021.01.11-Australia.pdf,,,2021.01.11,2021.01.11,6606.0
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,...,N,09h00-10h00,,"K. McMurray, TrackingSharks.com",2021.01.09-StKitts-Nevis.pdf,,,2021.01.09,2021.01.09,6605.0
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,...,Y,17h10,,"C. Black, GSAF",2021.01.07-Marlow.pdf,,,2021.01.07,2021.01.07,6604.0
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,...,N,08h00,,"K. McMurray, TrackingSharks.com",2021.01.02-Hawaii.pdf,,,2021.01.02,2021.01.02,6603.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8778,0,,,,,,,,,,...,,,,,,,,,,
8779,0,,,,,,,,,,...,,,,,,,,,,
8780,0,,,,,,,,,,...,,,,,,,,,,
8781,0,,,,,,,,,,...,,,,,,,,,,


In [12]:
# We still have a lot of nan values. This is because some rows have only one or two non-nan values. Let's go on and get
# rid of any columns we probably won't want. Don't honestly know where these columns came from.
# errors="ignore" skips labels that are already gone, so re-running this cell does not raise a KeyError.
attacks.drop(["pdf", "href", "href formula", "Case Number.1", "Case Number.2", "original order"], axis=1, inplace=True,
             errors="ignore")
attacks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,54,Severe injury to leg,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,7,Injury to leg,N,17h40,,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,,Injury to leg,N,09h00-10h00,,"K. McMurray, TrackingSharks.com"
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,19,FATAL,Y,17h10,,"C. Black, GSAF"
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,68,Significant injury to right lower leg,N,08h00,,"K. McMurray, TrackingSharks.com"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8778,0,,,,,,,,,,,,,,,
8779,0,,,,,,,,,,,,,,,
8780,0,,,,,,,,,,,,,,,
8781,0,,,,,,,,,,,,,,,


In [16]:
# Now let's take a data column we care about that has a lot of nan values, like Species. We don't know this for many
# attacks so we'd like to have "Unknown" instead of nan. First, check the exact column names so we know which label to
# use when referring to that column.
attacks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source'],
      dtype='object')

In [18]:
# You may notice that the Species column has a trailing space ("Species "), so we'll fix that with the str.strip()
# method. This removes leading and trailing whitespace. We'll apply it within the pandas method "rename" by setting the
# columns arg equal to a lambda function. A lambda function is just a small anonymous function defined in place. Here it
# takes one column label x and returns x.strip(). So to sum up, the columns arg of attacks.rename() will take a function,
# which it will apply to each of the dataframe's column labels. It's kind of like saying "Rename the columns of this
# dataframe by this function." Note that this cleans every label, so "Sex " becomes "Sex" as well.
attacks.rename(columns=lambda x: x.strip(), inplace=True)
attacks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,54,Severe injury to leg,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,7,Injury to leg,N,17h40,,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,,Injury to leg,N,09h00-10h00,,"K. McMurray, TrackingSharks.com"
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,19,FATAL,Y,17h10,,"C. Black, GSAF"
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,68,Significant injury to right lower leg,N,08h00,,"K. McMurray, TrackingSharks.com"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8778,0,,,,,,,,,,,,,,,
8779,0,,,,,,,,,,,,,,,
8780,0,,,,,,,,,,,,,,,
8781,0,,,,,,,,,,,,,,,


In [19]:
# Fill the gaps in Species with the placeholder "Unknown". The mapping passed to fillna names the column to fill and
# the value to use for it, so every other column keeps its nan values.
attacks.fillna({'Species': 'Unknown'}, inplace=True)
attacks

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,54,Severe injury to leg,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,7,Injury to leg,N,17h40,Unknown,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,,Injury to leg,N,09h00-10h00,Unknown,"K. McMurray, TrackingSharks.com"
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,19,FATAL,Y,17h10,Unknown,"C. Black, GSAF"
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,68,Significant injury to right lower leg,N,08h00,Unknown,"K. McMurray, TrackingSharks.com"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8778,0,,,,,,,,,,,,,,Unknown,
8779,0,,,,,,,,,,,,,,Unknown,
8780,0,,,,,,,,,,,,,,Unknown,
8781,0,,,,,,,,,,,,,,Unknown,


In [22]:
# Let's print out the first 10 rows to check. Slicing uses start:stop, where stop is exclusive. For a 2-D numpy array:
# array[:, :] gives you the array with all rows and columns included.
# array[:5, 5:] gives you rows 0-4 and the columns from 5 through the last one.
# array[:10] gives you rows 0-9 with all columns included.
# A dataframe accepts the plain row slice used below; for the two-axis [rows, columns] forms use attacks.iloc[...].
# You'll see the "Species" column now has "Unknown" instead of nan.
attacks[:10]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source
0,2021.01.13,13-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Western Australia,Blackwall Reach,Swimming,Cameron Wrathall,M,54.0,Severe injury to leg,N,07h58,"Bull shark, 6.5 to 10 ft","B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
1,2021.01.11,11-Jan-2021,2021.0,Unprovoked,AUSTRALIA,Victoria,13th Beach,Swimming,male,M,7.0,Injury to leg,N,17h40,Unknown,"B. Myatt, GSAF & K. McMurray, TrackingSharks.com"
2,2021.01.09,09-Jan-2021,2021.0,Unprovoked,ST KITTS / NEVIS,The Narrows,Booby Island,Swimming / Kayaking,Brook …,F,,Injury to leg,N,09h00-10h00,Unknown,"K. McMurray, TrackingSharks.com"
3,2021.01.07,07-Jan-2021,2021.0,Unprovoked,NEW ZEALAND,North Island,"Waihī Beach, Bay of Plenty",Swimming,Kaelah Marlow,F,19.0,FATAL,Y,17h10,Unknown,"C. Black, GSAF"
4,2021.01.02,02-Jan-2021,2021.0,Unprovoked,USA,Hawaii,"Anaehoomalu Bay\nWaikoloa, Hawaii County",Swimming,female,F,68.0,Significant injury to right lower leg,N,08h00,Unknown,"K. McMurray, TrackingSharks.com"
5,2020.12.30,30-Dec-2020,2020.0,Unprovoked,USA,California,"Coronado, San Diego County",Swimming,Phil Garn,M,,No injury. Swim fin bittten,N,Afternoon,Juvenile white shark,"K. McMurray, TrackingSharks.com"
6,2020.12.23,23-Dec-2020,2020.0,Unprovoked,MALDIVES,Raa Atoll,Faadoo Island,Fell off fishing boat,male,M,40.0,Injuries to leg,N,09h36,Unknown,Raajee.mv
7,2020.12.10,10-Dec-2020,2020.0,Unprovoked,ST MARTIN,,Orient Beach,Swimming,female,F,39.0,FATAL,Y,14h00,Tiger shark,"Kevin McMurray, TrackingSharks.com & D. Baldwin"
8,2020.12.09,09-Dec-2020,2020.0,Unprovoked,AUSTRALIA,Western Australia,Cable Beach,Surfing,Sam Heseltine,M,,"No injury, Board bitten",N,07h00,Unknown,"M. Michelson, GSAF & K. McMurray, Tracking Sha..."
9,2020.12.08,08-Dec-2020,2020.0,Unprovoked,USA,Hawaii,Honolua Bay,Surfing,Robin Warren,M,56.0,FATAL,Y,07h45,"Tiger shark. 14'3""","M. Michelson, GSAF & K. McMurray, Tracking Sha..."
