# Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from a Excel file into a Pandas DataFrame
df = pd.read_excel('GSAF5.xls')

In [3]:
# Check information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25792 entries, 0 to 25791
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8771 non-null   object 
 1   Date                    6558 non-null   object 
 2   Year                    6556 non-null   float64
 3   Type                    6552 non-null   object 
 4   Country                 6508 non-null   object 
 5   Area                    6091 non-null   object 
 6   Location                6008 non-null   object 
 7   Activity                6002 non-null   object 
 8   Name                    6343 non-null   object 
 9   Sex                     5987 non-null   object 
 10  Age                     3660 non-null   object 
 11  Injury                  6528 non-null   object 
 12  Fatal (Y/N)             6006 non-null   object 
 13  Time                    3139 non-null   object 
 14  Species                 3610 non-null 

In [4]:
# # Remove unnecessary spaces in the columns names
df.columns = df.columns.str.strip()
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:
# Remove unwanted columns
# The columns 'Case Number', 'Name', Injury' and 'Investigator or Source' are not relevant information
# The columns 'Unnamed: 22' and 'Unnamed: 22' have no relevant information for the analysis.
# The columns 'pdf', 'href formula', 'href' do not contain relevant information for the analysis. They just list the 
# references.
# The columns 'Case Number.1', 'Case Number.2' and 'original order' are similar to the 'Case Number', so they also can be
# removed
df = df.drop(columns=['Case Number', 'Name', 'Injury', 'Investigator or Source', 'pdf', 'href formula', 'href', 
                      'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'])

In [6]:
# Remove all rows that have NaN in all columns
# There is total of 25792 lines, but all columns do not have even 9000 rows with non-NaN.. Therefore, there are lots of
# rows that all columns are only missing values
df = df.dropna(how='all')

In [7]:
# Remove all rows that have less than 3 non-NaN
# If the row contains less than 3 non-NaN, there is basically no information about a shark accident
df = df.dropna(thresh=3)

In [8]:
# Check information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6558 entries, 0 to 6557
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         6558 non-null   object 
 1   Year         6556 non-null   float64
 2   Type         6552 non-null   object 
 3   Country      6508 non-null   object 
 4   Area         6091 non-null   object 
 5   Location     6008 non-null   object 
 6   Activity     6002 non-null   object 
 7   Sex          5987 non-null   object 
 8   Age          3660 non-null   object 
 9   Fatal (Y/N)  6006 non-null   object 
 10  Time         3139 non-null   object 
 11  Species      3610 non-null   object 
dtypes: float64(1), object(11)
memory usage: 666.0+ KB


In [9]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m"
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'"
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'"
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,
...,...,...,...,...,...,...,...,...,...,...,...,...
6553,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,Y,,
6554,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,Y,,
6555,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,Y,,
6556,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,Y,,


# Age

In [10]:
df.Age.unique()

array([50, 35, 55, nan, 22, 14, 28, 38, 4, 63, 23, 11, 12, 10, 29, 15, 36,
       7, 16, 30, 60, 18, 9, 26, 57, 'Teen', 24, 59, 13, 75, 21, '30s',
       45, 33, 17, 37, 70, 44, '28 & 22', 32, 20, 51, '22, 57, 31', '60s',
       40, 49, "20's", 43, 8, 64, 19, 65, 67, 53, 34, 25, 58, 74, 46, 41,
       31, '9 & 60', 48, '20s', 42, 39, 56, 61, 'a minor', 6, 62, 52, 54,
       69, '40s', 3, 82, 73, 68, 47, 66, 72, 27, 71, '38', '39', '23',
       '32', '52', '68', '12', '18', '19', '43', '47', '6', '37', '9',
       '36', '10', '16', '13', '11', '17', '14', '30', '50', '29', '65',
       '63', '26', '71', '48', '70', '58', '18 months', '22', '41', '35',
       '57', '20', '24', '34', '15', '44', '53', '7', '40', '28', '33',
       '31', '45', '50s', '8', '51', '61', '42', '25', 'teen', '66', '21',
       '77', '46', '60', '74', '55', '27', '3', '56', '64', '28 & 26',
       '62', '5', '49', '54', '86', '59', '18 or 20', '12 or 13',
       '46 & 34', '28, 23 & 30', 'Teens', 77, '36 & 26', 

In [11]:
# Function to convert string to integer
def convert_str_to_int(age):
    try:
        age = int(age)
    except:
        age = -1
    return age

In [12]:
# Number 
df_total = df.Date.count()
print(df_total)

# Convert the strings in column 'Age' to integers
df['age_int'] = df.Age.apply(convert_str_to_int)

# How many were not digits
print(df.loc[df.age_int == -1, :].age_int.value_counts() / 6558)
print(df.loc[df.age_int == -1, :].age_int.value_counts() / df_total)

# Check the result
df

6558
-1    0.459134
Name: age_int, dtype: float64
-1    0.459134
Name: age_int, dtype: float64


Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,age_int
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,,50
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m",35
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'",55
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'",-1
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,Y,,,-1
6554,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,Y,,,-1
6555,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,Y,,,-1
6556,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,Y,,,-1


In [13]:
# Classification by age
df['age_cat'] = np.where(df['age_int'] > 65, 'Elder',
                        np.where(df['age_int'] > 35, 'Adult',
                                np.where(df['age_int'] > 17, 'Young Adult',
                                        np.where(df['age_int'] > 12, 'Teenager',
                                                 np.where(df['age_int'] == -1, '-', 'Child')))))

In [14]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,age_int,age_cat
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,,50,Adult
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m",35,Young Adult
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'",55,Adult
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'",-1,-
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,,22,Young Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,Y,,,-1,-
6554,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,Y,,,-1,-
6555,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,Y,,,-1,-
6556,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,Y,,,-1,-


# Year

In [15]:
df.Year.unique()

array([2020., 2019., 2018., 2017.,   nan, 2016., 2015., 2014., 2013.,
       2012., 2011., 2010., 2009., 2008., 2007., 2006., 2005., 2004.,
       2003., 2002., 2001., 2000., 1999., 1998., 1997., 1996., 1995.,
       1984., 1994., 1993., 1992., 1991., 1990., 1989., 1969., 1988.,
       1987., 1986., 1985., 1983., 1982., 1981., 1980., 1979., 1978.,
       1977., 1976., 1975., 1974., 1973., 1972., 1971., 1970., 1968.,
       1967., 1966., 1965., 1964., 1963., 1962., 1961., 1960., 1959.,
       1958., 1957., 1956., 1955., 1954., 1953., 1952., 1951., 1950.,
       1949., 1948., 1848., 1947., 1946., 1945., 1944., 1943., 1942.,
       1941., 1940., 1939., 1938., 1937., 1936., 1935., 1934., 1933.,
       1932., 1931., 1930., 1929., 1928., 1927., 1926., 1925., 1924.,
       1923., 1922., 1921., 1920., 1919., 1918., 1917., 1916., 1915.,
       1914., 1913., 1912., 1911., 1910., 1909., 1908., 1907., 1906.,
       1905., 1904., 1903., 1902., 1901., 1900., 1899., 1898., 1897.,
       1896., 1895.,

In [16]:
# Convert the strings in column 'Year' to integers
df['year_int'] = df.Year.apply(convert_str_to_int)

# Check the result
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,age_int,age_cat,year_int
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,,50,Adult,2020
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m",35,Young Adult,2020
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'",55,Adult,2020
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'",-1,-,2020
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,,22,Young Adult,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,Y,,,-1,-,0
6554,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,Y,,,-1,-,0
6555,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,Y,,,-1,-,0
6556,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,Y,,,-1,-,0


In [17]:
# Check % incidents occured before 1801
print(f'The incidents before 1801 represents only {(df[df.Year < 1801].Year.count() / df.Year.count()) * 100:.2f}%',
      f'of the dataset. So, only the years from 1800 will be analysed.', sep=' ')

The incidents before 1801 represents only 2.61% of the dataset. So, only the years from 1800 will be analysed.


In [18]:
# Selecting only the years from 1800
df = df.loc[df['Year'] >= 1801, :]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6385 entries, 0 to 6386
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         6385 non-null   object 
 1   Year         6385 non-null   float64
 2   Type         6379 non-null   object 
 3   Country      6345 non-null   object 
 4   Area         5963 non-null   object 
 5   Location     5886 non-null   object 
 6   Activity     5860 non-null   object 
 7   Sex          5828 non-null   object 
 8   Age          3643 non-null   object 
 9   Fatal (Y/N)  5840 non-null   object 
 10  Time         3129 non-null   object 
 11  Species      3564 non-null   object 
 12  age_int      6385 non-null   int64  
 13  age_cat      6385 non-null   object 
 14  year_int     6385 non-null   int64  
dtypes: float64(1), int64(2), object(12)
memory usage: 798.1+ KB


In [19]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,age_int,age_cat,year_int
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,,50,Adult,2020
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m",35,Young Adult,2020
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'",55,Adult,2020
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'",-1,-,2020
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,,22,Young Adult,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6382,Reported 26-Feb-1804,1804.0,Watercraft,AUSTRALIA,New South Wales,"Georges Head, off Port Jackson",,,,N,,,-1,-,1804
6383,May-17-1803,1803.0,Sea Disaster,USA,South Carolina,Off Charleston,,M,,N,,,-1,-,1803
6384,Mar-1803,1803.0,Unprovoked,AUSTRALIA,Western Australia,"Hamelin Harbour, at Faure Island",,M,,N,,,-1,-,1803
6385,Reported Apr-13-1802,1802.0,Unprovoked,INDIA,,,,,,Y,,,-1,-,1802


In [20]:
# Classification by year
df['century'] = np.where(df['year_int'] >= 2001, 21,
                            np.where(df['year_int'] >= 1901, 20, 19))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['century'] = np.where(df['year_int'] >= 2001, 21,


In [21]:
df

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Fatal (Y/N),Time,Species,age_int,age_cat,year_int,century
0,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,N,11h00,,50,Adult,2020,21
1,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,N,09h30,"White shark, 2-to 3m",35,Young Adult,2020,21
2,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,N,16h00,"Blacktip shark, 6'",55,Adult,2020,21
3,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,N,14h00,"Nurse shark, 5'",-1,-,2020,21
4,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,N,17h00,,22,Young Adult,2020,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6382,Reported 26-Feb-1804,1804.0,Watercraft,AUSTRALIA,New South Wales,"Georges Head, off Port Jackson",,,,N,,,-1,-,1804,19
6383,May-17-1803,1803.0,Sea Disaster,USA,South Carolina,Off Charleston,,M,,N,,,-1,-,1803,19
6384,Mar-1803,1803.0,Unprovoked,AUSTRALIA,Western Australia,"Hamelin Harbour, at Faure Island",,M,,N,,,-1,-,1803,19
6385,Reported Apr-13-1802,1802.0,Unprovoked,INDIA,,,,,,Y,,,-1,-,1802,19


# Gender

In [22]:
# Check the column 'Sex'
df.Sex.unique()

array(['F', 'M', nan, 'M ', 'lli', 'M x 2', 'N', '.'], dtype=object)

In [23]:
df.Sex.count()

5828

In [24]:
# % of each value
df.Sex.value_counts() / df.Sex.count()

M        0.882464
F        0.116335
M        0.000343
N        0.000343
lli      0.000172
M x 2    0.000172
.        0.000172
Name: Sex, dtype: float64

In [None]:
# % of each NaN
df.Sex.isna().sum()

In [25]:
# Check number of missing values
print(f'The number of NaN in the column "Sex" is {df.Sex.isna().sum()}, which represents '
      f'{(df.Sex.isna().sum() / df.Sex.count())*100:.2f}% of the dataset.', sep=' ')

The number of NaN in the column "Sex" is 557, which represents 9.56% of the dataset.


Since the NaN represents only a samll part of the dataset, the rows containing NaN in the column 'Sex' will be removed.

In [26]:
# Remove rows that the column 'Sex' is a missing value
df[df.Sex.isna()]
df = df.drop(df[df.Sex.isna()].index)

In [27]:
# Check infromation about the changed dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5828 entries, 0 to 6386
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         5828 non-null   object 
 1   Year         5828 non-null   float64
 2   Type         5822 non-null   object 
 3   Country      5790 non-null   object 
 4   Area         5463 non-null   object 
 5   Location     5392 non-null   object 
 6   Activity     5452 non-null   object 
 7   Sex          5828 non-null   object 
 8   Age          3604 non-null   object 
 9   Fatal (Y/N)  5363 non-null   object 
 10  Time         3023 non-null   object 
 11  Species      3245 non-null   object 
 12  age_int      5828 non-null   int64  
 13  age_cat      5828 non-null   object 
 14  year_int     5828 non-null   int64  
 15  century      5828 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 751.3+ KB


In [28]:
# Check values in the column 'Sex'
df.Sex.value_counts()

M        5143
F         678
M           2
N           2
lli         1
M x 2       1
.           1
Name: Sex, dtype: int64

In [29]:
# Remove unnecessary spaces in the values of the column 'Sex'
df['Sex'] = df['Sex'].str.strip()
df.Sex.value_counts()

M        5145
F         678
N           2
lli         1
M x 2       1
.           1
Name: Sex, dtype: int64

In [30]:
# Remove rows that the gender is 'N', 'lli', '.' or 'M x 2'
df = df.drop(df[(df.Sex == '.') | (df.Sex == 'N') | (df.Sex == 'lli') | ((df.Sex == 'M x 2'))].index)
df.Sex.value_counts()

M    5145
F     678
Name: Sex, dtype: int64

In [31]:
# Check infromation about the changed dataset
# 5 columns were removed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5823 entries, 0 to 6384
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         5823 non-null   object 
 1   Year         5823 non-null   float64
 2   Type         5817 non-null   object 
 3   Country      5786 non-null   object 
 4   Area         5459 non-null   object 
 5   Location     5389 non-null   object 
 6   Activity     5447 non-null   object 
 7   Sex          5823 non-null   object 
 8   Age          3603 non-null   object 
 9   Fatal (Y/N)  5358 non-null   object 
 10  Time         3022 non-null   object 
 11  Species      3241 non-null   object 
 12  age_int      5823 non-null   int64  
 13  age_cat      5823 non-null   object 
 14  year_int     5823 non-null   int64  
 15  century      5823 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 750.6+ KB


In [32]:
# Check column 'Sex' percentage
df_gener_total = df.Sex.value_counts().sum()
df.Sex.value_counts() / df_gener_total

M    0.883565
F    0.116435
Name: Sex, dtype: float64

In [33]:
print(f'The males represent {(df.Sex.value_counts() / df_gener_total)[0]*100:.2f}% of the people who were'
      f'involved in incidents with sharks, while women represent only {(df.Sex.value_counts() / df_gener_total)[1]*100:.2f}%.',sep=' ')

The males represent 88.36% of the people who wereinvolved in incidents with sharks, while women represent only 11.64%.


In [34]:
# Create a subset containing only females
df_male = df.loc[df['Sex'] == 'F']

In [35]:
# Create a subset containing only males
df_male = df.loc[df['Sex'] == 'M']
df_male.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5145 entries, 2 to 6384
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         5145 non-null   object 
 1   Year         5145 non-null   float64
 2   Type         5139 non-null   object 
 3   Country      5110 non-null   object 
 4   Area         4799 non-null   object 
 5   Location     4737 non-null   object 
 6   Activity     4821 non-null   object 
 7   Sex          5145 non-null   object 
 8   Age          3101 non-null   object 
 9   Fatal (Y/N)  4742 non-null   object 
 10  Time         2592 non-null   object 
 11  Species      2844 non-null   object 
 12  age_int      5145 non-null   int64  
 13  age_cat      5145 non-null   object 
 14  year_int     5145 non-null   int64  
 15  century      5145 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 663.2+ KB


In [36]:
# Check males age
df_male.age_cat.value_counts()

-              2126
Young Adult    1514
Adult           703
Teenager        537
Child           219
Elder            46
Name: age_cat, dtype: int64

In [37]:
df_male_total = df_male.Date.count()
df_male_total

5145

In [38]:
# Check males age
df_male.age_cat.value_counts() / df_male_total

-              0.413217
Young Adult    0.294266
Adult          0.136638
Teenager       0.104373
Child          0.042566
Elder          0.008941
Name: age_cat, dtype: float64

In [55]:
(df_male.age_cat.value_counts() / df_male_total)*100

-              41.321672
Young Adult    29.426628
Adult          13.663751
Teenager       10.437318
Child           4.256560
Elder           0.894072
Name: age_cat, dtype: float64

In [39]:
df_male_age = df_male.loc[df_male['age_cat'] != '-']
df_male_age_total = df_male_age.Date.count()
df_male_age.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3019 entries, 2 to 6369
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         3019 non-null   object 
 1   Year         3019 non-null   float64
 2   Type         3016 non-null   object 
 3   Country      3011 non-null   object 
 4   Area         2910 non-null   object 
 5   Location     2888 non-null   object 
 6   Activity     2907 non-null   object 
 7   Sex          3019 non-null   object 
 8   Age          3019 non-null   object 
 9   Fatal (Y/N)  2838 non-null   object 
 10  Time         2076 non-null   object 
 11  Species      1919 non-null   object 
 12  age_int      3019 non-null   int64  
 13  age_cat      3019 non-null   object 
 14  year_int     3019 non-null   int64  
 15  century      3019 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 389.2+ KB


In [40]:
df_male_age_total

3019

In [41]:
df_male_age.age_cat.value_counts()

Young Adult    1514
Adult           703
Teenager        537
Child           219
Elder            46
Name: age_cat, dtype: int64

In [42]:
df_male_age.age_cat.value_counts() / df_male_age_total

Young Adult    0.501491
Adult          0.232859
Teenager       0.177873
Child          0.072541
Elder          0.015237
Name: age_cat, dtype: float64

In [56]:
(df_male_age.age_cat.value_counts() / df_male_age_total)*100

Young Adult    50.149056
Adult          23.285856
Teenager       17.787347
Child           7.254058
Elder           1.523683
Name: age_cat, dtype: float64

50% of male who were in a shark incident were Young Adults.

In [43]:
df_male_ya = df_male_age[df_male_age['age_cat'] == 'Young Adult']
df_male_ya.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1514 entries, 6 to 6369
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1514 non-null   object 
 1   Year         1514 non-null   float64
 2   Type         1514 non-null   object 
 3   Country      1508 non-null   object 
 4   Area         1457 non-null   object 
 5   Location     1440 non-null   object 
 6   Activity     1462 non-null   object 
 7   Sex          1514 non-null   object 
 8   Age          1514 non-null   object 
 9   Fatal (Y/N)  1436 non-null   object 
 10  Time         1062 non-null   object 
 11  Species      993 non-null    object 
 12  age_int      1514 non-null   int64  
 13  age_cat      1514 non-null   object 
 14  year_int     1514 non-null   int64  
 15  century      1514 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 195.2+ KB


In [57]:
df_male_ya_total = df_male_ya.Date.count()
df_male_ya.groupby(by='Activity').age_cat.count().sort_values(ascending=False)

Activity
Surfing                                      417
Swimming                                     187
Spearfishing                                 153
Fishing                                       54
Diving                                        26
                                            ... 
Snorkeling – hunting crayfish and abalone      1
Spearfishing                                   1
Spearfishing & holding catch                   1
Spearfishing & lassoed shark                   1
 Diving for abalone (Hookah)                   1
Name: age_cat, Length: 361, dtype: int64

In [59]:
(df_male_ya.groupby(by='Activity').age_cat.count().sort_values(ascending=False) / df_male_ya_total)*100

Activity
Surfing                                      27.542933
Swimming                                     12.351387
Spearfishing                                 10.105680
Fishing                                       3.566711
Diving                                        1.717305
                                               ...    
Snorkeling – hunting crayfish and abalone     0.066050
Spearfishing                                  0.066050
Spearfishing & holding catch                  0.066050
Spearfishing & lassoed shark                  0.066050
 Diving for abalone (Hookah)                  0.066050
Name: age_cat, Length: 361, dtype: float64

**Young adult males surfing were the most involved in shark incidents.**

# Analysis though years

In [45]:
# Create subsets for each century
df_cen21 = df.loc[df['century'] == 21, :]
df_cen20 = df.loc[df['century'] == 20, :]
df_cen19 = df.loc[df['century'] == 19, :]

In [46]:
df_cen21_total = df_cen21.Date.count()
print(df_cen21_total)
df_cen21.info()

2124
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2224
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         2124 non-null   object 
 1   Year         2124 non-null   float64
 2   Type         2120 non-null   object 
 3   Country      2122 non-null   object 
 4   Area         2042 non-null   object 
 5   Location     2038 non-null   object 
 6   Activity     2043 non-null   object 
 7   Sex          2124 non-null   object 
 8   Age          1660 non-null   object 
 9   Fatal (Y/N)  1976 non-null   object 
 10  Time         1568 non-null   object 
 11  Species      1368 non-null   object 
 12  age_int      2124 non-null   int64  
 13  age_cat      2124 non-null   object 
 14  year_int     2124 non-null   int64  
 15  century      2124 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 273.8+ KB


In [47]:
df_cen20_total = df_cen20.Date.count()
print(df_cen20_total)
df_cen20.info()

3168
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3168 entries, 2225 to 5803
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         3168 non-null   object 
 1   Year         3168 non-null   float64
 2   Type         3167 non-null   object 
 3   Country      3148 non-null   object 
 4   Area         2971 non-null   object 
 5   Location     2920 non-null   object 
 6   Activity     2930 non-null   object 
 7   Sex          3168 non-null   object 
 8   Age          1866 non-null   object 
 9   Fatal (Y/N)  2911 non-null   object 
 10  Time         1365 non-null   object 
 11  Species      1752 non-null   object 
 12  age_int      3168 non-null   int64  
 13  age_cat      3168 non-null   object 
 14  year_int     3168 non-null   int64  
 15  century      3168 non-null   int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 408.4+ KB


In [48]:
df_cen19_total = df_cen19.Date.count()
print(df_cen19_total)
df_cen19.info()

531
<class 'pandas.core.frame.DataFrame'>
Int64Index: 531 entries, 4792 to 6384
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         531 non-null    object 
 1   Year         531 non-null    float64
 2   Type         530 non-null    object 
 3   Country      516 non-null    object 
 4   Area         446 non-null    object 
 5   Location     431 non-null    object 
 6   Activity     474 non-null    object 
 7   Sex          531 non-null    object 
 8   Age          77 non-null     object 
 9   Fatal (Y/N)  471 non-null    object 
 10  Time         89 non-null     object 
 11  Species      121 non-null    object 
 12  age_int      531 non-null    int64  
 13  age_cat      531 non-null    object 
 14  year_int     531 non-null    int64  
 15  century      531 non-null    int32  
dtypes: float64(1), int32(1), int64(2), object(12)
memory usage: 68.4+ KB


Não estamos a nem 1/4 do século e o número de incidentes com tubarões só está aproximadamente 1000 de diferença (Tbm talvez não se tenha todos os registros)

In [49]:
df_cen21.Sex.value_counts() / df_cen21_total

M    0.81968
F    0.18032
Name: Sex, dtype: float64

In [50]:
df_cen20.Sex.value_counts() / df_cen20_total

M    0.913826
F    0.086174
Name: Sex, dtype: float64

In [51]:
df_cen19.Sex.value_counts() / df_cen19_total

M    0.958569
F    0.041431
Name: Sex, dtype: float64

Reflete o comportamento da época. As mulheres não se faziam muita coisa além do que cuidar da casa.

In [52]:
df_cen21.Activity.value_counts() / df_cen21_total

Surfing                             0.313559
Swimming                            0.137006
Spearfishing                        0.061205
Fishing                             0.054614
Snorkeling                          0.038606
                                      ...   
Paddling on kneeboard               0.000471
Diving / fishing                    0.000471
Swimming after falling overboard    0.000471
Boogie-boarding / swimming          0.000471
Longline fishing for sharks         0.000471
Name: Activity, Length: 277, dtype: float64

In [53]:
df_cen20.Activity.value_counts() / df_cen20_total

Swimming                                                                                                                                                                    0.146780
Surfing                                                                                                                                                                     0.108270
Spearfishing                                                                                                                                                                0.065972
Fishing                                                                                                                                                                     0.065341
Wading                                                                                                                                                                      0.023043
                                                                                               

In [54]:
df_cen19.Activity.value_counts() / df_cen19_total

Swimming                             0.177024
Bathing                              0.156309
Fishing                              0.060264
Fell overboard                       0.041431
Diving                               0.022599
                                       ...   
Trailing hand in the water           0.001883
Swimming / floating on his back      0.001883
Fell overboard from the Sobella      0.001883
Standing, gathering oysters          0.001883
Wreck of the steamship Birkenhead    0.001883
Name: Activity, Length: 192, dtype: float64

# New heading