# Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from a Excel file into a Pandas DataFrame
df = pd.read_excel('GSAF5.xls')

In [3]:
# Check information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25792 entries, 0 to 25791
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8771 non-null   object 
 1   Date                    6558 non-null   object 
 2   Year                    6556 non-null   float64
 3   Type                    6552 non-null   object 
 4   Country                 6508 non-null   object 
 5   Area                    6091 non-null   object 
 6   Location                6008 non-null   object 
 7   Activity                6002 non-null   object 
 8   Name                    6343 non-null   object 
 9   Sex                     5987 non-null   object 
 10  Age                     3660 non-null   object 
 11  Injury                  6528 non-null   object 
 12  Fatal (Y/N)             6006 non-null   object 
 13  Time                    3139 non-null   object 
 14  Species                 3610 non-null 

In [4]:
# # Remove unnecessary spaces in the columns names
df.columns = df.columns.str.strip()
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:
# Remove unwanted columns
# The columns 'Name' and 'Investigator or Source' are not relevant information
# The columns 'Unnamed: 22' and 'Unnamed: 22' have no relevant information for the analysis.
# The columns 'pdf', 'href formula', 'href' do not contain relevant information for the analysis. They just list the 
# references.
# The columns 'Case Number.1', 'Case Number.2' and 'original order' are similar to the 'Case Number', so they also can be
# removed
df = df.drop(columns=['Name', 'Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2',
                      'original order', 'Unnamed: 22', 'Unnamed: 23'])

In [6]:
# Remove all rows that have NaN in all columns
# There is total of 25792 lines, but all columns do not have even 9000 rows with non-NaN.. Therefore, there are lots of
# rows that all columns are only missing values
df = df.dropna(how='all')

In [7]:
# Remove all rows that have less than 3 non-NaN
# If the row contains less than 3 non-NaN, there is basically no information about a shark accident
df = df.dropna(thresh=3)

In [8]:
# Check information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6558 entries, 0 to 6557
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  6557 non-null   object 
 1   Date         6558 non-null   object 
 2   Year         6556 non-null   float64
 3   Type         6552 non-null   object 
 4   Country      6508 non-null   object 
 5   Area         6091 non-null   object 
 6   Location     6008 non-null   object 
 7   Activity     6002 non-null   object 
 8   Sex          5987 non-null   object 
 9   Age          3660 non-null   object 
 10  Injury       6528 non-null   object 
 11  Fatal (Y/N)  6006 non-null   object 
 12  Time         3139 non-null   object 
 13  Species      3610 non-null   object 
dtypes: float64(1), object(13)
memory usage: 768.5+ KB


In [9]:
df

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,2020.08.20,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,Minor lacerations to left leg,N,11h00,
1,2020.08.14,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,Lacerations to right calf and posterior thigh,N,09h30,"White shark, 2-to 3m"
2,2020.08.10,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,Injury to left forearm by hooked shark PROVOKE...,N,16h00,"Blacktip shark, 6'"
3,2020.08.02,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,Lacerations to hand and wrist,N,14h00,"Nurse shark, 5'"
4,2020.07.31.c,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,Lacerations to foot,N,17h00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,FATAL,Y,,
6554,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,FATAL,Y,,
6555,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,FATAL,Y,,
6556,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,,


In [61]:
df.Year.unique()

array([2020., 2019., 2018., 2017.,   nan, 2016., 2015., 2014., 2013.,
       2012., 2011., 2010., 2009., 2008., 2007., 2006., 2005., 2004.,
       2003., 2002., 2001., 2000., 1999., 1998., 1997., 1996., 1995.,
       1984., 1994., 1993., 1992., 1991., 1990., 1989., 1969., 1988.,
       1987., 1986., 1985., 1983., 1982., 1981., 1980., 1979., 1978.,
       1977., 1976., 1975., 1974., 1973., 1972., 1971., 1970., 1968.,
       1967., 1966., 1965., 1964., 1963., 1962., 1961., 1960., 1959.,
       1958., 1957., 1956., 1955., 1954., 1953., 1952., 1951., 1950.,
       1949., 1948., 1848., 1947., 1946., 1945., 1944., 1943., 1942.,
       1941., 1940., 1939., 1938., 1937., 1936., 1935., 1934., 1933.,
       1932., 1931., 1930., 1929., 1928., 1927., 1926., 1925., 1924.,
       1923., 1922., 1921., 1920., 1919., 1918., 1917., 1916., 1915.,
       1914., 1913., 1912., 1911., 1910., 1909., 1908., 1907., 1906.,
       1905., 1904., 1903., 1902., 1901., 1900., 1899., 1898., 1897.,
       1896., 1895.,

# Age

In [10]:
df.Age.unique()

array([50, 35, 55, nan, 22, 14, 28, 38, 4, 63, 23, 11, 12, 10, 29, 15, 36,
       7, 16, 30, 60, 18, 9, 26, 57, 'Teen', 24, 59, 13, 75, 21, '30s',
       45, 33, 17, 37, 70, 44, '28 & 22', 32, 20, 51, '22, 57, 31', '60s',
       40, 49, "20's", 43, 8, 64, 19, 65, 67, 53, 34, 25, 58, 74, 46, 41,
       31, '9 & 60', 48, '20s', 42, 39, 56, 61, 'a minor', 6, 62, 52, 54,
       69, '40s', 3, 82, 73, 68, 47, 66, 72, 27, 71, '38', '39', '23',
       '32', '52', '68', '12', '18', '19', '43', '47', '6', '37', '9',
       '36', '10', '16', '13', '11', '17', '14', '30', '50', '29', '65',
       '63', '26', '71', '48', '70', '58', '18 months', '22', '41', '35',
       '57', '20', '24', '34', '15', '44', '53', '7', '40', '28', '33',
       '31', '45', '50s', '8', '51', '61', '42', '25', 'teen', '66', '21',
       '77', '46', '60', '74', '55', '27', '3', '56', '64', '28 & 26',
       '62', '5', '49', '54', '86', '59', '18 or 20', '12 or 13',
       '46 & 34', '28, 23 & 30', 'Teens', 77, '36 & 26', 

In [11]:
# Function to convert string to integer
def convert_str_to_int(age):
    try:
        age = int(age)
    except:
        age = -1
    return age

In [12]:
# Convert the strings in column 'Age' to integers
df['age_int'] = df.Age.apply(convert_str_to_int)

# Check the result
df

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,age_int
0,2020.08.20,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,Minor lacerations to left leg,N,11h00,,50
1,2020.08.14,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,Lacerations to right calf and posterior thigh,N,09h30,"White shark, 2-to 3m",35
2,2020.08.10,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,Injury to left forearm by hooked shark PROVOKE...,N,16h00,"Blacktip shark, 6'",55
3,2020.08.02,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,Lacerations to hand and wrist,N,14h00,"Nurse shark, 5'",-1
4,2020.07.31.c,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,Lacerations to foot,N,17h00,,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,FATAL,Y,,,-1
6554,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,FATAL,Y,,,-1
6555,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,FATAL,Y,,,-1
6556,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,,,-1


In [13]:
# Classification by age
df['age_cat'] = np.where(df['age_int'] > 65, 'Elder',
                        np.where(df['age_int'] > 35, 'Adult',
                                np.where(df['age_int'] > 17, 'Young Adult',
                                        np.where(df['age_int'] > 12, 'Teenager',
                                                 np.where(df['age_int'] == -1, '-', 'Child')))))

In [14]:
df

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,age_int,age_cat
0,2020.08.20,20-Aug-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Boogie boarding,F,50,Minor lacerations to left leg,N,11h00,,50,Adult
1,2020.08.14,14-Aug-2020,2020.0,Unprovoked,AUSTRALIA,New South Wales,"Shelly Beach, Port Macquarie",Surfing,F,35,Lacerations to right calf and posterior thigh,N,09h30,"White shark, 2-to 3m",35,Young Adult
2,2020.08.10,10-Aug-2020,2020.0,Provoked,USA,Florida,"Off Gasparilla Island, Charlotte County",Fishing,M,55,Injury to left forearm by hooked shark PROVOKE...,N,16h00,"Blacktip shark, 6'",55,Adult
3,2020.08.02,02-Aug-2020,2020.0,Unprovoked,USA,Virgin Islands,"Candle Reef, St. Croix",Snorkeling,F,,Lacerations to hand and wrist,N,14h00,"Nurse shark, 5'",-1,-
4,2020.07.31.c,31-Jul-2020,2020.0,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,F,22,Lacerations to foot,N,17h00,,22,Young Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,,FATAL,Y,,,-1,-
6554,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,M,,FATAL,Y,,,-1,-
6555,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,,FATAL,Y,,,-1,-
6556,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,M,,FATAL,Y,,,-1,-


# Gender

In [15]:
# Check the column 'Sex'
df.Sex.unique()

array(['F', 'M', nan, 'M ', 'lli', 'M x 2', 'N', '.'], dtype=object)

In [16]:
df.Sex.count()

5987

In [17]:
# Check number of missing values
print(f'The number of NaN in the column "Sex" is {df.Sex.isna().sum()}, which represents '
      f'{(df.Sex.isna().sum() / df.Sex.count())*100:.2f}% of the dataset.', sep=' ')

The number of NaN in the column "Sex" is 571, which represents 9.54% of the dataset.


Since the NaN represents only a samll part of the dataset, the rows containing NaN in the column 'Sex' will be removed.

In [18]:
# Remove rows that the column 'Sex' is a missing value
df[df.Sex.isna()]
df = df.drop(df[df.Sex.isna()].index)

In [19]:
# Check infromation about the changed dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5987 entries, 0 to 6557
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  5986 non-null   object 
 1   Date         5987 non-null   object 
 2   Year         5985 non-null   float64
 3   Type         5981 non-null   object 
 4   Country      5941 non-null   object 
 5   Area         5581 non-null   object 
 6   Location     5505 non-null   object 
 7   Activity     5583 non-null   object 
 8   Sex          5987 non-null   object 
 9   Age          3621 non-null   object 
 10  Injury       5972 non-null   object 
 11  Fatal (Y/N)  5516 non-null   object 
 12  Time         3032 non-null   object 
 13  Species      3287 non-null   object 
 14  age_int      5987 non-null   int64  
 15  age_cat      5987 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 795.1+ KB


In [20]:
# Check values in the column 'Sex'
df.Sex.value_counts()

M        5287
F         693
N           2
M           2
M x 2       1
lli         1
.           1
Name: Sex, dtype: int64

In [21]:
# Remove unnecessary spaces in the values of the column 'Sex'
df['Sex'] = df['Sex'].str.strip()
df.Sex.value_counts()

M        5289
F         693
N           2
M x 2       1
lli         1
.           1
Name: Sex, dtype: int64

In [22]:
# Remove rows that the gender is 'N', 'lli', '.' or 'M x 2'
df = df.drop(df[(df.Sex == '.') | (df.Sex == 'N') | (df.Sex == 'lli') | ((df.Sex == 'M x 2'))].index)
df.Sex.value_counts()

M    5289
F     693
Name: Sex, dtype: int64

In [23]:
# Check infromation about the changed dataset
# 5 columns were removed
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5982 entries, 0 to 6557
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  5981 non-null   object 
 1   Date         5982 non-null   object 
 2   Year         5980 non-null   float64
 3   Type         5976 non-null   object 
 4   Country      5937 non-null   object 
 5   Area         5577 non-null   object 
 6   Location     5502 non-null   object 
 7   Activity     5578 non-null   object 
 8   Sex          5982 non-null   object 
 9   Age          3620 non-null   object 
 10  Injury       5967 non-null   object 
 11  Fatal (Y/N)  5511 non-null   object 
 12  Time         3031 non-null   object 
 13  Species      3283 non-null   object 
 14  age_int      5982 non-null   int64  
 15  age_cat      5982 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 794.5+ KB


In [24]:
# Check column 'Sex' percentage
df_gener_total = df.Sex.value_counts().sum()
df.Sex.value_counts() / df_gener_total

M    0.884152
F    0.115848
Name: Sex, dtype: float64

In [25]:
print(f'The males represent {(df.Sex.value_counts() / df_gener_total)[0]*100:.2f}% of the people who were'
      f'involved in incidents with sharks, while women represent only {(df.Sex.value_counts() / df_gener_total)[1]*100:.2f}%.',sep=' ')

The males represent 88.42% of the people who wereinvolved in incidents with sharks, while women represent only 11.58%.


In [26]:
# Create a subset containing only females
df_male = df.loc[df['Sex'] == 'F']

In [27]:
# Create a subset containing only males
df_male = df.loc[df['Sex'] == 'M']
df_male.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5289 entries, 2 to 6557
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  5288 non-null   object 
 1   Date         5289 non-null   object 
 2   Year         5287 non-null   float64
 3   Type         5283 non-null   object 
 4   Country      5247 non-null   object 
 5   Area         4904 non-null   object 
 6   Location     4839 non-null   object 
 7   Activity     4942 non-null   object 
 8   Sex          5289 non-null   object 
 9   Age          3116 non-null   object 
 10  Injury       5275 non-null   object 
 11  Fatal (Y/N)  4881 non-null   object 
 12  Time         2599 non-null   object 
 13  Species      2884 non-null   object 
 14  age_int      5289 non-null   int64  
 15  age_cat      5289 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 702.4+ KB


In [34]:
# Check males age
df_male.age_cat.value_counts()

-              2257
Young Adult    1520
Adult           705
Teenager        541
Child           220
Elder            46
Name: age_cat, dtype: int64

In [36]:
df_male_total = df_male.Date.count()
df_male_total

5289

In [37]:
# Check males age
df_male.age_cat.value_counts() / df_male_total

-              0.426735
Young Adult    0.287389
Adult          0.133296
Teenager       0.102288
Child          0.041596
Elder          0.008697
Name: age_cat, dtype: float64

In [40]:
df_male_age = df_male.loc[df_male['age_cat'] != '-']
df_male_age_total = df_male_age.Date.count()
df_male_age.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3032 entries, 2 to 6557
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  3032 non-null   object 
 1   Date         3032 non-null   object 
 2   Year         3031 non-null   float64
 3   Type         3029 non-null   object 
 4   Country      3024 non-null   object 
 5   Area         2921 non-null   object 
 6   Location     2898 non-null   object 
 7   Activity     2920 non-null   object 
 8   Sex          3032 non-null   object 
 9   Age          3032 non-null   object 
 10  Injury       3028 non-null   object 
 11  Fatal (Y/N)  2850 non-null   object 
 12  Time         2080 non-null   object 
 13  Species      1926 non-null   object 
 14  age_int      3032 non-null   int64  
 15  age_cat      3032 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 402.7+ KB


In [42]:
df_male_age_total

3032

In [43]:
df_male_age.age_cat.value_counts()

Young Adult    1520
Adult           705
Teenager        541
Child           220
Elder            46
Name: age_cat, dtype: int64

In [44]:
df_male_age.age_cat.value_counts() / df_male_age_total

Young Adult    0.501319
Adult          0.232520
Teenager       0.178430
Child          0.072559
Elder          0.015172
Name: age_cat, dtype: float64

50% of male who were in a shark incident were Young Adults.

In [54]:
df_male_ya = df_male_age[df_male_age['age_cat'] == 'Young Adult']
df_male_ya.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1520 entries, 6 to 6497
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Case Number  1520 non-null   object 
 1   Date         1520 non-null   object 
 2   Year         1519 non-null   float64
 3   Type         1520 non-null   object 
 4   Country      1514 non-null   object 
 5   Area         1462 non-null   object 
 6   Location     1444 non-null   object 
 7   Activity     1468 non-null   object 
 8   Sex          1520 non-null   object 
 9   Age          1520 non-null   object 
 10  Injury       1518 non-null   object 
 11  Fatal (Y/N)  1441 non-null   object 
 12  Time         1064 non-null   object 
 13  Species      996 non-null    object 
 14  age_int      1520 non-null   int64  
 15  age_cat      1520 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory usage: 201.9+ KB


In [59]:
df_male_ya.groupby(by='Activity').age_cat.count().sort_values(ascending=False)

Activity
Surfing                                      417
Swimming                                     188
Spearfishing                                 154
Fishing                                       54
Body boarding                                 26
                                            ... 
Skindiving, fish at belt                       1
Snorkeling – hunting crayfish and abalone      1
Spearfishing & holding catch                   1
Spearfishing & lassoed shark                   1
 Diving for abalone (Hookah)                   1
Name: age_cat, Length: 364, dtype: int64

# **Young adult males surfing were the most involved in shark incidents.**