# Projeto Shark Attack

## Questão analisada: Quais tipos de atividades levam homens e mulheres a sofrerem o maior número de ataques de tubarão?

### Importação das bibliotecas

In [105]:
import pandas as pd
import numpy as np
import os
import re

In [106]:
# List the entries of the local 'data' directory.
os.listdir('data')

['.ipynb_checkpoints', 'attacks.csv']

## Conjunto de dados

In [107]:
# Load the raw attacks table; the file is decoded as latin-1.
csv_path = 'data/attacks.csv'
data = pd.read_csv(csv_path, encoding='latin-1')

In [108]:
# Normalise the column labels: lower-case, strip leading whitespace, then
# apply the substitutions below in order (existing underscores are removed
# first; spaces and dots become underscores; a leftover ':_' collapses to '_').
# Only leading whitespace is stripped, so a label ending in a space ends up
# with a trailing underscore (e.g. 'sex_', 'species_').
_LABEL_SUBSTITUTIONS = (('_', ''), (' ', '_'), ('.', '_'), (':_', '_'))


def _normalise_label(label):
    """Return the cleaned-up version of a single column label."""
    cleaned = label.lower().lstrip()
    for old, new in _LABEL_SUBSTITUTIONS:
        cleaned = cleaned.replace(old, new)
    return cleaned


attack_rename = {label: _normalise_label(label) for label in data}
data = data.rename(columns=attack_rename)

In [109]:
# Preview the first five rows (head() defaults to 5).
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,...,species_,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,unnamed_22,unnamed_23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [110]:
# (rows, columns) of the table as loaded.
data.shape

(25723, 24)

In [111]:
# Look at the 'type' column.
data['type']

0           Boating
1        Unprovoked
2           Invalid
3        Unprovoked
4          Provoked
            ...    
25718           NaN
25719           NaN
25720           NaN
25721           NaN
25722           NaN
Name: type, Length: 25723, dtype: object

In [112]:
# Number of missing values in each column.
data.isna().sum()

case_number               17021
date                      19421
year                      19423
type                      19425
country                   19471
area                      19876
location                  19961
activity                  19965
name                      19631
sex_                      19986
age                       22252
injury                    19449
fatal_(y/n)               19960
time                      22775
species_                  22259
investigator_or_source    19438
pdf                       19421
href_formula              19422
href                      19421
case_number_1             19421
case_number_2             19421
original_order            19414
unnamed_22                25722
unnamed_23                25721
dtype: int64

# Removendo Duplicatas

In [113]:
# Ratio of non-duplicated rows to duplicated rows.
# The duplicate mask is built once and counted with pandas' vectorised sum
# instead of Python's builtin sum(), which walks the Series element by element.
duplicate_mask = data.duplicated()
n_duplicates = int(duplicate_mask.sum())
(len(duplicate_mask) - n_duplicates) / n_duplicates

0.325176446344856

In [114]:
# Keep the first occurrence of every row and discard its repeats.
data = data.drop_duplicates()

In [115]:
# Count the rows still flagged as duplicates.
data.duplicated().sum()

0

# Removendo colunas

In [116]:
# Column labels currently in the table.
data.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex_', 'age', 'injury', 'fatal_(y/n)', 'time',
       'species_', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number_1', 'case_number_2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

In [117]:
# Columns removed from the working table.
columns_to_drop = [
    'case_number', 'date', 'year', 'area', 'name', 'injury', 'time',
    'investigator_or_source', 'pdf', 'href_formula', 'href',
    'case_number_1', 'case_number_2', 'original_order',
    'unnamed_22', 'unnamed_23',
]
data = data.drop(columns=columns_to_drop)

# Tratamento de dados

## Valores Ausentes

In [118]:
# Number of missing values in each remaining column.
data.isna().sum()

type             14
country          60
location        550
activity        554
sex_            575
age            2841
fatal_(y/n)     549
species_       2848
dtype: int64

In [119]:
# Summary of the frame: index, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6312 entries, 0 to 25722
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         6298 non-null   object
 1   country      6252 non-null   object
 2   location     5762 non-null   object
 3   activity     5758 non-null   object
 4   sex_         5737 non-null   object
 5   age          3471 non-null   object
 6   fatal_(y/n)  5763 non-null   object
 7   species_     3464 non-null   object
dtypes: object(8)
memory usage: 443.8+ KB


In [120]:
# Preview the first rows after the column drop.
data.head()

Unnamed: 0,type,country,location,activity,sex_,age,fatal_(y/n),species_
0,Boating,USA,"Oceanside, San Diego County",Paddling,F,57.0,N,White shark
1,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11.0,N,
2,Invalid,USA,"Habush, Oahu",Surfing,M,48.0,N,
3,Unprovoked,AUSTRALIA,Arrawarra Headland,Surfing,M,,N,2 m shark
4,Provoked,MEXICO,La Ticla,Free diving,M,,N,"Tiger shark, 3m"


## Valores coluna por sexo

In [121]:
# Distinct values present in the sex column (including NaN).
data['sex_'].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [122]:
# Number of entries in the sex column (missing values included).
data['sex_'].size

6312

In [123]:
# Frequency of each non-missing value in the sex column.
data['sex_'].value_counts()

M      5094
F       637
M         2
N         2
lli       1
.         1
Name: sex_, dtype: int64

In [124]:
# Clean the sex column.
# Values are normalised by trimming whitespace and upper-casing (so 'M '
# becomes 'M'); the earlier case-insensitive substring test for 'M ' would
# relabel any value that merely contains "m " as male.
# Rows are then kept only when the code is 'M'/'F' or missing, which discards
# junk such as 'lli', 'N' and '.' without listing each one by hand. Missing
# values are deliberately kept at this step, as before.
valid_sex_codes = ['M', 'F']
sex_clean = data['sex_'].str.strip().str.upper()
keep_row = sex_clean.isin(valid_sex_codes) | sex_clean.isna()
# .copy() gives an independent frame so later column assignments on `data`
# do not trigger chained-assignment warnings.
data = data.assign(sex_=sex_clean).loc[keep_row].copy()
data['sex_'].value_counts()

M    5096
F     637
Name: sex_, dtype: int64

In [95]:
# Drop the records whose sex is missing.
# NOTE(review): this cell carries execution count 95 while its neighbours are
# at 124/125, and the outputs further down still list rows with a missing
# sex_ - it looks like it was not re-run in the saved session. Confirm with
# Restart & Run All; running it would change the counts shown below.
data = data.dropna(subset = ['sex_'])

## Análise entre gêneros

In [125]:
# Subset with the records of female victims.
# NOTE(review): this split is taken before `activity_rename` is created
# further down, so the subset will not carry that column unless it is
# recomputed later.
is_female = data['sex_'].eq('F')
data_women = data[is_female]

In [85]:
# Subset with the records of male victims.
# NOTE(review): this cell shows execution count 85, older than the cells
# around it, and it precedes the creation of `activity_rename`; re-run in
# order and confirm the subset is built from the cleaned table.
is_male = data['sex_'].eq('M')
data_men = data[is_male]

## Valores colunas por atividades

In [126]:
# How many records have no activity recorded.
data['activity'].isnull().sum()

554

In [127]:
# For each column, whether it contains at least one missing value.
data.isnull().any() 

type           True
country        True
location       True
activity       True
sex_           True
age            True
fatal_(y/n)    True
species_       True
dtype: bool

In [128]:
# For each row, whether every column is missing.
data.isnull().all(axis=1)

0        False
1        False
2        False
3        False
4        False
         ...  
6307      True
6308      True
6309      True
8702      True
25722     True
Length: 6308, dtype: bool

In [129]:
# Present (False) versus missing (True) activity values, as counts.
data['activity'].isnull().value_counts()

False    5754
True      554
Name: activity, dtype: int64

In [134]:
# Same split expressed as a fraction of all rows.
data['activity'].isnull().value_counts(normalize=True)

False    0.912175
True     0.087825
Name: activity, dtype: float64

In [135]:
# Look at the raw activity descriptions.
data['activity']

0           Paddling
1           Standing
2            Surfing
3            Surfing
4        Free diving
            ...     
6307             NaN
6308             NaN
6309             NaN
8702             NaN
25722            NaN
Name: activity, Length: 6308, dtype: object

In [137]:
# Map free-text activities onto coarse categories by case-insensitive keyword.
# The pairs are applied in order and a later match overwrites an earlier one,
# so an activity matching several keywords ends up in the last matching
# category listed here.
activity_keywords = [
    ("Swim", 'swimming'),
    ("Surf", 'surfing'),
    ("Fish", 'fishing'),
    ("Div", 'diving'),
    ("Bath", 'bathing'),
    ("stan", 'standing'),
    ("wadi", 'wading'),
    ("paddling", 'rowing'),
]
for keyword, category in activity_keywords:
    matches = data['activity'].str.contains(keyword, case=False, na=False)
    data.loc[matches, 'activity_rename'] = category

In [138]:
# Everything left unclassified becomes 'other'; this includes records whose
# activity is missing.
data['activity_rename'] = data['activity_rename'].fillna('other')

In [139]:
# Number of records in each activity category.
data['activity_rename'].value_counts()

other       1750
surfing     1182
fishing     1142
swimming    1087
diving       602
bathing      191
wading       179
standing     153
rowing        22
Name: activity_rename, dtype: int64

In [140]:
# Display the table with the new activity_rename column.
data

Unnamed: 0,type,country,location,activity,sex_,age,fatal_(y/n),species_,activity_rename
0,Boating,USA,"Oceanside, San Diego County",Paddling,F,57,N,White shark,rowing
1,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11,N,,standing
2,Invalid,USA,"Habush, Oahu",Surfing,M,48,N,,surfing
3,Unprovoked,AUSTRALIA,Arrawarra Headland,Surfing,M,,N,2 m shark,surfing
4,Provoked,MEXICO,La Ticla,Free diving,M,,N,"Tiger shark, 3m",diving
...,...,...,...,...,...,...,...,...,...
6307,,,,,,,,,other
6308,,,,,,,,,other
6309,,,,,,,,,other
8702,,,,,,,,,other


## Agrupamento

## Tipos de incidentes e sua classificação:

#### Não provocado - Quando um tubarão percebe um humano como uma ameaça ou concorrente de uma fonte de alimento

#### Provocado - GSAF define um incidente provocado como aquele em que o tubarão foi espetado, fisgado, capturado ou em que um humano tirou o "primeiro sangue"

#### Incidentes envolvendo embarcações – Incidentes em que um barco foi mordido ou abalroado por um tubarão. No entanto, os casos em que o tubarão foi fisgado, apanhado com rede ou arpoado são classificados como incidentes provocados.

#### Incidentes questionáveis - Incidentes em que não há dados suficientes para determinar se a lesão foi causada por um tubarão ou a pessoa se afogou e o corpo foi posteriormente recuperado por tubarões. 


In [141]:
# Incident types recorded: Unprovoked / Provoked / Invalid / Sea Disaster /
# Boating / Boat / Questionable / Boatomg (1 record).
# NOTE(review): 'Boatomg' looks like a misspelling of 'Boating' - confirm.
data.value_counts("type")

type
Unprovoked      4594
Provoked         573
Invalid          547
Sea Disaster     238
Boating          202
Boat             137
Questionable       2
Boatomg            1
dtype: int64