# Projeto Shark Attack

## Questão analisada: Quais tipos de atividades levam homens e mulheres a sofrerem maiores números de ataques de tubarão

### Importação da biblioteca

In [67]:
import pandas as pd
import numpy as np
import os
import re

In [68]:
os.listdir('data')

['.ipynb_checkpoints', 'attacks.csv']

## Conjunto de dados

In [69]:
data = pd.read_csv('data/attacks.csv', encoding = 'latin-1' )

In [70]:
#Colunas em minúsculo; com remoção de caracteres iniciais (lstrip); replace de espaçamento, pontos, etc 

attack_rename = {column : column.lower().lstrip().replace('_', '').replace(' ', '_').replace('.', '_').replace(':_', '_') for column in data}
data = data.rename(attack_rename, axis = 1)

In [71]:
data.head(5)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex_,...,species_,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,unnamed_22,unnamed_23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [72]:
data.shape

(25723, 24)

In [75]:
data.isnull().sum()

case_number               17021
date                      19421
year                      19423
type                      19425
country                   19471
area                      19876
location                  19961
activity                  19965
name                      19631
sex_                      19986
age                       22252
injury                    19449
fatal_(y/n)               19960
time                      22775
species_                  22259
investigator_or_source    19438
pdf                       19421
href_formula              19422
href                      19421
case_number_1             19421
case_number_2             19421
original_order            19414
unnamed_22                25722
unnamed_23                25721
dtype: int64

# Removendo Duplicatas

In [76]:
sum(~data.duplicated())/sum(data.duplicated())

0.325176446344856

In [77]:
data = data.loc[~data.duplicated(),]

In [78]:
data.duplicated().sum()

0

# Removendo colunas

In [27]:
data.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex_', 'age', 'injury', 'fatal_(y/n)', 'time',
       'species_', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number_1', 'case_number_2', 'original_order', 'unnamed_22',
       'unnamed_23'],
      dtype='object')

In [28]:
data = data.drop(['case_number', 'date', 'year', 'area', 'name','injury', 'time', 'investigator_or_source', 'pdf', 'href_formula', 'href', 'case_number_1', 'case_number_2', 'original_order', 'unnamed_22','unnamed_23'], axis = 1)

# Tratando valores

## Valores ausentes

In [29]:
data.isna().sum()

type             14
country          60
location        550
activity        554
sex_            575
age            2841
fatal_(y/n)     549
species_       2848
dtype: int64

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6312 entries, 0 to 25722
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   type         6298 non-null   object
 1   country      6252 non-null   object
 2   location     5762 non-null   object
 3   activity     5758 non-null   object
 4   sex_         5737 non-null   object
 5   age          3471 non-null   object
 6   fatal_(y/n)  5763 non-null   object
 7   species_     3464 non-null   object
dtypes: object(8)
memory usage: 443.8+ KB


In [31]:
data.head()

Unnamed: 0,type,country,location,activity,sex_,age,fatal_(y/n),species_
0,Boating,USA,"Oceanside, San Diego County",Paddling,F,57.0,N,White shark
1,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11.0,N,
2,Invalid,USA,"Habush, Oahu",Surfing,M,48.0,N,
3,Unprovoked,AUSTRALIA,Arrawarra Headland,Surfing,M,,N,2 m shark
4,Provoked,MEXICO,La Ticla,Free diving,M,,N,"Tiger shark, 3m"


## Valores coluna por sexo

In [79]:
data['sex_'].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [80]:
len(data['sex_'])

6312

In [81]:
data['sex_'].value_counts()

M      5094
F       637
M         2
N         2
lli       1
.         1
Name: sex_, dtype: int64

In [82]:
data.loc[data['sex_'].str.contains('M ', case=False, na=False), 'sex_'] = 'M'
data.drop(data[data['sex_'] == 'lli'].index, inplace=True)
data.drop(data[data['sex_'] == 'N'].index, inplace=True)
data.drop(data[data['sex_'] == '.'].index, inplace=True)
data['sex_'].value_counts()

M    5096
F     637
Name: sex_, dtype: int64

In [83]:
len(data['sex_'])

6308

In [86]:
data = data.dropna(subset = ['sex_'])

### Análise entre generos

In [89]:
data_women = data.loc[data['sex_']=='F']
len(data_women)

637

In [88]:
data_men = data.loc[data['sex_']=='M']
len(data_men)

5096

## Valores colunas por atividades

In [39]:
data['activity'].isnull().sum()

393

In [40]:
data.isnull().any() 

type            True
country         True
location        True
activity        True
sex_           False
age             True
fatal_(y/n)     True
species_        True
dtype: bool

In [41]:
data.isnull().all(axis=1)

0       False
1       False
2       False
3       False
4       False
        ...  
6297    False
6298    False
6299    False
6300    False
6301    False
Length: 5733, dtype: bool

In [42]:
data['activity'].isnull().value_counts()

False    5340
True      393
Name: activity, dtype: int64

In [43]:
data['activity'].isnull().value_counts(normalize=True)

False    0.93145
True     0.06855
Name: activity, dtype: float64

In [91]:
data['activity'].shape

(5733,)

In [45]:
data.loc[data['activity'].str.contains("Swim", case=False, na=False), 'activity_rename'] = 'swimming'
data.loc[data['activity'].str.contains("Surf", case=False, na=False), 'activity_rename'] = 'surfing'
data.loc[data['activity'].str.contains("Fish", case=False, na=False), 'activity_rename'] = 'fishing'
data.loc[data['activity'].str.contains("Div", case=False, na=False), 'activity_rename'] = 'diving'
data.loc[data['activity'].str.contains("Bath", case=False, na=False), 'activity_rename'] = 'bathing'
data.loc[data['activity'].str.contains("stan", case=False, na=False), 'activity_rename'] = 'standing'
data.loc[data['activity'].str.contains("wadi", case=False, na=False), 'activity_rename'] = 'wading'
data.loc[data['activity'].str.contains("paddling", case=False, na=False), 'activity_rename'] = 'rowing'

In [46]:
data.loc[data['activity_rename'].isna(),'activity_rename'] = 'other'

In [47]:
data_activity = data['activity_rename'].value_counts()

In [92]:
data_activity.shape

(9,)

# Contagem vitimas por atividade em percentual

In [50]:
data.head()

Unnamed: 0,type,country,location,activity,sex_,age,fatal_(y/n),species_,activity_rename
0,Boating,USA,"Oceanside, San Diego County",Paddling,F,57.0,N,White shark,rowing
1,Unprovoked,USA,"St. Simon Island, Glynn County",Standing,F,11.0,N,,standing
2,Invalid,USA,"Habush, Oahu",Surfing,M,48.0,N,,surfing
3,Unprovoked,AUSTRALIA,Arrawarra Headland,Surfing,M,,N,2 m shark,surfing
4,Provoked,MEXICO,La Ticla,Free diving,M,,N,"Tiger shark, 3m",diving


In [54]:
analise_activity = pd.pivot_table(data, index= 'activity_rename', columns = 'sex_', values='activity', aggfunc='count')

In [98]:
analise_activity.loc[:,'calculo_homens']=analise_activity['M'] / analise_activity.sum(axis = 1)

In [99]:
analise_activity.loc[:,'calculo_mulheres']=analise_activity['F'] / analise_activity.sum(axis = 1)

In [63]:
analise_activity

sex_,F,M,calculo_homens,calculo_mulheres
activity_rename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bathing,22,162,0.880435,0.118996
diving,32,550,0.945017,0.054894
fishing,27,944,0.972194,0.027779
other,158,881,0.847931,0.151945
rowing,5,16,0.761905,0.229759
standing,37,116,0.75817,0.240638
surfing,71,1082,0.938422,0.061528
swimming,175,885,0.834906,0.164964
wading,57,120,0.677966,0.320805


In [97]:
analise_activity['M'] / analise_activity.sum(axis = 1)*100 #calculo percentual por homens

activity_rename
bathing     87.570078
diving      94.339899
fishing     97.119444
other       84.711615
rowing      72.864901
standing    75.327652
surfing     93.760902
swimming    83.411948
wading      67.417631
dtype: float64

In [95]:
analise_activity['F'] / analise_activity.sum(axis = 1)*100 #calculo percentual por mulheres

activity_rename
bathing     11.892234
diving       5.488867
fishing      2.777781
other       15.192321
rowing      22.771462
standing    24.026927
surfing      6.152518
swimming    16.493888
wading      32.023377
dtype: float64