In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import regex as re
import src.cleanfun as cf

In [2]:
#cargamos csv
sharks = pd.read_csv("data/attacks.csv",encoding = "ISO-8859-1")

In [3]:
sharks.sample(5)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
13764,,,,,,,,,,,...,,,,,,,,,,
13077,,,,,,,,,,,...,,,,,,,,,,
3553,1966.09.10,10-Sep-1966,1966.0,Unprovoked,AUSTRALIA,Western Australia,Roe Reef off Rottnest Island,Spearfishing,Frank Paxman,M,...,"Mako shark, 1.9 m [6.5']","The West Australian (Perth), 9/14/1966; H.D.Ba...",1966.09.10-Paxman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1966.09.10,1966.09.10,2750.0,,
25219,,,,,,,,,,,...,,,,,,,,,,
8252,0,,,,,,,,,,...,,,,,,,,,,


In [4]:
#eliminamos todas las columnas y filas que todos sus valores sean nulos
sharks.dropna(how="all", inplace=True)
sharks.dropna(how="all",axis = 1, inplace=True)

# Hipótesis
- hipotesis 1 los ataques de tiburones han icrmentado según han ido avanzando los años
- hipótesis 2 los ataquesde tiburones atacan más a embarcaciones que a nadadores
- [hipóstesis 3 Florida es la capital mundial de los ataques de tiburones](https://www.lavanguardia.com/ocio/viajes/20210407/6631447/6-playas-mas-peligrosas-mundo.html)
- [hipótesis 4 ¿Es posible el ataque de tiburón en la costa española?](https://www.mundo-geo.es/naturaleza/es-posible-ataque-tiburon-en-costa-espanola_238643_102.html)

In [5]:
#compruebo qué colmunas tienen datos interesantes para mis hipótesis y borro las que no me hacen ninguna falta.
sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [6]:
columnas_no = ['Case Number','Investigator or Source','pdf', 'href formula',  'href',  'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'] 
sharks.drop(columnas_no, axis=1, inplace=True)

In [7]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species '],
      dtype='object')

In [8]:
#ahora que hemos borrado columnas vemos si quedan filas que son nan por completo y borrranos
sharks.dropna(how="all", inplace=True)

In [9]:
#cambio nombre de algunas columnas que tienen espacios o caracteres especiales en el nombre
#"Sex " "Species " "Fatal (Y/N)"
cols = ["Sex ","Species ","Fatal (Y/N)"]
new_names = {"Sex " : "Sex",
                 "Species " : "Species",
                 "Fatal (Y/N)" : "Fatal"}
sharks.rename(columns = new_names, inplace=True)
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species'],
      dtype='object')

In [10]:
#segundo repaso a datos para ver si borramos alguna columna más que no me de información necesaria para las hipótesis
sharks.sample(20)
sharks.drop(["Name"],axis=1, inplace=True)

### Limpieza de datos nulos

In [11]:
#miramos cuantos datos nulos hay en cda columna.
sharks.isna().sum()

Date           0
Year           2
Type           4
Country       50
Area         455
Location     540
Activity     544
Sex          565
Age         2831
Injury        28
Fatal        539
Time        3354
Species     2838
dtype: int64

#### rellenamos los nulos en función de los datos que contienen las columnas


In [12]:
sharks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6302 entries, 0 to 6301
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      6302 non-null   object 
 1   Year      6300 non-null   float64
 2   Type      6298 non-null   object 
 3   Country   6252 non-null   object 
 4   Area      5847 non-null   object 
 5   Location  5762 non-null   object 
 6   Activity  5758 non-null   object 
 7   Sex       5737 non-null   object 
 8   Age       3471 non-null   object 
 9   Injury    6274 non-null   object 
 10  Fatal     5763 non-null   object 
 11  Time      2948 non-null   object 
 12  Species   3464 non-null   object 
dtypes: float64(1), object(12)
memory usage: 689.3+ KB


In [13]:
#Year
sharks.Year.unique()
#rellenamos años vacíos con 0
sharks.Year.fillna(0.0,inplace= True)
#convertirmos tipo de dato (float) a int.
sharks.Year = sharks.Year.astype(dtype = "int64")

In [14]:
#type
sharks.Type.unique() #['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable','Sea Disaster', nan, 'Boat', 'Boatomg']
sharks[sharks["Type"] == "Invalid"].sample(20)
#invalid NO nos sirve para los nulos porque es para ataques que no fueron de tiburón
#mirar a ver si podemos eliminar esas filas porque queremos estudiar solo los casos reales de tiburones.
#nos da mucha información si el ataque fue en embarcación o direcgtamente a "nadador"
sharks.Type.fillna("UNKNOWN",inplace= True)
sharks.Type = sharks.Type.str.strip() #borramos espacios
sharks = sharks[sharks["Type"] != "Invalid"]

In [15]:
#Country, Area y Location -> nans == UNKNOWN
sharks["Country"].fillna("UNKNOWN", inplace = True)
sharks["Area"].fillna("UNKNOWN", inplace = True)
sharks["Location"].fillna("UNKNOWN", inplace = True)
sharks.Country = sharks.Country.str.strip()
sharks.Area = sharks.Area.str.strip()
sharks.Location = sharks.Location.str.strip()

In [16]:
#activity
list(sharks.Activity.unique())
#unknown
#de esta columna podemos sacar datos sobre si el tiburón atacó a embarcación o persona.
sharks["Activity"].fillna("UNKNOWN", inplace = True)
sharks.Activity = sharks.Activity.str.strip()

In [17]:
#sex
sharks.Sex.unique()
#si el sexo de la víctima es desconocido lo rellenamos con "X"
#además queremos que se queden tres datos únicos -> M,F,X
sharks.Sex.fillna("X",inplace = True)
sharks.Sex = sharks.Sex.str.strip() #borramos espacios vacíos antes y después de cada cadena.
sharks.Sex.unique() #['F', 'M', 'X', 'lli', 'N', '.'] > lli, N y . == X
sharks["Sex"].replace('lli',"X",inplace=True)
sharks["Sex"].replace('.',"X",inplace=True)
sharks["Sex"].replace('N',"X",inplace=True)

In [18]:
#Age
sharks.Age.unique()
#rellenamos nans con "UNKNOWN" porque 0 puede confundir la media si luego necesitamos usarla.
sharks.Age.fillna("UNKNOWN",inplace= True)
sharks.Age = sharks.Age.str.strip()

In [19]:
#Injury > nan = UNKNOWN
sharks.Injury.unique()
sharks["Injury"].fillna("UNKNOWN", inplace = True)

In [20]:
#Fatal
sharks.Fatal.unique
#nulos = UNKNOWN
sharks["Fatal"].fillna("UNKNOWN", inplace = True)
sharks.Fatal = sharks.Fatal.str.strip()
sharks.Fatal.unique() #comprobar registros con FAtal == 'M', '2017', 'y'
sharks[(sharks["Fatal"] == 'M')|(sharks["Fatal"] == "2017")|(sharks["Fatal"] == "y")].Injury.unique
sharks["Fatal"].replace('M',"N",inplace=True)
sharks["Fatal"].replace('y',"Y",inplace=True)
sharks["Fatal"].replace('2017',"N",inplace=True)
sharks.Fatal.unique() #['N', 'Y', 'UNKNOWN']

array(['N', 'Y', 'UNKNOWN'], dtype=object)

In [21]:
#Time
sharks.Time.unique()
#nulos = UNKNOWN
sharks["Time"].fillna("UNKNOWN", inplace = True)
#puede que esta columna no la utlicemos para ningún análisis. No nos interesan los datos tan exactos de la hora del ataque.

In [22]:
#Species
list(sharks.Species.unique())
sharks["Species"].fillna("UNKNOWN", inplace = True)
sharks.Species = sharks.Species.str.strip()

In [23]:
sharks.isna().sum()

Date        0
Year        0
Type        0
Country     0
Area        0
Location    0
Activity    0
Sex         0
Age         0
Injury      0
Fatal       0
Time        0
Species     0
dtype: int64

#### precesamos el texto de las columnas

##### Date, Year.

In [24]:
sharks.Date.value_counts()
#de la columna Date extraemos solo las fechas
sharks["Year_D"] = sharks["Date"].str.extract(r'(\d{4})')
sharks["Month_D"] = sharks["Date"].str.extract(r'([A-Z][a-z][a-z]-\d{4})')
sharks["Month_D"] = sharks["Month_D"].str.extract(r'([A-Z][a-z][a-z])')
#volvemos a rellenar los nulos que quedan 
#sharks["Date"].fillna("UNKNOWN", inplace = True)

In [25]:
sharks.Year_D.unique() #los registros de esta columna parecen más concretos que la de Year
#rellenamos nans de esta columna y tanbién de la columna Month. elimnamos Year.
sharks.Year_D.fillna("UNKNOWN",inplace= True)
sharks.Month_D.fillna("UNKNOWN",inplace= True)
#convertimos los strings a enteros
sharks.Year_D = sharks.Year_D.apply(cf.entero_if)

In [26]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species', 'Year_D',
       'Month_D'],
      dtype='object')

##### Type

In [27]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Provoked', 'Questionable',
       'Sea Disaster', 'UNKNOWN', 'Boat', 'Boatomg'], dtype=object)

In [28]:
sharks["Type"] = sharks["Type"].str.replace((".*[Bb](OAT|oat).*"),"Boat", regex=True)
sharks.Type.unique()

array(['Boat', 'Unprovoked', 'Provoked', 'Questionable', 'Sea Disaster',
       'UNKNOWN'], dtype=object)

#### Activity

In [29]:
sharks["Act"] = sharks.Activity.str.extract(r'(\w+ing)+')

In [30]:
sharks["Act"].fillna("UNKONWN", inplace = True)

In [31]:
sharks.Act.unique

<bound method Series.unique of 0       Paddling
1       Standing
3        Surfing
4         diving
5        surfing
          ...   
6297      Diving
6298      diving
6299    Swimming
6300     UNKONWN
6301    Swimming
Name: Act, Length: 5755, dtype: object>

#### Activity -> "Fishing"
- como en Type tenemos "Boat" para saber si hay ataques direcatemente a los barcos, pero luego en activity "Fishing" la mayoría son desde barcos, vamos a crear una columna "Boat" incluyendo todos los registros que son type "Boat" además de los que no son "Boat" pero son Fishin

In [32]:
#filtramos todos registros que son tipo Boat y de los que no son tipo Boat pero son activity fishing para ver cuantos realemnte van en barco
sharks["Boat"] = np.where(((sharks["Type"] == "Boat")|((sharks["Type"] != "Boat") & (sharks["Act"] == "Fishing"))),True,False)

In [33]:
sharks.Boat.value_counts()

False    4984
True      771
Name: Boat, dtype: int64

#### Sex

In [34]:
sharks.Sex.unique()

array(['F', 'M', 'X'], dtype=object)

#### Age

In [35]:
#dejamos solos los valores de edades de Age:
sharks.Age = sharks.Age.apply(cf.edades)
sharks.Age.unique()

array(['57', '11', 'UNKNOWN', '18', '52', '15', '12', '32', '10', '34',
       '30', '60', '33', '29', '54', '41', '37', '19', '25', '69', '38',
       '55', '35', '45', '40', '28', '20', '24', '26', '49', '14', '22',
       '7', '31', '17', '13', '42', '3', '50', '46', '16', '82', '48',
       '21', '51', '39', '58', '47', '61', '65', '73', '36', '66', '43',
       '9', '72', '59', '6', '64', '23', '71', '44', '27', '62', '68',
       '63', '70', '18 months', '53', '8', '77', '74', '56', '5', '86',
       '84', '87', '75', '9 months', '1', '81', '78', '2'], dtype=object)

In [36]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species', 'Year_D', 'Month_D',
       'Act', 'Boat'],
      dtype='object')

#### Injury

In [37]:
sharks["Injur_Type"] = sharks.Injury.apply(cf.actividades)

In [38]:
sharks.Injur_Type.unique()

array(['No injury', 'Minor injuries', 'Lacerations', 'Fatal injury',
       'Bitten', 'Injured', 'Others', 'Punctured', 'Limb severed',
       'No details', 'Unknown'], dtype=object)

#### Fatal

In [39]:
sharks.Fatal.unique()

array(['N', 'Y', 'UNKNOWN'], dtype=object)

#### Time
no vamos a usar esta columna, porque para nuestros datos no nos intersa a qué hora pasaron los incidentes

### species

In [40]:
sharks["Spec"] = sharks.Species.str.extract(r'(\w{3,}(?= shark))')

In [41]:
sharks["Spec"].fillna("UNKONWN", inplace = True)

In [42]:
sharks.Spec.unique()

array(['White', 'UNKONWN', 'Tiger', 'Lemon', 'Bull', 'reef', 'nurse',
       'bull', 'Wobbegong', 'Blacktip', 'white', 'Galapagos', 'small',
       'Nurse', 'blue', 'Cookiecutter', 'Spinner', 'blacktip', 'whitetip',
       'Sandtiger', 'Blue', 'gill', 'sevengill', 'Angel', 'dogfish',
       'Mako', 'whaler', 'Reef', 'Silky', 'juvenile', 'Hammerhead',
       'spinner', 'foot', 'Raggedtooth', 'Goblin', 'tiger', 'metre',
       'Sandbar', 'Cow', 'Salmon', 'Porbeagle', 'Jackson', 'sandtiger',
       'Sevengill', 'Zambesi', 'lemon', '30kg', 'hammerhead', 'Thresher',
       'whale', 'cutter', 'Dusky', 'smoothhound', 'Basking', 'Sand',
       'sandbar', 'same', 'copper', 'mako', 'brown', 'colored', 'cow',
       'sand', 'captive', 'bonnethed', 'finned', 'dusky', 'Soupfin',
       'young', 'Leopard', 'Small', 'Unidentified', 'grey', 'female',
       'Two', 'gaffed', 'silvertip', 'Zambezi', 'silky', 'gray',
       'thresher', 'Whale', 'Carpet', 'Copper', 'porbeagle', 'Dog', 'for',
       'carpe

### country // Areas

In [43]:
sharks.Area.value_counts().sample(20) #las areas nos interesan sobre todo Florida en comparación con el resto...

Elqui Province                          1
330 to 350 miles east of Wake Island    1
Balearics                               1
Manila                                  3
St. Johns Reef                          3
Antalya Province                        1
Cyrenaica                               1
Providenciales                          1
Between Perth & Colombo                 1
North of Pernambuco, Brazil             1
Kagawa Prefecture                       1
Isla Provedencia                        1
Rhode Island                            7
Paraiba                                 1
Las Perlas archipelago                  1
Aden                                    6
Tongatapu                               1
Fernando de Noronha                     1
Pacific coast                           1
Saipan                                  1
Name: Area, dtype: int64

## Exportación

In [44]:
sharks

Unnamed: 0,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Species,Year_D,Month_D,Act,Boat,Injur_Type,Spec
0,25-Jun-2018,2018,Boat,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,2018,Jun,Paddling,True,No injury,White
1,18-Jun-2018,2018,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,UNKNOWN,2018,Jun,Standing,False,Minor injuries,UNKONWN
3,08-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,M,UNKNOWN,Minor injury to lower leg,N,UNKNOWN,2 m shark,2018,Jun,Surfing,False,Minor injuries,UNKONWN
4,04-Jun-2018,2018,Provoked,MEXICO,Colima,La Ticla,Free diving,M,UNKNOWN,Lacerations to leg & hand shark PROVOKED INCIDENT,N,UNKNOWN,"Tiger shark, 3m",2018,Jun,diving,False,Lacerations,Tiger
5,03-Jun-2018,2018,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,M,UNKNOWN,"No injury, board bitten",N,UNKNOWN,UNKNOWN,2018,Jun,surfing,False,No injury,UNKONWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1903,UNKNOWN,Diving,False,Fatal injury,UNKONWN
6298,Before 1903,0,Unprovoked,AUSTRALIA,Western Australia,UNKNOWN,Pearl diving,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1903,UNKNOWN,diving,False,Fatal injury,UNKONWN
6299,1900-1905,0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1900,UNKNOWN,Swimming,False,Fatal injury,UNKONWN
6300,1883-1889,0,Unprovoked,PANAMA,UNKNOWN,"Panama Bay 8ºN, 79ºW",UNKNOWN,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1883,UNKNOWN,UNKONWN,False,Fatal injury,UNKONWN


In [45]:
#exportarmos sharky
sharks.to_csv("data/sharks.csv")