In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import regex as re
import misfunciones as mf

In [2]:
#cargamos csv
sharks = pd.read_csv("data/attacks.csv",encoding = "ISO-8859-1")

In [3]:
sharks.sample(100)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
6795,0,,,,,,,,,,...,,,,,,,,,,
16737,,,,,,,,,,,...,,,,,,,,,,
8428,0,,,,,,,,,,...,,,,,,,,,,
1532,2005.11.02.a,02-Nov-2005,2005.0,Unprovoked,USA,California,"Ocean Beach, San Francisco, San Francisco County",Surfing,Jake Daneman,M,...,12' to 14' white shark,"R. Collier, GSAF",2005.11.02.a-Daneman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2005.11.02.a,2005.11.02.a,4771.0,,
13141,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23952,,,,,,,,,,,...,,,,,,,,,,
22534,,,,,,,,,,,...,,,,,,,,,,
14662,,,,,,,,,,,...,,,,,,,,,,
23386,,,,,,,,,,,...,,,,,,,,,,


In [4]:
#eliminamos todas las columnas y filas que todos sus valores sean nulos
sharks.dropna(how="all", inplace=True)
sharks.dropna(how="all",axis = 1, inplace=True)

# Hipótesis
- hipotesis 1 los ataques de tiburones han icrmentado según han ido avanzando los años
- hipótesis 2 los ataquesde tiburones atacan más a embarcaciones que a nadadores
- [hipóstesis 3 Florida es la capital mundial de los ataques de tiburones](https://www.lavanguardia.com/ocio/viajes/20210407/6631447/6-playas-mas-peligrosas-mundo.html)
- [hipótesis 4 ¿Es posible el ataque de tiburón en la costa española?](https://www.mundo-geo.es/naturaleza/es-posible-ataque-tiburon-en-costa-espanola_238643_102.html)

In [5]:
#compruebo qué colmunas tienen datos interesantes para mis hipótesis y borro las que no me hacen ninguna falta.
sharks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [6]:
columnas_no = ['Case Number','Investigator or Source','pdf', 'href formula',  'href',  'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23'] 
sharks.drop(columnas_no, axis=1, inplace=True)

In [7]:
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species '],
      dtype='object')

In [8]:
#ahora que hemos borrado columnas vemos si quedan filas que son nan por completo y borrranos
sharks.dropna(how="all", inplace=True)

In [9]:
#cambio nombre de algunas columnas que tienen espacios o caracteres especiales en el nombre
#"Sex " "Species " "Fatal (Y/N)"
cols = ["Sex ","Species ","Fatal (Y/N)"]
new_names = {"Sex " : "Sex",
                 "Species " : "Species",
                 "Fatal (Y/N)" : "Fatal"}
sharks.rename(columns = new_names, inplace=True)
sharks.columns

Index(['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
       'Name', 'Sex', 'Age', 'Injury', 'Fatal', 'Time', 'Species'],
      dtype='object')

In [10]:
#segundo repaso a datos para ver si borramos alguna columna más que no me de información necesaria para las hipótesis
sharks.sample(20)
sharks.drop(["Name"],axis=1, inplace=True)

### Limpieza de datos nulos

In [11]:
#miramos cuantos datos nulos hay en cda columna.
sharks.isna().sum()

Date           0
Year           2
Type           4
Country       50
Area         455
Location     540
Activity     544
Sex          565
Age         2831
Injury        28
Fatal        539
Time        3354
Species     2838
dtype: int64

#### rellenamos los nulos en función de los datos que contienen las columnas


In [12]:
sharks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6302 entries, 0 to 6301
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      6302 non-null   object 
 1   Year      6300 non-null   float64
 2   Type      6298 non-null   object 
 3   Country   6252 non-null   object 
 4   Area      5847 non-null   object 
 5   Location  5762 non-null   object 
 6   Activity  5758 non-null   object 
 7   Sex       5737 non-null   object 
 8   Age       3471 non-null   object 
 9   Injury    6274 non-null   object 
 10  Fatal     5763 non-null   object 
 11  Time      2948 non-null   object 
 12  Species   3464 non-null   object 
dtypes: float64(1), object(12)
memory usage: 689.3+ KB


In [13]:
#Year
sharks.Year.unique()
#rellenamos años vacíos con 0
sharks.Year.fillna(0.0,inplace= True)
#convertirmos tipo de dato (float) a int.
sharks.Year = sharks.Year.astype(dtype = "int64")

In [14]:
#type
sharks.Type.unique() #['Boating', 'Unprovoked', 'Invalid', 'Provoked', 'Questionable','Sea Disaster', nan, 'Boat', 'Boatomg']
sharks[sharks["Type"] == "Invalid"].sample(20)
#invalid NO nos sirve para los nulos porque es para ataques que no fueron de tiburón
#mirar a ver si podemos eliminar esas filas porque queremos estudiar solo los casos reales de tiburones.
#nos da mucha información si el ataque fue en embarcación o direcgtamente a "nadador"
sharks.Type.fillna("UNKNOWN",inplace= True)

In [15]:
#Country, Area y Location -> nans == UNKNOWN
sharks["Country"].fillna("UNKNOWN", inplace = True)
sharks["Area"].fillna("UNKNOWN", inplace = True)
sharks["Location"].fillna("UNKNOWN", inplace = True)

In [16]:
#activity
list(sharks.Activity.unique())
#unknown
#de esta columna podemos sacar datos sobre si el tiburón atacó a embarcación o persona.
sharks["Activity"].fillna("UNKNOWN", inplace = True)

In [17]:
#sex
sharks.Sex.unique()
#si el sexo de la víctima es desconocido lo rellenamos con "X"
#además queremos que se queden tres datos únicos -> M,F,X
sharks.Sex.fillna("X",inplace = True)
sharks.Sex = sharks.Sex.str.strip() #borramos espacios vacíos antes y después de cada cadena.
sharks.Sex.unique() #['F', 'M', 'X', 'lli', 'N', '.'] > lli, N y . == X
sharks["Sex"].replace('lli',"X",inplace=True)
sharks["Sex"].replace('.',"X",inplace=True)
sharks["Sex"].replace('N',"X",inplace=True)

In [18]:
#Age
sharks.Age.unique()
#rellenamos nans con "UNKNOWN" porque 0 puede confundir la media si luego necesitamos usarla.
sharks.Age.fillna("UNKNOWN",inplace= True)
#quitamos espacios antes y detrás de los strings
sharks.Age = sharks.Age.str.strip()
#convertirmos a número todas las cadenas que sean dígitos
sharks.Age = sharks.Age.apply(mf.entero_if)
#queda pendiente limpiar datos que son cadena y no enteros.
    #posiblidad = nueva columnas con rangos de edad y y poner datos como los strings que ya están.

In [19]:
#Injury > nan = UNKNOWN
sharks.Injury.unique()
sharks["Injury"].fillna("UNKNOWN", inplace = True)

In [20]:
#Fatal
sharks.Fatal.unique
#nulos = UNKNOWN
sharks["Fatal"].fillna("UNKNOWN", inplace = True)
sharks.Fatal = sharks.Fatal.str.strip()
sharks.Fatal.unique() #comprobar registros con FAtal == 'M', '2017', 'y'
sharks[(sharks["Fatal"] == 'M')|(sharks["Fatal"] == "2017")|(sharks["Fatal"] == "y")].Injury.unique
sharks["Fatal"].replace('M',"N",inplace=True)
sharks["Fatal"].replace('y',"Y",inplace=True)
sharks["Fatal"].replace('2017',"N",inplace=True)
sharks.Fatal.unique() #['N', 'Y', 'UNKNOWN']

array(['N', 'Y', 'UNKNOWN'], dtype=object)

In [21]:
#Time
sharks.Time.unique()
#nulos = UNKNOWN
sharks["Time"].fillna("UNKNOWN", inplace = True)
#puede que esta columna no la utlicemos para ningún análisis. No nos interesan los datos tan exactos de la hora del ataque.

In [22]:
#Species
list(sharks.Species.unique())
sharks["Species"].fillna("UNKNOWN", inplace = True)

In [23]:
sharks.isna().sum()

Date        0
Year        0
Type        0
Country     0
Area        0
Location    0
Activity    0
Sex         0
Age         0
Injury      0
Fatal       0
Time        0
Species     0
dtype: int64

#### precesamos el texto de las columnas

##### Date, Year.

In [24]:
sharks.Date.value_counts()
#de la columna Date extraemos solo las fechas
sharks["Year_D"] = sharks["Date"].str.extract(r'(\d{4})')
sharks["Month_D"] = sharks["Date"].str.extract(r'([A-Z][a-z][a-z]-\d{4})')
sharks["Month_D"] = sharks["Month_D"].str.extract(r'([A-Z][a-z][a-z])')
#volvemos a rellenar los nulos que quedan 
#sharks["Date"].fillna("UNKNOWN", inplace = True)

In [25]:
sharks.Year_D.unique() #los registros de esta columna parecen más concretos que la de Year
#rellenamos nans de esta columna y tanbién de la columna Month. elimnamos Year.
sharks.Year_D.fillna("UNKNOWN",inplace= True)
sharks.Month_D.fillna("UNKNOWN",inplace= True)
#convertimos los strings a enteros
sharks.Year_D = sharks.Year_D.apply(mf.entero_if)


In [26]:
# miramos los registros cuyo tipo es Invalid para ver si nos sirven o no... 
sharks[(sharks.Type == "Invalid")].sample(50)
# muchos de los registros confirman que no son ataques de tiburón, otros son ataques de tiburón muy dudosos. 
# nos deshacemos de los registros que tengan typo = "INVALID"

sharks = sharks[(sharks.Type != "Invalid")]


In [27]:
#como tenemos unos datos más coherentes con month_D y Year_D borramos Date y year
sharks.drop(["Date","Year"],axis=1, inplace=True)

##### Type

In [28]:
sharks.Type.unique()

array(['Boating', 'Unprovoked', 'Provoked', 'Questionable',
       'Sea Disaster', 'UNKNOWN', 'Boat', 'Boatomg'], dtype=object)

In [29]:
sharks[(sharks["Type"] == "Boating")|(sharks["Type"] == "Boatomg")|(sharks["Type"] == "Boat")].sample(50) #son el mismo tipo
#cambiamos todos a "Boat"
sharks["Type"] = sharks["Type"].str.replace((".*[Bb](OAT|oat).*"),"Boat", regex=True)
sharks.Type.unique()

array(['Boat', 'Unprovoked', 'Provoked', 'Questionable', 'Sea Disaster',
       'UNKNOWN'], dtype=object)

In [30]:
#sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Activity.str.contains(".*[Ff](ishin|ISHIN).*",regex = True))].sample(50)
#sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Injury.str.contains(".*[Oo](c+upan|C+UPAN).*",regex = True))]
#sharks[["Type","Activity","Injury"]][((sharks.Type != "Boat")&(sharks.Injury.str.contains(".*[Oo](c+upan|C+UPAN).*",regex = True)))|(sharks.Injury.str.contains(".*[Oo](c+upan|C+UPAN).*",regex = False))]

In [31]:
#comprobamos los datos de las columnas Type, Activity e Injury que son las que más datos nos dan de si el tiburón ataca botes
sharks[["Type","Activity","Injury"]]
#ponemos filtro a Type para que sea diferente de Boat y así ver si quedan datos que aporten ataques a embarcaciones
sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")].sample(50)
#algunos registros dicen en Injury = "No injury to ocuppant(s)" por lo que podemos deducir que se atacó embarcación:
sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Injury.str.contains(".*[Oo](c+upan|C+UPAN).*",regex = True))]
#algunos registros de Activity parece ser que ocurrieron en embarcaciones en las que se estaba pescando -> Ataque embarcación.
sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Activity.str.contains("(\s|^)[Ff](ishing|ISHING).*",regex = True))].sample(10)

  sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Injury.str.contains(".*[Oo](c+upan|C+UPAN).*",regex = True))]
  sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Activity.str.contains("(\s|^)[Ff](ishing|ISHING).*",regex = True))].sample(10)


Unnamed: 0,Type,Activity,Injury
5398,Provoked,Fishing,Right hand severely bitten by netted shark PRO...
5864,Unprovoked,Fishing,FATAL
1094,Provoked,Fishing,Injuries to face & neck PROVOKED INCIDENT
3463,Unprovoked,Dynamite fishing,Lacerations to head
3984,Provoked,Fishing,Right arm bitten by shark taken onboard in net...
3087,Provoked,Shark fishing,No injury to occupants; hooked shark tore out ...
3917,Provoked,Fishing,Landed shark in boat bit his left leg PROVOKE...
1421,Unprovoked,Fishing,Lower right leg and foot bitten
4048,Provoked,Shark fishing,"No injury to occupant, hull bitten PROVOKED IN..."
2754,Provoked,Fishing,Hand bitten by captured shark PROVOKED INCIDENT


In [32]:
#reemplazamos datos en Activity para hacerlos más concisos.
list(sharks.Activity.unique())
sharks.Activity = sharks.Activity.str.strip()

In [33]:
len(list(sharks.Activity.unique()))

1392

In [34]:
list(sharks.Activity.unique())
#("(\s|^)[Ff](ishing|ISHING).*",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](URF|urf).*","Surf and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Bb](oog|OOG).*","Surf and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Kk](ITE|ite).*","Surf and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Bb](ody|ODY).*[Bb](OARD|oard).*","Surf and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*([Dd](ivin|IVIN)|[Dd](ive|IVE)).*","Diving and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](wim+|WIM+).*","Swimming and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Jj](ump|UMP).*","Swimming and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Pp](ad+|AD+).*","Paddling and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](NORK|nork).*","Snorkeling and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ff](E+DIN|e+din).*","Feeding animals and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Sp](EAR|ear).*","Using spears",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Kk](ayak|AYAK).*","Kayaking and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Cc](anoe|ANOE).*","Canoeing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Bb](ATH|ath).*","Swimming and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ff](LOAT|loat).*","Swimming and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](ki|KI).*","Skiing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Pp](LAY|lay).*","Playing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Kk](ILL|ill).*","Fishing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*([Cc](apsi|APSI)|[Ss](ink|INK)).*","Capsized, sinked",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ww](reck|RECK).*","Shipwreck",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Dd](eep|EEP).*","Diving and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ff](ilm|ILM).*","Filming or photographing",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Pp](HOT|hot).*","Filming or photographing",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](elfi|ELFI).*","Filming or photographing",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Rr](OW|ow).*","Rowing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*([Ss](TAND|tand)|[Ss](itt|ITT)).*","Standing/Sitting",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](ail|AIL).*","sailing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ww](ash|ASH).*","washing",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ww](adin|ADIN).*","wading",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Dd](isast|ISAST).*","Air/Sea Disaster",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Aa](irc|IRC).*","Air/Sea Disaster",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Rr](ESCU|escu).*","Rescuing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Aa](DRIFT|drift).*","Adrift",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Oo](VERBO|verbo).*","Fell/swept overboard",regex = True)
#overboard 


sharks["Activity"] = sharks["Activity"].str.replace(".*[Ff](ish|ISH).*","Fishing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ff](inn|INN).*","Fishing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Hh](UNT|unt).*","Fishing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ss](hrim|HRIM).*","Fishing and similars",regex = True)
sharks["Activity"] = sharks["Activity"].str.replace(".*[Ll](OBST|obst).*","Fishing and similars",regex = True)
#'Lobstering'

In [35]:
list(sharks.Activity.unique())

['Paddling and similars',
 'Standing/Sitting',
 'Surf and similars',
 'Diving and similars',
 'Swimming and similars',
 'Fishing and similars',
 'Walking',
 'Feeding animals and similars',
 'wading',
 'Kayaking and similars',
 'Snorkeling and similars',
 'UNKNOWN',
 'Using spears',
 'Capsized, sinked',
 'Canoeing and similars',
 'SUP',
 'Skiing and similars',
 'Touching a shark',
 'Attempting to lasso a shark',
 'Kakaying',
 'washing',
 'Filming or photographing',
 'Tagging sharks',
 'SUP Foil boarding',
 'Teasing a shark',
 'Air/Sea Disaster',
 'Playing and similars',
 'Rescuing and similars',
 'Rowing and similars',
 'Petting a shark',
 'Kneeling in the water',
 'Fell into the water',
 'Shark watching',
 'sailing and similars',
 'Casting a net',
 'Wrangling a shark',
 'Attempting to free the shark',
 'Wakeboarding',
 'Attempting to fix motor',
 'Measuring sharks',
 'Yacht race',
 'Treading water',
 "Accidentally stood on hooked shark's tail before attempting to gut it",
 'Attempting 

In [36]:
len(list(sharks.Activity.unique()))

325

In [37]:
#hacer una nueva columna booleana "boat" 1 = el tibruón ataca a embarcación. 0 = tiburón ataca persona en agua.
#incluir datos que de Type == Boat
sharks[(sharks["Type"] == "Boat")]
#también incluir datos que Type != de Boat pero en activity estuvieran pescando y en injury "No injury to occupants"
#sharks[["Type","Activity","Injury"]][(sharks.Type != "Boat")&(sharks.Activity.str.contains("(\s|^)[Ff](ishing|ISHING).*",regex = True))]
sharks[(sharks["Type"] == "Boat")|((sharks.Type != "Boat")&(sharks.Activity.str.contains("(\s|^)[Ff](ishing|ISHING).*",regex = True)))]

  sharks[(sharks["Type"] == "Boat")|((sharks.Type != "Boat")&(sharks.Activity.str.contains("(\s|^)[Ff](ishing|ISHING).*",regex = True)))]


Unnamed: 0,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Species,Year_D,Month_D
0,Boat,USA,California,"Oceanside, San Diego County",Paddling and similars,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,2018,Jun
7,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing and similars,M,52,Minor injury to foot. PROVOKED INCIDENT,N,UNKNOWN,"Lemon shark, 3'",2018,May
23,Unprovoked,MALDIVES,Alifu Alifu Atoll,Madoogali,Fishing and similars,M,32,5-inch cut to hand,N,21h50,Tiger shark,2018,Apr
85,UNKNOWN,SAMOA,Upolu Island,Nofoalii,Fishing and similars,M,UNKNOWN,Injuries to hands and legs,N,Night,UNKNOWN,2017,Sep
86,Boat,AUSTRALIA,Westerm Australia,Esperance,Fishing and similars,X,UNKNOWN,"sharks rammed boats, no injury to occupants",N,UNKNOWN,"White shark, 3.5m",2017,Sep
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283,Unprovoked,USA,Florida,"Gadsden Point, Tampa Bay",Fishing and similars,M,UNKNOWN,2-inch lacerations,N,UNKNOWN,UNKNOWN,1921,UNKNOWN
6289,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Fishing and similars,M,UNKNOWN,"FATAL, body not recovered but shark was caught...",Y,UNKNOWN,UNKNOWN,1913,Jul
6293,Unprovoked,AUSTRALIA,UNKNOWN,UNKNOWN,Fishing and similars,M,UNKNOWN,"FATAL, knocked overboard by tail of shark & ca...",Y,UNKNOWN,Blue pointer,1906,UNKNOWN
6294,Unprovoked,AUSTRALIA,UNKNOWN,UNKNOWN,Fishing and similars,M,UNKNOWN,FATAL,Y,UNKNOWN,Blue pointer,1906,UNKNOWN


In [38]:
#me gustaría hacer una columna que solo pusiera "boat" e incluir un 1 por toods lso type = Boat
    #también incluir todos los #activity == fishing and similars
#para ver los tipos de 

## species

In [39]:
len(list(sharks.Species.unique())) #1493
sharks.Species = sharks.Species.str.strip()

In [40]:
sharks["Species_02"] = sharks["Species"].str.replace(".*[Uu](NKNOW|nknow).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ww](HITE|hite).*","White",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Tt](iger|IGER).*","Tiger",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ll](emon|EMON).*","Lemon",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Bb](UL|ul).*","Bull",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Nn](URSE|urse).*","Nurse",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](pin|PIN).*","Spinner",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Hh](ammer|AMMER).*","Hammerhead",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ll](UE|ue).*","Blue",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Mm](ako|AKO).*","Mako",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Bb](lacktip|LACKTIP).*","Blacktip",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ww](obbegong|OBBEGONG).*","Wobbegong",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Rr](eef|EEF).*","Reef",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Bb](ronze|RONZE).[Ww](haler|HALER).*","Bronze whaler",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Rr](agged|AGGED).*","Raggedtooth",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](and|and).*","Sandbar",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Gg](R(A|E)Y|r(a|e)y).*","Grey",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ww](HALE|hale)\s.*","Whale",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Zz](ambe|AMBE).*","Zambesi",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Bb](LACK|lack).[T|t].*","Blacktip",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*\d.*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Uu](niden|NIDEN).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](mall|MALL)\s.*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ll](ittle|ITTLE)\s.*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Jj](uvenile|UVENILE).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Yy](oung|OUNG).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ll](ARGE|arge).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Uu](NKNOW|nknow).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Rr](ecover|ECOVER).*","UNKNOWN",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](chool|CHOOL).*","School",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Pp](ack|ACK)\s.*","School",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Nn](UMBER|umber).*","School",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](EVERAL|everal).*","School",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](hark|HARK).*","Other",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Cc].\s.*","Other",regex = True)
sharks["Species_02"] = sharks["Species"].str.replace(".*[Ss](hovel|HOVEL).*","Other",regex = True)
#sharks["Species"] = sharks["Species"].replace("''","UNKNOWN",inplace = True)

#Wobbegong 

In [41]:
prueba = pd.DataFrame(sharks.Species.value_counts())
prueba

Unnamed: 0,Species
UNKNOWN,2817
White shark,162
Tiger shark,74
Bull shark,51
6' shark,40
...,...
"Tiger shark, 4 m [13'] ?",1
"Shortfin mako shark, 3 m to 3.4 m [10' to 11']",1
"Miami, a 60 cm blacktip shark and two 60 cm bamboo catsharks",1
"Thought to involve a white, bull or tiger shark",1


In [42]:
list(sharks.Species.unique())

['White shark',
 'UNKNOWN',
 '2 m shark',
 'Tiger shark, 3m',
 'Tiger shark',
 "Lemon shark, 3'",
 "Bull shark, 6'",
 'Grey reef shark',
 'Tawny nurse shark, 2m',
 'Shark involvement not confirmed',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'White shark, 2.5 m',
 "6' shark",
 'Juvenile bull shark',
 'Bull shark',
 "Tiger shark, 12'",
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 'Bull shark, 2 m',
 'Galapagos shark?',
 'Bull shark, 3 m',
 'Grey reef shark. 2 m',
 'small shark',
 'Wobbegong shark?',
 'Juvenile nurse shark',
 "Nurse shark. 5'",
 'Tiger shark, female',
 'Some drowned but other may have been killed by blue sharks',
 'White shark, 4.6 m',
 'Cookiecutter shark',
 'Wobbegong shark, 1 m',
 'White shark, 4.5 m',
 'Spinner shark, 4 to 5 feet',
 'Tiger shark, 8 to 10 feet',
 "8' shark",
 "5' shark",
 "4' to 5' shark",
 'Porbeagle, 1.5 m',
 'White shark, 3.5m',
 "5' to 6' shark",
 'White shark, 3 to 3.5

In [43]:
len(list(sharks.Species.unique()))

1411

Hacemos que los datos sean más concretos en la columnas.

In [50]:
#creamos un sub dataframe con solo las colmunas que realmente necesitamos.
sharky = sharks[["Type","Country","Area","Activity","Sex","Age","Fatal","Species_02","Year_D","Month_D"]]

In [53]:
#exportarmos sharky
sharky.to_csv("data/sharks.csv")

In [46]:
#comprobamos que los registros que tienen como unknown area, location y Country
#sharks[(sharks.Country == "UNKNOWN")|(sharks.Country == "UNKNOWN")|(sharks.Country == "UNKNOWN")].sample(20)

In [48]:
sharks

Unnamed: 0,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal,Time,Species,Year_D,Month_D,Species_02
0,Boat,USA,California,"Oceanside, San Diego County",Paddling and similars,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,2018,Jun,White shark
1,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing/Sitting,F,11,Minor injury to left thigh,N,14h00 -15h00,UNKNOWN,2018,Jun,UNKNOWN
3,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surf and similars,M,UNKNOWN,Minor injury to lower leg,N,UNKNOWN,2 m shark,2018,Jun,2 m shark
4,Provoked,MEXICO,Colima,La Ticla,Diving and similars,M,UNKNOWN,Lacerations to leg & hand shark PROVOKED INCIDENT,N,UNKNOWN,"Tiger shark, 3m",2018,Jun,"Tiger shark, 3m"
5,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Surf and similars,M,UNKNOWN,"No injury, board bitten",N,UNKNOWN,UNKNOWN,2018,Jun,UNKNOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving and similars,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1903,UNKNOWN,UNKNOWN
6298,Unprovoked,AUSTRALIA,Western Australia,UNKNOWN,Diving and similars,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1903,UNKNOWN,UNKNOWN
6299,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming and similars,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1900,UNKNOWN,UNKNOWN
6300,Unprovoked,PANAMA,UNKNOWN,"Panama Bay 8ºN, 79ºW",UNKNOWN,M,UNKNOWN,FATAL,Y,UNKNOWN,UNKNOWN,1883,UNKNOWN,UNKNOWN
