In [2]:
#libraries used
import pandas as pd
import re
from datetime import datetime

In [3]:
#import of the data set
data = pd.read_csv('GSAF5.csv', encoding='latin-1')

In [4]:
#check the number of null values per column
null_cols = data.isnull().sum()
null_cols[null_cols > 0]
#Time have 3213 null values -> don't keep it because if this information is often missing, we won't be able to get a trend on the attack hour
#Species have 2934 null values -> keep it because we might still want to know what kind of shark attack
#Age have 2681 null values -> don't keep it as it is not an interresting information if often missing
#I wanted to remove Name column as I don't find this information useful but sometimes it contains sex and other information

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [5]:
#remove all desired columns from our data set
drop_cols = ['Time','Age','Year','Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22','Unnamed: 23']
data = data.drop(drop_cols, axis=1)

In [6]:
#pd.to_datetime(data['Date'], format='%d/%m/%Y', errors='coerce')

In [7]:
#function to clean the date from Case number attribute
def clean_Date(e):

    if re.search("[0-9]{4}.[0-9]{2}.[0-9]{2}",e):
        splitted_date = re.split("\D",e)
        if splitted_date[1] == "00":
            splitted_date[1] = "01"
        if splitted_date[2] == "00":
            splitted_date[2] = "01"
        cleaned_date = datetime(int(splitted_date[0]),int(splitted_date[1]),int(splitted_date[2]))
    else:
        cleaned_date = "Unknown Date"
    
    return cleaned_date

In [8]:
#function to change empty value by "Not specified"
def empty_values(e):
    if re.search("^$",e):
        e = "Not specified"
    return e

In [9]:
#apply clean_date function to our Case number column and put the calculated date into date column
data['Date'] = data['Case Number'].apply(clean_Date)

In [10]:
#create a new column to know if the Date did not contain day and/or month
lst=[]
for e in data['Case Number']:
    if re.search("[0-9]{4}.0{2}.0{2}",e) or re.search("[0-9]{4}.[0-9]{2}.0{2}",e):
        lst.append("Yes")
    else:
        lst.append("No")
data['date_alterated'] = pd.DataFrame(lst)

In [11]:
#function to remove space at the beginning and at the end of my string
def remove_spaces(e):
    if re.search("^ ",e):
        e = re.sub("^ +","",e)
    if re.search(" $",e):
        e = re.sub(" +$","",e)
    return e

In [12]:
#rename our columns name by applying remove space function
data.rename(remove_spaces,axis=1, inplace=True)
print(data.columns)

Index(['Case Number', 'Date', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex', 'Injury', 'Fatal (Y/N)', 'Species',
       'Investigator or Source', 'pdf', 'href formula', 'href',
       'date_alterated'],
      dtype='object')


In [13]:
#Convert Country data into str because some are floats
#I remove "?", considering the country is correct
data['Country'] = data['Country'].astype(str)
data['Country'] = data['Country'].str.replace('\?', '')
data['Country'] = data['Country'].apply(remove_spaces)

In [14]:
#Clean Area column
data['Area'] = data['Area'].astype(str)
data['Area'] = data['Area'].str.replace('\"','')
data['Area'] = data['Area'].str.replace("Tavenui","Taveuni")
data['Area'] = data['Area'].str.replace("Tamilnadu","Tamil Nadu")
data['Area'] = data['Area'].str.replace("Guerro","Guerrero")
data['Area'] = data['Area'].str.replace("Guerrrero","Guerrero")
data['Area'] = data['Area'].str.replace("islans","Islands")
data['Area'] = data['Area'].str.replace("Province","")
data['Area'] = data['Area'].str.replace("Region","")
data['Area'] = data['Area'].str.replace("shire","")
data['Area'] = data['Area'].apply(empty_values)
data['Area'] = data['Area'].apply(remove_spaces)

In [15]:
#Clean Location column
data['Location'] = data['Location'].astype(str)
data['Location'] = data['Location'].str.replace("\(","")
data['Location'] = data['Location'].str.replace("\)","")
data['Location'] = data['Location'].apply(empty_values)
data['Location'] = data['Location'].apply(remove_spaces)

In [None]:
def sex_cleaning(e):
    if e != "F" or e != "M":
        e = "Not specified"
    return e

In [16]:
#Clean Sex column
data['Sex'] = data['Sex'].astype(str)
data['Sex'] = data['Sex'].apply(empty_values)
data['Sex'] = data['Sex'].apply(remove_spaces)

data['Sex'] = data['Sex'].apply(sex_cleaning)

"""lst = []
for e in data['Sex']:
    #print(e)
    if e != "F" or e != "M":
        e = "Not specified"
    lst.append(e)
data['Sex'] = pd.DataFrame(lst)"""

In [17]:
#I use Name column information to get the sex if precised
data['Name'] = data['Name'].astype(str)
lst = []
for e in data['Name']:
    if re.search("boy.?",e) or re.search("[Mm]ale.?",e):
        lst.append("M")
    elif re.search("girl.?",e) or re.search("[Ff]emale.?",e):
        lst.append("F")
    else:
        lst.append("")
data['test sex'] = pd.DataFrame(lst)

In [18]:
#I use newly created column test sex information to complete sex column
data.loc[(data['Sex']=="Not specified") & (data['test sex']=="M"), 'Sex'] = "M"
data.loc[(data['Sex']=="Not specified") & data['test sex']=="F", 'Sex'] = "F"
drop_col = ['test sex']
data = data.drop(drop_col,axis=1)

  result = method(y)


In [None]:
def fatal_cleaning(e):
    if e != "N" or e != "Y":
        e = "Not specified"
    return e

In [19]:
#Clean Fatal (Y/N) column
data['Fatal (Y/N)'] = data['Fatal (Y/N)'].astype(str)
data['Fatal (Y/N)'] = data['Fatal (Y/N)'].apply(remove_spaces)

data['Fatal (Y/N)'] = data['Fatal (Y/N)'].apply(fatal_cleaning)

"""lst = []
for e in data['Fatal (Y/N)']:
    #print(e)
    if e != "N" or e != "Y":
        e = "Not specified"
    
    lst.append(e)

data['Fatal (Y/N)'] = pd.DataFrame(lst)"""

In [26]:
#Clean Activity column
#print(set(data['Activity']))
lst=[]
data['Activity'] = data['Activity'].astype(str)
for e in data['Activity']:
    lst.append(re.findall("[A-Za-z]*-?ing",e))
print(lst)
#voir comment obtenir une liste puis si plusieurs choix pour une même ligne comment faire un arbitrage (est-ce qu'on peut garder plusieurs activité et les séparer par une , ou autre)
lst2 = [z for x in lst for y in x for z in y]
print(set(lst2))

[['Surfing'], ['Surfing'], ['Surfing'], ['Surfing'], ['Surfing'], ['Fishing'], ['Wading'], ['Swimming'], ['surfing'], ['boarding'], ['Surfing'], ['boarding'], ['Spearfishing'], ['Surfing'], ['Surfing'], ['Surfing'], ['Wading'], ['Snorkeling'], ['boarding'], ['Surfing'], ['Swimming'], ['Swimming'], ['Kayaking'], ['Lobstering'], ['Surfing'], ['Surfing'], ['Surfing'], ['Spearfishing'], ['Fishing'], ['Fishing', 'Fishing'], ['Surfing'], [], ['Swimming'], ['Diving'], ['Wading'], ['Fishing'], ['Fishing'], ['Fishing'], ['Swimming'], ['Fishing'], [], ['Surfing'], ['Diving'], ['Spearfishing'], ['Floating'], ['Wading'], ['Surfing'], ['Spearfishing'], ['Floating'], ['Standing'], ['Surfing'], ['Swimming'], ['Diving'], ['Swimming'], ['Spearfishing'], ['surfing'], ['Surfing'], ['Swimming'], ['Swimming'], ['Swimming'], ['Swimming'], ['Swimming'], ['Swimming'], ['Teasing'], ['Floating'], ['Fishing'], ['Surfing'], ['Surfing'], ['skiing'], ['Swimming'], ['Spearfishing'], ['Spearfishing'], ['Walking'], []

In [None]:
#créer une colonne pour dire si incident concerne plusieurs personnes (généralement pas d'info dans Sex, Age, )

In [169]:
data['Country'].value_counts(ascending=True,dropna=False)

ANGOLA                               1
CYPRUS                               1
GREENLAND                            1
BRITISH ISLES                        1
GHANA                                1
FALKLAND ISLANDS                     1
DJIBOUTI                             1
EQUATORIAL GUINEA / CAMEROON         1
ARGENTINA                            1
RED SEA / INDIAN OCEAN               1
TASMAN SEA                           1
TUVALU                               1
MONACO                               1
WESTERN SAMOA                        1
ST. MARTIN                           1
MAYOTTE                              1
ADMIRALTY ISLANDS                    1
GUATEMALA                            1
GABON                                1
GRAND CAYMAN                         1
GEORGIA                              1
NORTHERN ARABIAN SEA                 1
NORTH SEA                            1
FEDERATED STATES OF MICRONESIA       1
BANGLADESH                           1
GULF OF ADEN             