# 🦈 Shark Attack Data Wrangling Quest
## Final Project Notebook: Complete Workflow from Day 1 to Day 4

Load Dataset

In [1]:
!pip install xlrd
import pandas as pd

df = pd.read_excel('GSAF5.xls')
df.head()



Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,2025-03-10 00:00:00,2025.0,Unprovoked,Australia,WA,Duke of Orleans Bay,Surfing,Steven Jeffrey Payne,M,37,...,Great White Shark,Bob Myatt,,,,,,,,
1,2025-03-07 00:00:00,2025.0,Unprovoked,Australia,NSW,Gunyah beach Bundeena Port Hacking,Swimming,Mangyong Zhang,F,56,...,Bull shark,Bob Myatt,,,,,,,,
2,2025-02-27 00:00:00,2025.0,Unprovoked,Australia,Victoria,Nuns Beach near Portland,Swimming,Robbie Houlihan,M,40+,...,Not stated,Kevin McMurray Trackingsharks.com,,,,,,,,
3,27-Feb-25,2025.0,Questionable,Philippines,Batangas Resort,Verde Island,SCUBA Diving,Illia Peregudin,M,29,...,Not stated,Stephen Parkhill Glen Folkard,,,,,,,,
4,2025-02-22 00:00:00,2025.0,Unprovoked,Australia,Western Australia,Blasck Wall Reach Swan River Perth,Kayaking,Georgina Smith,F,35,...,Not stated,Bob Myatt GSAF,,,,,,,,


Day 1 – Data Exploration & Hypotheses

In [3]:
# Check missing data
display(df.isnull().sum().sort_values(ascending=False).head(10))

# Check sample values
display(df.iloc[100][0:10])
display(df.dtypes.head(10))

Unnamed: 21    7007
Unnamed: 22    7006
Time           3526
Species        3132
Age            2994
Activity        585
Sex             579
Location        566
Fatal Y/N       561
State           484
dtype: int64

Date                             12 Sep-2023
Year                                  2023.0
Type                            Questionable
Country                                  USA
State                                Florida
Location    New Smyrna Beach, Volusia County
Activity                             Surfing
Name                         Mark Summersett
Sex                                        M
Age                                       38
Name: 100, dtype: object

Date         object
Year        float64
Type         object
Country      object
State        object
Location     object
Activity     object
Name         object
Sex          object
Age          object
dtype: object

### ✅ Hypotheses
- H1: Shark attacks are more common during surfing than swimming
- H2: USA has the highest number of shark attacks
- H3: Young adults (ages 20–40) are attacked more often
- H4: Fatal attacks are more common in the afternoon


 – Data Cleaning (Non-Date Columns)

In [6]:
# Clean 'Sex'
df['Sex'] = df['Sex'].str.upper().str.strip()
df['Sex'] = df['Sex'].replace({'MALE': 'M', 'FEMALE': 'F'})
df['Sex'] = df['Sex'].fillna('UNKNOWN')

# Clean 'Fatal Y/N'
df['Fatal Y/N'] = df['Fatal Y/N'].str.upper().str.strip()
df['Fatal Y/N'] = df['Fatal Y/N'].replace({'Y': 'Y', 'N': 'N'})
df['Fatal Y/N'] = df['Fatal Y/N'].fillna('UNKNOWN')

# Clean 'Activity'
df['Activity'] = df['Activity'].str.lower().str.strip()
df['Activity'] = df['Activity'].fillna('unknown')

# Clean 'Country'
df['Country'] = df['Country'].str.strip().str.upper()
df['Country'] = df['Country'].fillna('UNKNOWN')

# Clean 'Age'
def clean_age(val):
    try:
        return int(float(val))
    except:
        return None

df['Age_cleaned'] = df['Age'].apply(clean_age)

 Day 3 – Regex, Date Parsing, Hypothesis Testing

In [8]:
import re

def extract_activity(activity):
    activity = str(activity).lower()
    if re.search(r'surf', activity):
        return 'Surfing'
    elif re.search(r'swim', activity):
        return 'Swimming'
    elif re.search(r'div', activity):
        return 'Diving'
    elif re.search(r'kayak|paddle', activity):
        return 'Boating'
    else:
        return 'Other'

df['ActivityType'] = df['Activity'].apply(extract_activity)

# Convert 'Date' column to datetime and extract features
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Decade'] = (df['Year'] // 10) * 10

# Classify time of day
def classify_time(time):
    time = str(time).lower()
    if 'morning' in time:
        return 'Morning'
    elif any(x in time for x in ['12', '13', '14', '15', 'afternoon']):
        return 'Afternoon'
    elif 'evening' in time or 'dusk' in time:
        return 'Evening'
    elif 'night' in time:
        return 'Night'
    else:
        return 'Unknown'

df['TimeGroup'] = df['Time'].apply(classify_time)

# Classify age group
def age_group(age):
    if pd.isnull(age):
        return 'Unknown'
    elif age <= 12:
        return 'Child'
    elif age <= 18:
        return 'Teen'
    elif age <= 40:
        return 'Young Adult'
    elif age <= 60:
        return 'Adult'
    else:
        return 'Senior'

df['AgeGroup'] = df['Age_cleaned'].apply(age_group)

Hypothesis Testing Results

In [10]:
print("Activity Types:\n", df['ActivityType'].value_counts())
print("Top Countries:\n", df['Country'].value_counts().head(5))
print("Age Groups:\n", df['AgeGroup'].value_counts())
print("Fatal Attacks by Time of Day:\n", df[df['Fatal Y/N'] == 'Y']['TimeGroup'].value_counts())

Activity Types:
 ActivityType
Other       3601
Surfing     1445
Swimming    1240
Diving       623
Boating       99
Name: count, dtype: int64
Top Countries:
 Country
USA             2556
AUSTRALIA       1500
SOUTH AFRICA     598
NEW ZEALAND      146
BAHAMAS          139
Name: count, dtype: int64
Age Groups:
 AgeGroup
Unknown        3135
Young Adult    1920
Teen            859
Adult           633
Child           332
Senior          129
Name: count, dtype: int64
Fatal Attacks by Time of Day:
 TimeGroup
Unknown      1187
Afternoon     228
Morning        24
Night          23
Evening        15
Name: count, dtype: int64


# Day 4 – Create Risk Score & Finalize Data

In [12]:
# Create Shark Risk Score
def shark_risk(row):
    score = 0
    if row['ActivityType'] == 'Surfing': score += 2
    if row['Sex'] == 'M': score += 1
    if row['Country'] in ['AUSTRALIA', 'USA']: score += 1
    if row['TimeGroup'] == 'Afternoon': score += 2
    return score

df['RiskScore'] = df.apply(shark_risk, axis=1)
df[['ActivityType', 'Sex', 'Country', 'TimeGroup', 'RiskScore']].head()
df.sort_values(by='RiskScore', ascending=False).head(10)
df['RiskScore'].value_counts().sort_index()
print(df[['ActivityType', 'Sex', 'Country', 'TimeGroup', 'RiskScore']].head())
print(df.sort_values(by='RiskScore', ascending=False).head(10))
print(df['RiskScore'].value_counts().sort_index())

# Save the cleaned and enhanced dataset
df.to_csv("GSAF5_final_cleaned.csv", index=False)
df.to_csv("GSAF5_final_cleaned.csv",index=False)
print("final dataset saved!") 

  ActivityType Sex      Country  TimeGroup  RiskScore
0      Surfing   M    AUSTRALIA  Afternoon          6
1     Swimming   F    AUSTRALIA  Afternoon          3
2     Swimming   M    AUSTRALIA    Unknown          2
3       Diving   M  PHILIPPINES  Afternoon          3
4      Boating   F    AUSTRALIA  Afternoon          3
           Date    Year        Type    Country           State  \
0    2025-03-10  2025.0  Unprovoked  AUSTRALIA              WA   
1510 2011-12-23  2011.0  Unprovoked        USA         Florida   
1479 2012-04-03  2012.0  Unprovoked        USA          Hawaii   
1482 2012-03-24  2012.0  Unprovoked        USA         Florida   
1485 2012-03-15  2012.0  Unprovoked        USA         Florida   
1491 2012-03-04  2012.0  Unprovoked        USA         Florida   
1494 2012-02-26  2012.0    Provoked        USA         Florida   
1497 2012-02-06  2012.0  Unprovoked  AUSTRALIA      Queensland   
1546 2011-09-11  2011.0  Unprovoked        USA      California   
1679 2010-08-07 