In [2]:
import pandas as pd
import numpy as np
import kagglehub
import os
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download the "Global Crocodile Species" dataset from Kaggle

path = kagglehub.dataset_download("zadafiyabhrami/global-crocodile-species-dataset")

print("Path to dataset files:", path)   # Print the local path where the dataset is saved
print(os.listdir(path))

Path to dataset files: /Users/saif/.cache/kagglehub/datasets/zadafiyabhrami/global-crocodile-species-dataset/versions/1
['crocodile_dataset.csv']


In [4]:

# Load the CSV file into a DataFrame
path = os.path.join(path, "crocodile_dataset.csv")
df = pd.read_csv(path)

In [5]:
print("\nFirst 5 rows: ")
display(df.head())
print("\nLast 5 rows: ")
display(df.tail())

print(f"Dataset rows: {df.shape[0]}, Columns: {df.shape[1]}\n")

# Loop through each column and display details
print("Columns:")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2}. {col:<20}"
          f"Type: {str(df[col].dtype):<7}"
          f" | Missing: {df[col].isna().sum():<4} "
          f"({df[col].isnull().sum()/df.shape[0]*100:5.2f}%)"
          f" | Unique: {df[col].nunique():<7}"
          f" | duplicates: {df.duplicated().sum()}")


# Drop the 'Family' column as it has only one unique value
df.drop(["Family"], axis=1, inplace=True)


First 5 rows: 


Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
0,1,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,1.9,62.0,Adult,Male,31-03-2018,Belize,Swamps,Least Concern,Allison Hill,Cause bill scientist nation opportunity.
1,2,American Crocodile,Crocodylus acutus,Crocodylidae,Crocodylus,4.09,334.5,Adult,Male,28-01-2015,Venezuela,Mangroves,Vulnerable,Brandon Hall,Ago current practice nation determine operatio...
2,3,Orinoco Crocodile,Crocodylus intermedius,Crocodylidae,Crocodylus,1.08,118.2,Juvenile,Unknown,07-12-2010,Venezuela,Flooded Savannas,Critically Endangered,Melissa Peterson,Democratic shake bill here grow gas enough ana...
3,4,Morelet's Crocodile,Crocodylus moreletii,Crocodylidae,Crocodylus,2.42,90.4,Adult,Male,01-11-2019,Mexico,Rivers,Least Concern,Edward Fuller,Officer relate animal direction eye bag do.
4,5,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylidae,Crocodylus,3.75,269.4,Adult,Unknown,15-07-2019,India,Rivers,Vulnerable,Donald Reid,Class great prove reduce raise author play mov...



Last 5 rows: 


Unnamed: 0,Observation ID,Common Name,Scientific Name,Family,Genus,Observed Length (m),Observed Weight (kg),Age Class,Sex,Date of Observation,Country/Region,Habitat Type,Conservation Status,Observer Name,Notes
995,996,West African Crocodile,Crocodylus suchus,Crocodylidae,Crocodylus,2.55,112.3,Adult,Female,01-10-2015,Mauritania,Lakes,Least Concern,Karen Avery,Thousand position sense church out explain sig...
996,997,Hall's New Guinea Crocodile,Crocodylus halli,Crocodylidae,Crocodylus,2.27,80.3,Subadult,Female,25-01-2023,Papua New Guinea,Freshwater Wetlands,Least Concern,Sylvia Turner,Painting try talk like maintain serious style.
997,998,West African Slender-snouted Crocodile,Mecistops cataphractus,Crocodylidae,Mecistops,2.34,93.5,Adult,Male,07-02-2014,Côte d'Ivoire,Swamps,Critically Endangered,Kristin Fleming,Produce off light before majority garden milit...
998,999,West African Slender-snouted Crocodile,Mecistops cataphractus,Crocodylidae,Mecistops,2.82,147.6,Adult,Male,09-07-2010,Sierra Leone,Shaded Forest Rivers,Critically Endangered,Eric Petersen,Put everything our prove could party skin soon...
999,1000,Hall's New Guinea Crocodile,Crocodylus halli,Crocodylidae,Crocodylus,1.73,47.4,Juvenile,Unknown,31-03-2014,Papua New Guinea,Freshwater Wetlands,Least Concern,Kristen Harris,Ok community right then police day so store co...


Dataset rows: 1000, Columns: 15

Columns:
   1. Observation ID      Type: int64   | Missing: 0    ( 0.00%) | Unique: 1000    | duplicates: 0
   2. Common Name         Type: object  | Missing: 0    ( 0.00%) | Unique: 18      | duplicates: 0
   3. Scientific Name     Type: object  | Missing: 0    ( 0.00%) | Unique: 18      | duplicates: 0
   4. Family              Type: object  | Missing: 0    ( 0.00%) | Unique: 1       | duplicates: 0
   5. Genus               Type: object  | Missing: 0    ( 0.00%) | Unique: 3       | duplicates: 0
   6. Observed Length (m) Type: float64 | Missing: 0    ( 0.00%) | Unique: 390     | duplicates: 0
   7. Observed Weight (kg)Type: float64 | Missing: 0    ( 0.00%) | Unique: 837     | duplicates: 0
   8. Age Class           Type: object  | Missing: 0    ( 0.00%) | Unique: 4       | duplicates: 0
   9. Sex                 Type: object  | Missing: 0    ( 0.00%) | Unique: 3       | duplicates: 0
  10. Date of Observation Type: object  | Missing: 0    ( 0.00%) | 

In [6]:
# Count total missing values in the dataset

missing_count = df.isna().sum().sum()
if missing_count == 0:
    print("No missing values in the dataset.")
else:
    print(f"\n  Number of missing values in the dataset: {missing_count} in following columns:")
    for col in df.columns:
        if df[col].isna().sum() > 0:
            print(f"    {col}")

# Count duplicate rows in the dataset

duplicate_count = df.duplicated().sum()
if duplicate_count == 0:
    print("\nNo duplicate rows in the dataset.")
else:
    print(f"\n  Number of duplicate rows in the dataset: {duplicate_count} in following rows:")
    duplicated_df = df[df.duplicated()]
    for col in duplicated_df.columns:
        print(f"    {col}")

No missing values in the dataset.

No duplicate rows in the dataset.


In [7]:
# Rename columns for easier access in analysis

df = df.rename(
    columns={
        "Observed Length (m)": "length_m",
        "Observed Weight (kg)": "weight_kg",
        "Observation ID": "observation_ID",
        "Scientific Name": "scientific_name",
        "Age Class": "age_class",
        "Date of Observation": "observation_date",
        "Location": "location",
        "Country/Region": "country",
        "Common Name": "common_name",
        "Genus": "genus",
        "Habitat Type": "habitat_type",
        "Conservation Status": "conservation_status",
        "Observer Name": "observer_name",
        "Sex": "sex",
        "Notes": "notes"

    }
)

# Convert observation_date column from string to datetime
df["observation_date"] = pd.to_datetime(df["observation_date"])

print(df.columns)

Index(['observation_ID', 'common_name', 'scientific_name', 'genus', 'length_m',
       'weight_kg', 'age_class', 'sex', 'observation_date', 'country',
       'habitat_type', 'conservation_status', 'observer_name', 'notes'],
      dtype='object')


  df["observation_date"] = pd.to_datetime(df["observation_date"])


In [8]:
# Print value counts for each categorical column
print("Categorical column value counts:")
cat_columns = df.select_dtypes(include=['object'])
for col in cat_columns:
    print(f"\nValue counts for column {df[col].value_counts(dropna=False)}:")

# Display summary statistics for numerical columns: length_m and weight_kg

print("**************************************")
display(df[["length_m", "weight_kg"]].describe())

Categorical column value counts:

Value counts for column common_name
New Guinea Crocodile                         68
Borneo Crocodile (disputed)                  67
American Crocodile                           66
Morelet's Crocodile                          64
Cuban Crocodile                              59
Orinoco Crocodile                            58
Philippine Crocodile                         58
Saltwater Crocodile                          58
West African Dwarf Crocodile                 57
Central African Slender-snouted Crocodile    56
West African Slender-snouted Crocodile       55
West African Crocodile                       52
Hall's New Guinea Crocodile                  49
Congo Dwarf Crocodile                        48
Nile Crocodile                               48
Mugger Crocodile (Marsh Crocodile)           47
Siamese Crocodile                            45
Freshwater Crocodile (Johnstone's)           45
Name: count, dtype: int64:

Value counts for column scientific_nam

Unnamed: 0,length_m,weight_kg
count,1000.0,1000.0
mean,2.41511,155.7719
std,1.097542,175.186788
min,0.14,4.4
25%,1.6375,53.225
50%,2.43,100.6
75%,3.01,168.875
max,6.12,1139.7


In [9]:
# Create a box plot for crocodile lengths

fig_length = px.box(
    df, 
    y="length_m", 
    points="outliers",
    color_discrete_sequence=["indianred"], 
    title="Crocodile Length Distribution (m)"
)
fig_length.show()

In [10]:
# Create a box plot for crocodile weights

fig_length = px.box(
    df, 
    y="weight_kg", 
    points="outliers",
    color_discrete_sequence=["indianred"], 
    title="Crocodile weight Distribution (kg)"
)
fig_length.show()

In [11]:
# Function to detect outliers in a given column using the IQR method

def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df[(df[column] < lower) | (df[column] > upper)]      # Filter rows where the column value is outside the bounds
    return outliers


In [12]:
# Detect outliers in length and weight columns
length_outliers = detect_outliers_iqr(df, "length_m")
weight_outliers = detect_outliers_iqr(df, "weight_kg")

# Print percentage of outliers relative to dataset size
print(f"Length outlier Percentage: {100 * length_outliers.shape[0] / df.shape[0]:.2f}%")
print(f"Weight outlier Percentage: {100 * weight_outliers.shape[0] / df.shape[0]:.2f}%")


Length outlier Percentage: 1.90%
Weight outlier Percentage: 11.50%


In [13]:
# Filter crocodiles classified as "giants" (length > 5m and adults only)
giants = df[(df["length_m"] > 5) & (df["age_class"] == "Adult")] 

# Create a scatter plot of length vs weight
fig = px.scatter(
    df,
    x="length_m",
    y="weight_kg",
    color=df["length_m"] > 4.3, # Highlight crocodiles with length > 4.3m as potential "giants"
    title="Normal vs Giant Crocodiles (Length > 5m)",
    labels={"color": "Giant Crocodile"}
)
fig.show()

display(giants)

Unnamed: 0,observation_ID,common_name,scientific_name,genus,length_m,weight_kg,age_class,sex,observation_date,country,habitat_type,conservation_status,observer_name,notes
53,54,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.83,953.9,Adult,Unknown,2011-05-19,Papua New Guinea,Coastal Wetlands,Least Concern,Gary Santiago,Form really explain war spend nearly lawyer fi...
59,60,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.43,818.2,Adult,Male,2015-07-18,Indonesia,Coastal Wetlands,Least Concern,Brittany Hanna,Maybe opportunity thank them key moment lead r...
79,80,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.07,791.7,Adult,Female,2018-03-05,Malaysia,Mangroves,Least Concern,Larry Garcia,Table prepare shoulder result Democrat later d...
119,120,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.22,809.0,Adult,Male,2006-09-07,Malaysia,Coastal Wetlands,Least Concern,Morgan Valencia,Candidate have no five letter environment easy...
157,158,Saltwater Crocodile,Crocodylus porosus,Crocodylus,6.02,1032.7,Adult,Male,2008-02-22,Philippines,Tidal Rivers,Least Concern,Angela Chung,North wrong difficult range summer president t...
166,167,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.48,878.6,Adult,Unknown,2025-06-22,Papua New Guinea,Coastal Wetlands,Least Concern,Heather Raymond,Many pattern leader career according how succe...
182,183,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.32,868.3,Adult,Female,2008-09-24,Malaysia,Estuaries,Least Concern,Charles Clark,South now mother others collection without fal...
207,208,Saltwater Crocodile,Crocodylus porosus,Crocodylus,5.91,906.6,Adult,Male,2016-11-06,Australia,Tidal Rivers,Least Concern,Sylvia Jones,Cup position speak begin suggest speak business.
209,210,Saltwater Crocodile,Crocodylus porosus,Crocodylus,6.02,1085.6,Adult,Female,2020-09-28,Papua New Guinea,Estuaries,Least Concern,Danielle Phillips,Third look because him information poor someth...
272,273,Nile Crocodile,Crocodylus niloticus,Crocodylus,5.18,848.2,Adult,Unknown,2007-02-01,Tanzania,Marshes,Least Concern,Lisa Haley,High someone box either rich agency might side.


In [14]:
# Slice dataset to only include crocodiles observed in "Rivers" or "Swamps"
habitat_slice = df.loc[
    df["habitat_type"].isin(["Rivers", "Swamps"]),
    ["common_name","length_m","weight_kg","age_class","sex","country","habitat_type"]
]

# Violin plot to compare length by habitat type (rivers vs swamps)
fig = px.violin(
    habitat_slice,
    x="habitat_type", y="length_m",
    color="habitat_type",
    box=True,
    points="all",
    title="Crocodile Length Distributions in Rivers vs Swamps"
)
fig.show()

In [22]:
# Group by species and group by average length and weight
species_agg = df.groupby("common_name", as_index=False)[["length_m", "weight_kg"]].agg("mean").reset_index()

# Scatter plot of average length vs. weight for each species
fig = px.scatter(
    species_agg,
    x="length_m",
    y="weight_kg",
    title="Average Length vs. Weight by Species",
    size="weight_kg",
    hover_name="common_name",
    color="common_name"
)

fig.show()

In [None]:
# Create a new column that calculates the difference between each crocodile's weight and the average weight of its species
df["Weight_diff"] = (
    df["weight_kg"] - df.groupby("common_name")["weight_kg"].transform("mean")
)

display(df.head())

Unnamed: 0,observation_ID,common_name,scientific_name,genus,length_m,weight_kg,age_class,sex,observation_date,country,habitat_type,conservation_status,observer_name,notes,Weight_diff
0,1,Morelet's Crocodile,Crocodylus moreletii,Crocodylus,1.9,62.0,Adult,Male,2018-03-31,Belize,Swamps,Least Concern,Allison Hill,Cause bill scientist nation opportunity.,-22.984375
1,2,American Crocodile,Crocodylus acutus,Crocodylus,4.09,334.5,Adult,Male,2015-01-28,Venezuela,Mangroves,Vulnerable,Brandon Hall,Ago current practice nation determine operatio...,145.5
2,3,Orinoco Crocodile,Crocodylus intermedius,Crocodylus,1.08,118.2,Juvenile,Unknown,2010-12-07,Venezuela,Flooded Savannas,Critically Endangered,Melissa Peterson,Democratic shake bill here grow gas enough ana...,-177.548276
3,4,Morelet's Crocodile,Crocodylus moreletii,Crocodylus,2.42,90.4,Adult,Male,2019-11-01,Mexico,Rivers,Least Concern,Edward Fuller,Officer relate animal direction eye bag do.,5.415625
4,5,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylus,3.75,269.4,Adult,Unknown,2019-07-15,India,Rivers,Vulnerable,Donald Reid,Class great prove reduce raise author play mov...,108.157447


In [None]:
# function to extract the mode for each species
def top_habitat(group):
    m = group["habitat_type"].mode(dropna=True)
    return m.iloc[0]

species_habitat = (
    df.groupby("common_name")
      .apply(top_habitat)
      .reset_index(name="primary_habitat")
)

species_habitat





Unnamed: 0,common_name,primary_habitat
0,American Crocodile,Mangroves
1,Borneo Crocodile (disputed),Estuarine Systems
2,Central African Slender-snouted Crocodile,Forest Rivers
3,Congo Dwarf Crocodile,Forest Swamps
4,Cuban Crocodile,Swamps
5,Freshwater Crocodile (Johnstone's),Billabongs
6,Hall's New Guinea Crocodile,Freshwater Wetlands
7,Morelet's Crocodile,Rivers
8,Mugger Crocodile (Marsh Crocodile),Reservoirs
9,New Guinea Crocodile,Swamps


In [None]:
df["year"] = df["observation_date"].dt.year # Extract year from observation_date and create a new column

obs_per_year = df.groupby("year").size().reset_index(name="count") # Count number of observations per year

display(obs_per_year)

Unnamed: 0,year,count
0,2005,42
1,2006,40
2,2007,52
3,2008,50
4,2009,42
5,2010,47
6,2011,52
7,2012,56
8,2013,49
9,2014,44


In [None]:
# Line chart of number of crocodile observations per year
fig = px.line(
    obs_per_year,
    x="year", y="count",
    title="Number of Crocodile Observations per Year",
    markers=True
)
fig.show()

In [None]:
# Filter dataset for adult female crocodiles and display the result
display(df.query("sex == 'Female' and age_class == 'Adult'"))

Unnamed: 0,observation_ID,common_name,scientific_name,genus,length_m,weight_kg,age_class,sex,observation_date,country,habitat_type,conservation_status,observer_name,notes,Weight_diff,year
14,15,West African Dwarf Crocodile,Osteolaemus tetraspis,Osteolaemus,1.32,19.2,Adult,Female,2010-06-14,Côte d'Ivoire,Small Streams,Vulnerable,Matthew Lucas,I fund technology eat couple large.,5.177193,2010
17,18,Nile Crocodile,Crocodylus niloticus,Crocodylus,4.37,565.0,Adult,Female,2025-10-13,Sudan,Reservoirs,Least Concern,Amy Edwards,Once me system church whether bag.,109.039583,2025
26,27,Mugger Crocodile (Marsh Crocodile),Crocodylus palustris,Crocodylus,3.39,174.9,Adult,Female,2022-07-20,Pakistan,Rivers,Vulnerable,Donald Jones,Go meeting quickly such former agree theory en...,13.657447,2022
29,30,Orinoco Crocodile,Crocodylus intermedius,Crocodylus,4.71,576.6,Adult,Female,2018-08-07,Venezuela,Flooded Savannas,Critically Endangered,Jamie Adkins,Past feeling nature a expert involve oil.,280.851724,2018
31,32,American Crocodile,Crocodylus acutus,Crocodylus,4.11,324.4,Adult,Female,2014-01-10,USA (Florida),Mangroves,Vulnerable,Eric Drake,Agency each little sure authority increase pic...,135.400000,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,981,West African Dwarf Crocodile,Osteolaemus tetraspis,Osteolaemus,1.01,11.3,Adult,Female,2012-03-09,Sierra Leone,Small Streams,Vulnerable,David Tapia,War product action our gas heart field people ...,-2.722807,2012
981,982,Freshwater Crocodile (Johnstone's),Crocodylus johnstoni,Crocodylus,2.14,57.7,Adult,Female,2022-04-16,Australia,Gorges,Least Concern,Meghan Rowland,Floor all red technology himself much better.,3.177778,2022
988,989,Siamese Crocodile,Crocodylus siamensis,Crocodylus,3.04,163.6,Adult,Female,2023-06-17,Thailand,Slow Rivers,Critically Endangered,Robert Jordan DDS,Play blood list everything newspaper instituti...,59.284444,2023
994,995,Morelet's Crocodile,Crocodylus moreletii,Crocodylus,3.40,159.5,Adult,Female,2017-10-26,Guatemala,Rivers,Least Concern,Seth Norman,Cover risk health last student different.,74.515625,2017
