## 1. Problem Statement
The objective of this notebook is to perform an initial exploratory data analysis (EDA)
and data cleaning process on a global species distribution dataset.
The goal is to identify data quality issues and prepare a clean dataset
for downstream geospatial and comparative analysis.

## 2. Data Sources
The dataset used in this analysis was obtained from the World Spider Catalog.
It contains taxonomic and distribution-related information for spider species
at a global level.

In [1]:
# library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore")

In [2]:
# uploading a CSV File
df = pd.read_csv('/Users/yayo/Documents/GitHub/jumping_spider_salticidae/data/raw/world_spider_catalog.csv')

## 3. Exploratory Data Analysis (EDA)

### 3.1 Structure and data types

In [3]:
# dataframe dimensions (rows and columns)
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of rows: 64291
Number of columns: 13


In [4]:
# columns of the dataframe
df.columns

Index(['speciesId', 'species_legacy_id', 'species_lsid', 'family', 'genus',
       'species', 'subspecies', 'author', 'year', 'parentheses',
       'distribution', 'validSpeciesId', 'taxonStatus'],
      dtype='object')

In [5]:
# data loaded
# display the first few rows of the dataframe
df.head(3)

Unnamed: 0,speciesId,species_legacy_id,species_lsid,family,genus,species,subspecies,author,year,parentheses,distribution,validSpeciesId,taxonStatus
0,6625,1.0,urn:lsid:nmbe.ch:spidersp:000896,Actinopodidae,Actinopus,caraiba,,Simon,1889,1,Venezuela,,VALID
1,6626,2.0,urn:lsid:nmbe.ch:spidersp:000898,Actinopodidae,Actinopus,crassipes,,Keyserling,1891,1,"Brazil, Paraguay, Argentina",,VALID
2,6627,3.0,urn:lsid:nmbe.ch:spidersp:000899,Actinopodidae,Actinopus,cucutaensis,,Mello-Leitão,1941,0,"Colombia, Venezuela, Brazil",,VALID


In [6]:
# identify the data type of each variable
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64291 entries, 0 to 64290
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   speciesId          64291 non-null  int64  
 1   species_legacy_id  64139 non-null  float64
 2   species_lsid       64291 non-null  object 
 3   family             64291 non-null  object 
 4   genus              64291 non-null  object 
 5   species            64291 non-null  object 
 6   subspecies         937 non-null    object 
 7   author             64291 non-null  object 
 8   year               64291 non-null  object 
 9   parentheses        64291 non-null  int64  
 10  distribution       64142 non-null  object 
 11  validSpeciesId     10600 non-null  float64
 12  taxonStatus        64291 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 6.4+ MB


### 3.2 Missing values

In [7]:
df.isna().sum().sort_values(ascending=False)

subspecies           63354
validSpeciesId       53691
species_legacy_id      152
distribution           149
speciesId                0
species_lsid             0
family                   0
genus                    0
species                  0
author                   0
year                     0
parentheses              0
taxonStatus              0
dtype: int64

In [8]:
(df.isna().mean() * 100).sort_values(ascending=False)

subspecies           98.542564
validSpeciesId       83.512467
species_legacy_id     0.236425
distribution          0.231759
speciesId             0.000000
species_lsid          0.000000
family                0.000000
genus                 0.000000
species               0.000000
author                0.000000
year                  0.000000
parentheses           0.000000
taxonStatus           0.000000
dtype: float64

In [9]:
# Count missing values per variable
missing_values = df.isnull().sum()

# Calculate completeness percentage
completeness = pd.DataFrame(
    100 - (missing_values / len(df) * 100),
    columns=["completeness"]
)

# Reset index and rename columns
completeness = (
    completeness
    .reset_index()
    .rename(columns={"index": "variable"})
)

# Sort variables by completeness (ascending)
completeness = completeness.sort_values(
    by="completeness",
    ascending=True
)

completeness


Unnamed: 0,variable,completeness
6,subspecies,1.457436
11,validSpeciesId,16.487533
1,species_legacy_id,99.763575
10,distribution,99.768241
0,speciesId,100.0
2,species_lsid,100.0
3,family,100.0
4,genus,100.0
5,species,100.0
7,author,100.0


### 3.3 Duplicates

In [10]:
df.duplicated().sum()

np.int64(3)

In [11]:
# occurrence count by species
species_counts = df["species"].value_counts()
species_counts.head(10)

species
simoni       123
gertschi      94
similis       86
longipes      84
pallida       82
bicolor       78
affinis       77
gracilis      76
australis     74
elegans       72
Name: count, dtype: int64

In [12]:
# species that appear more than once
duplicated_species = species_counts[species_counts > 1]
duplicated_species.head(10)

species
simoni       123
gertschi      94
similis       86
longipes      84
pallida       82
bicolor       78
affinis       77
gracilis      76
australis     74
elegans       72
Name: count, dtype: int64

## 4. Data Cleaning

In [13]:
# eliminacion de los datos duplicados
df.drop_duplicates(inplace=True)
df = df.dropna(subset=['distribution'])

In [14]:
# se verifica la completitud de las variables (valores nulos existentes)
# se utiliza la función .sum() para sumar los elementos que están vacíos (.isnull())
nulos = df.isnull().sum()

# calculo del porcentaje de completitud
completitud = pd.DataFrame(100 - (nulos / len(df) * 100))

# reset_index se utiliza para resetear los índices
# inplace = True sirve para evitar la repetición
completitud.reset_index(inplace = True)
# se coloan encebezados
completitud = completitud.rename(columns = {"index":"variable",0:"completitud"})

# ordenamos las columnas con menor completitud en adelante
completitud.sort_values(by ='completitud', ascending = True)
completitud

Unnamed: 0,variable,completitud
0,speciesId,100.0
1,species_legacy_id,99.778606
2,species_lsid,100.0
3,family,100.0
4,genus,100.0
5,species,100.0
6,subspecies,1.457771
7,author,100.0
8,year,100.0
9,parentheses,100.0


In [15]:
# dimensión del dataframe con eliminacion de datos duplicados
print(f"Número de filas: {df.shape[0]}")
print(f"Número de columnas: {df.shape[1]}")

Número de filas: 64139
Número de columnas: 13


## Filtrado de registros

### Arañas salticidae en México y otros paÍses

In [16]:
# Filtrar las filas donde 'distribution' sea 'Mexico' y 'family' sea 'Salticidae'
df_salticidae_mexico_world = df[
    (df['distribution'].str.contains('Mexico', na=False, case=False)) &
    (df['family'] == 'Salticidae')
]

# Verificar cuántos registros hay en el nuevo DataFrame
print(f"Total de registros donde 'distribution' es 'Mexico' y 'family' es 'Salticidae': {len(df_salticidae_mexico_world)}")

Total de registros donde 'distribution' es 'Mexico' y 'family' es 'Salticidae': 294


### Arañas salticidae en México

In [17]:
# Filtrar las filas donde 'distribution' sea 'Mexico' y 'family' sea 'Salticidae'
df_salticidae_mexico = df[
    (df['distribution']== 'Mexico') & 
    (df['family'] == 'Salticidae')
]

# Verificar cuántos registros hay en el nuevo DataFrame
print(f"Total de registros donde 'distribution' es 'Mexico' y 'family' es 'Salticidae': {len(df_salticidae_mexico)}")

Total de registros donde 'distribution' es 'Mexico' y 'family' es 'Salticidae': 125


### Guardar dataframes en archivos CSV

In [None]:
# Guardar el dataframe en un archivo CSV
df_salticidae_mexico_world.to_csv('/Users/yayo/Documents/GitHub/jumping_spider_salticidae/data/cleaned/salticidae_mexico_world_wsc.csv', index=False)

print("DataFrame guardado en salticidae_mexico_world_wsc.csv")

In [18]:
# Guardar el dataframe en un archivo CSV
df_salticidae_mexico.to_csv('/Users/yayo/Documents/GitHub/jumping_spider_salticidae/data/cleaned/salticidae_mexico_wsc.csv', index=False)

print("DataFrame guardado en salticidae_mexico_wsc.csv")

OSError: Cannot save file into a non-existent directory: '/Users/yayo/Documents/GitHub/jumping_spider_salticidae/data/cleaned'