In [None]:
# Import libs + packages

# scraping
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time
import requests        # HTTP client used to fetch pages and subtitle ZIPs

# reading the subtitles
from io import BytesIO
from zipfile import ZipFile
import os             # folder listing / path handling
import pysrt          # !pip install pysrt

# dataframes
import pandas as pd   # used by read_all_subs and scrape_fishbase

# filling the wikipedia / fishbase column
import wikipedia      # !pip install wikipedia
import random         # for delays

## Step 1: Scrape for subtitles
* If possible, run this step on Google Colab to avoid request limits or IP blocking.
* Otherwise, leave generous time delays between requests.

In [7]:
# Site root, and the English-subtitle search page for the series (idmovie-541228)
base_url = "https://www.opensubtitles.org"
movie_page = f"{base_url}/en/ssearch/sublanguageid-eng/idmovie-541228"

In [8]:
# Fetch the series page and parse its HTML.
# A bare requests.get() sends the default python-requests User-Agent, and the
# recorded run of this notebook got HTTP 403 back. Identify as a browser and set
# a timeout so a stalled connection cannot hang the kernel.
# NOTE(review): the site may still refuse automated clients - check
# response.status_code before trusting `soup`.
request_headers = {"User-Agent": "Mozilla/5.0 (compatible; SubtitleScraper/1.0)"}
response = requests.get(movie_page, headers=request_headers, timeout=30)
soup = BeautifulSoup(response.text, "html.parser")

In [9]:
# HTTP status code of the series-page request
response.status_code # 200 means the page was served; 403 means the site refused the request

403

In [None]:
# Collect every anchor tag whose href mentions "imdbid" (the per-episode links)
def _mentions_imdbid(href):
    """True for a non-empty href that contains "imdbid"."""
    return bool(href) and "imdbid" in href


episode_links = soup.find_all("a", href=_mentions_imdbid)
episode_links;

In [None]:
# Turn each episode anchor into an absolute URL.
# urljoin handles hrefs that are root-relative ("/en/..."), relative without a
# leading slash, or already absolute; plain string concatenation was only
# correct for the first of those.
full_links = [urljoin(base_url, link["href"]) for link in episode_links]
full_links;

In [None]:
# Narrow the absolute URLs down to the ones that contain "download"
download_links = list(filter(lambda url: "download" in url, full_links))

In [None]:
# First download link of every episode on the main River Monsters page:
# drop every URL that mentions "season".
# Note: this rebinds `episode_links` from anchor tags to plain URL strings.

episode_links = [url for url in download_links if "season" not in url]
episode_links;

In [None]:
# Links handed to the download loop: currently every episode link
# (this is the same list object as episode_links, not a copy)
test_links = episode_links
test_links;

In [None]:
# Download every subtitle ZIP and save its .srt files into ONE flat folder,
# pausing between requests.

# Destination folder. The loop previously used `base_folder` without it ever being
# defined, so each extraction raised NameError, which the broad `except` below
# swallowed. "all_subs" is the folder the reading step lists.
base_folder = "all_subs"
os.makedirs(base_folder, exist_ok=True)


def _save_srt_members(zip_bytes, dest_folder):
    """Write every .srt member of an in-memory ZIP straight into dest_folder.

    Members are stored under their base name, so archives that contain
    sub-folders do not leave nested directories behind.

    Returns:
        list: The file names that were written (empty if the ZIP has no .srt).
    """
    saved = []
    with ZipFile(BytesIO(zip_bytes)) as archive:  # closed even if a member fails
        for member in archive.namelist():
            if not member.endswith(".srt"):
                continue
            target_name = os.path.basename(member)
            with open(os.path.join(dest_folder, target_name), "wb") as out_file:
                out_file.write(archive.read(member))
            saved.append(target_name)
    return saved


for url in test_links:
    try:
        print(f"Downloading {url} ...")
        response = requests.get(url, timeout=60)  # never hang on a stalled download
        response.raise_for_status()

        saved_names = _save_srt_members(response.content, base_folder)
        if not saved_names:
            print("No .srt files found in this ZIP.")
        for saved_name in saved_names:
            print(f"Extracted {saved_name} to {base_folder}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next link.
        print(f"Error processing {url}: {e}")

    # Fixed pause after every link, whether it succeeded or not.
    time.sleep(20)

print("All downloads completed!")

## Step 2: Read the subtitles
* Use pysrt to parse each subtitle file into plain text that Python can work with.
* Collect one transcript per episode into a DataFrame.

In [19]:
# Print ONE parsed subtitle file, entry by entry
def read_sub(subs):
    """Print the text of every subtitle entry, each followed by a blank line.

    Args:
        subs: Iterable of entries that expose a `.text` attribute.

    Returns:
        None. The text is only printed, never returned.
    """
    for entry in subs:
        print(entry.text, end="\n\n")

In [20]:
# Find the subtitle files that are ready to be read
def list_srt_files(base_folder):
    """
    Collect the names of the .srt files stored directly in a folder.

    Args:
        base_folder (str): Folder to look in.

    Returns:
        list: File names (no directory part) ending in ".srt", in the
        order os.listdir reports them.
    """
    srt_names = []
    for entry in os.listdir(base_folder):
        if entry.endswith(".srt"):
            srt_names.append(entry)
    return srt_names

In [18]:
# File names of every .srt found in the 'all_subs' folder,
# ready to be read into a DataFrame

all_eps_srt = list_srt_files(base_folder='all_subs')
all_eps_srt;

In [None]:
# Read every subtitle file and collect the transcripts into one DataFrame

def read_all_subs(all_eps, base_folder='all_subs'):
    """Build a DataFrame with one row per subtitle file.

    Args:
        all_eps: Iterable of .srt file names (not full paths).
        base_folder (str): Folder the file names are relative to.

    Returns:
        pd.DataFrame: Columns `episode_name` (file name without the
        "River.Monsters." prefix and ".srt" suffix, dots turned into spaces),
        `english_name` (None placeholder) and `transcript` (the text of every
        subtitle entry, joined with newlines). Empty input gives an empty
        frame with the same three columns.
    """
    columns = ['episode_name', 'english_name', 'transcript']
    rows = []

    for ep in all_eps:
        # clean episode name (remove prefix/suffix)
        episode_name = re.sub(r'^River\.Monsters\.|\.srt$', '', ep).replace('.', ' ')

        # Parse the file itself. Previously the file *name* was handed to read_sub(),
        # which iterates its argument and only prints, so no transcript was captured.
        path = os.path.join(base_folder, ep)
        try:
            subs = pysrt.open(path)
        except UnicodeDecodeError:
            # Not decodable with the default encoding: latin-1 accepts any byte sequence.
            subs = pysrt.open(path, encoding='iso-8859-1')

        rows.append({
            'episode_name': episode_name,
            'english_name': None,           # or extract species if available
            'transcript': '\n'.join(sub.text for sub in subs),
        })

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Build the episode table from the listed subtitle files (one row per file)
df = read_all_subs(all_eps_srt)

In [None]:
## add english_name and latin_name
# Scientific name of the featured species, one entry per row of `df`.
# The list is positional: entry i labels row i, so its order and length must match
# the rows of `df`, which follow the order of the subtitle file listing.
# The capitalised second word is kept because the electric-eel override matches
# on the exact string 'Electrophorus Electricus'.
# Fix: the short-tailed river stingray is Potamotrygon brachyura; the previous
# 'Potamotrygon Brachypomus' repeated the pacu's species epithet.
df['latin_name'] = ['Hydrocynus Goliath', 'Urogymnus Polylepis', 'Channa Argus',
                     'Heterobranchus Longifilis', 'Sinosturio Transmontanus', 'Lates Niloticus',
                     'Pygocentrus Nattereri', 'Bagarius Yarrelli', 'Atractosteus Spatula',
                     'Silurus Glanis', 'Arapaima Gigas', 'Hydrolycus Scomberoides',
                     'Pristis Pristis', 'Electrophorus Electricus', 'Anguilla Dieffenbachii',
                     'Carcharhinus Leucas', 'Hoplias Aimara', 'Potamotrygon Brachyura',
                     'Piaractus Brachypomus']


In [None]:
# Common name of the featured species; positional, one entry per row of `df`
english_names = [
    'Goliath Tigerfish',
    'Giant Freshwater Stingray',
    'Northern Snakehead',
    'Tsuni Catfish',
    'White Sturgeon',
    'Nile Perch',
    'Red Bellied Piranha',
    'Goonch Catfish',
    'Alligator Gar',
    'Wels Catfish',
    'Arapaima',
    'Payara',
    'Largetooth Sawfish',
    'Electric Eel',
    'New Zealand Longfin Eel',
    'Bull Shark',
    'Wolf Fish',
    'Short-Tailed River Stingray',
    'Red-Bellied Pacu',
]
df['english_name'] = english_names


In [None]:
# Put the three name columns ahead of the transcript
column_order = ['episode_name', 'english_name', 'latin_name', 'transcript']
df = df[column_order]

In [None]:
# save to csv
# df.to_csv('river_monsters.csv')

## Step 3: Scrape for wiki/fishbase
* Scrape Wikipedia and FishBase for a description of each species.
* The electric eel is a special case: its Wikipedia text is pasted in by hand.

In [21]:
# Look up the Wikipedia page of each species and store its text in the DataFrame

def return_desc(df, delay_range=(30, 60)):
    """Add a `wiki_desc` column holding the Wikipedia page content per species.

    Args:
        df: DataFrame with a `latin_name` column. It is modified in place.
        delay_range: (min, max) seconds for the random pause between two
            lookups. Defaults to the original 30-60 seconds.

    Returns:
        The same DataFrame object, with `wiki_desc` set. A disambiguation page
        is stored as "Disambiguation: <options>" and a missing page as
        "Page not found". Any other exception propagates to the caller.
    """
    names = list(df['latin_name'])
    wiki_desc_list = []

    for position, name in enumerate(names):
        try:
            wiki_desc_list.append(wikipedia.page(name).content)
        except wikipedia.exceptions.DisambiguationError as e:
            wiki_desc_list.append(f"Disambiguation: {e.options}")
        except wikipedia.exceptions.PageError:
            wiki_desc_list.append("Page not found")

        # Pause between lookups only; nothing follows the last one, so
        # sleeping there would just waste time.
        if position < len(names) - 1:
            delay = random.uniform(*delay_range)
            print(f"Sleeping for {delay:.1f} seconds...")
            time.sleep(delay)

    df['wiki_desc'] = wiki_desc_list
    return df


#df = return_desc(df)
#df.to_csv('river_monsters_complete.csv')

In [None]:
# Scrape FishBase and append its summary text to the wiki_desc column

def scrape_fishbase(df: pd.DataFrame, delay_range=(30, 120)) -> pd.DataFrame:
    """Fetch the FishBase summary page of every species and add its text.

    Args:
        df: DataFrame with a `latin_name` column. It is not modified; a copy
            is returned.
        delay_range: (min, max) seconds for the random pause between two
            requests. Defaults to the original 30-120 seconds.

    Returns:
        pd.DataFrame: Copy of `df` whose `wiki_desc` column holds the existing
        text (if the column was present), a newline, and the FishBase text.
        A species whose page fails to load, or has no "ss-main" div,
        contributes an empty string.
    """
    df = df.copy()
    new_descs = []

    headers = {"User-Agent": "Mozilla/5.0 (compatible; FishScraper/1.0)"}
    latin_names = list(df["latin_name"])

    for position, latin in enumerate(latin_names):
        try:
            url = f"https://www.fishbase.se/summary/{latin.replace(' ', '%20')}.html"
            print(f"Fetching: {url}")

            resp = requests.get(url, headers=headers, timeout=20)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # Grab everything inside the main summary div
            main_div = soup.find("div", {"id": "ss-main"})
            text = main_div.get_text(" ", strip=True) if main_div else ""

        except Exception as e:
            # Best effort: one failed species must not abort the whole run.
            print(f"❌ Failed for {latin}: {e}")
            text = ""

        new_descs.append(text)

        # Random pause between requests only; nothing follows the last one.
        if position < len(latin_names) - 1:
            delay = random.uniform(*delay_range)
            print(f"Sleeping for {delay:.1f} seconds...")
            time.sleep(delay)

    # If wiki_desc already exists, append; otherwise just set
    if "wiki_desc" in df.columns:
        # Give the new text the frame's own index. A default RangeIndex would be
        # label-aligned against df.index and yield NaN for a filtered or
        # re-indexed frame.
        fishbase_text = pd.Series(new_descs, index=df.index, dtype=object)
        df["wiki_desc"] = df["wiki_desc"].fillna("").astype(str) + "\n" + fishbase_text
    else:
        df["wiki_desc"] = new_descs

    return df

In [None]:
# Special case for the electric eel: despite attempts to specify and direct the lookup, wikipedia could not tell this page apart from three entries, so the article text is pasted in by hand.


eel_wiki = ''' Electrophorus electricus is the best-known species of electric eel. It is a South American electric fish. Until the discovery of two additional species in 2019, the genus was classified as the monotypic, with this species the only one in the genus.[3] Despite the name, it is not an eel, but rather a knifefish.[4] It is considered as a freshwater teleost which contains an electrogenic tissue that produces electric discharges.[5]

Taxonomic history The species has been reclassified several times. When originally described by Carl Linnaeus in 1766, he used the name Gymnotus electricus, placing it in the same genus as Gymnotus carapo (banded knifefish) which he had described several years earlier. It was only about a century later, in 1864, that the electric eel was moved to its own genus Electrophorus by Theodore Gill.[6]

In September 2019, David de Santana et al. suggested the division of the genus into three species based on DNA divergence, ecology and habitat, anatomy and physiology, and electrical ability: E. electricus, E. voltai sp. nov., and E. varii sp. nov. The study found E. electricus to be the sister species to E. voltai, with both species diverging during the Pliocene.[3]

Anatomy

Comparison of the three species of Electrophorus E. electricus has an elongated, cylindrical body, typically growing to about 2 m (6 ft 7 in) in length, and 20 kg (44 lb) in weight.[7] Their coloration is dark gray-brown on the back and yellow or orange on the belly. Mature females have a darker abdomen. They have no scales. The mouth is square and positioned at the end of the snout. The anal fin extends the length of the body to the tip of the tail. As in other ostariophysan fishes, the swim bladder has two chambers. The anterior chamber is connected to the inner ear by a series of small bones derived from neck vertebrae called the Weberian apparatus, which greatly enhances its hearing capability. The posterior chamber extends along the whole length of the body and maintains the fish's buoyancy.

E. electricus has a vascularized respiratory system with gas exchange occurring through epithelial tissue in its buccal cavity.[8] As obligate air-breathers, E. electricus must rise to the surface every ten minutes or so to inhale before returning to the bottom. Nearly eighty percent of the oxygen used by the fish is obtained in this way.[9]

Physiology Further information: Electric eel § Electrophysiology E. electricus has three pairs of abdominal organs that produce electricity: the main organ, Hunter's organ, and Sachs' organ. These organs occupy a large part of its body, and give the electric eel the ability to generate two types of electric organ discharges: low voltage and high voltage. These organs are made of electrocytes, lined up so a current of ions can flow through them and stacked so each one adds to a potential difference.[10] The three electrical organs are developed from muscle and exhibit several biochemical properties and morphological features of the muscle sarcolemma; they are found symmetrically along both sides of the eel.[5]

When the eel finds its prey, the brain sends a signal through the nervous system to the electrocytes. This opens the ion channels, allowing sodium to flow through, reversing the polarity momentarily. By causing a sudden difference in electric potential, it generates an electric current in a manner similar to a battery, in which stacked plates each produce an electric potential difference.[10] Electric eels are also capable of controlling their prey's nervous systems with their electrical abilities; by controlling their victim's nervous system and muscles via electrical pulses, they can keep prey from escaping or force it to move so they can locate its position.[11][12]

Electric eels use electricity in multiple ways. Low voltages are used to sense the surrounding environment. High voltages are used to detect prey and, separately, stun them, at which point the electric eel applies a suction-feeding bite.[13]

Anatomy of an electric eel's electric organs Sachs' organ is associated with electrolocation. Inside the organ are many muscle-like cells, called electrocytes. Each cell produces 0.15 V, the cells being stacked in series to enable the organ to generate nearly 10 V at around 25 Hz in frequency. These signals are emitted by the main organ; Hunter's organ can emit signals at rates of several hundred hertz.[14]

There are several physiological differences among the three electric organs, which allow them to have very different functions. The main electrical organ and the strong-voltage section of Hunter's organ are rich in calmodulin, a protein that is involved in high-voltage production.[15] Additionally, the three organs have varying amounts of Na+/K+-ATPase, which is a Na+/K+ ion pump that is crucial in the formation of voltage. The main and Hunter's organs have a high expression of this protein, giving it a high sensitivity to changes in ion concentration, whereas Sachs' organ has a low expression of this protein.[16]

The typical output is sufficient to stun or deter virtually any animal. The eels can vary the intensity of the electric discharge, using lower discharges for hunting and higher intensities for stunning prey or defending themselves. They can also concentrate the discharge by curling up and making contact at two points along its body.[17] When agitated, they can produce these intermittent electric shocks over at least an hour without tiring.[citation needed]

E. electricus also possesses high frequency–sensitive tuberous receptors, which are distributed in patches over its body. This feature is apparently useful for hunting other Gymnotiformes.[14] E. electricus has been prominent in the study of bioelectricity since the 18th century.[18] The species is of some interest to researchers, who make use of its acetylcholinesterase and adenosine triphosphate.[19][20]

Despite being the first described species in the genus and thus the most famous example, E. electricus actually has the weakest maximum voltage of the three species in the genus, at only 480 volts (as opposed to 572 volts in E. varii and 860 volts in E. voltai).[3]

Ecology and life history

Electric eel at the New England Aquarium Habitat E. electricus is restricted to freshwater habitats in the Guiana Shield. Populations in the Amazon basin, Brazilian Shield, and other parts of the Guiana Shield are now thought to belong to E. varii and E. voltai.[21]

Feeding ecology E. electricus feeds on invertebrates, although adult eels may also consume fish and small mammals, such as rats. First-born hatchlings eat other eggs and embryos from later clutches.[14] The juveniles eat invertebrates, such as shrimp and crabs.

Reproduction E. electricus is known for its unusual breeding behavior. In the dry season, a male eel makes a nest from his saliva into which the female lays her eggs. As many as 3,000 young hatch from the eggs in one nest. Males grow to be larger than females[22][23] by about 35 cm (14 in).'''

In [None]:
# Overwrite the electric eel's description with the hand-pasted article text
is_electric_eel = df['latin_name'] == 'Electrophorus Electricus'
df.loc[is_electric_eel, 'wiki_desc'] = eel_wiki

In [None]:
#fishbase_df = scrape_fishbase(df)

In [None]:
#fishbase_df.to_csv('RM_complete.csv', index=False)