## Cleaning and NLP 
* sensational corpus is the transcript for the episodes that we have
* scientific corpus is the wiki descriptions + fishbase
* document is the full contents of wiki page/fishbase page, or transcript

In [32]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# regex !!
import re

#display
from IPython.display import display, Markdown
from PIL import Image 

# words / NLP
from gensim.corpora import Dictionary   # import dictionary of english words from gensim, will help w tfidf
from wordcloud import WordCloud 
import unicodedata                      # for dealing w non ascii !

import nltk                             # famous nl libraries 
from nltk.corpus import stopwords       # curated list of english stopwords
from nltk.tokenize import word_tokenize # tokenisation function 
import string
from nltk.stem import WordNetLemmatizer # lemmatiser

# !pip install sentence-transformers
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 

#from IPython.display import Audio
#positive_chime = Audio(filename='positive_chime.wav', autoplay=True) ## lil alarm for me

## Step 1: Quick sanity checks, get to know the data 

In [11]:
df = pd.read_csv('RM_complete.csv')

In [13]:
# quick function to wrap all these quick checks into one

def sanity_check_transcripts(df):
    """
    Run quick sanity checks on df and display the results.

    Reads the columns 'transcript', 'wiki_desc' and 'episode_name'.
    Adds four length columns to df IN PLACE:
    'cc_transcript' / 'wc_transcript' (character count / whitespace-split
    word count of 'transcript') and 'cc_wiki' / 'wc_wiki' (same for 'wiki_desc').

    Returns
    -------
    The same DataFrame object that was passed in, with the four count
    columns added.
    """

    display(Markdown("**--- .info Overview ---**"))
    # df.info() prints its report itself and returns None, so the wrapping
    # print() also emits a trailing "None" line.
    print(df.info())
    print('\n')
    
    # % of missing values in each column
    display(Markdown("**--- % Missing Values ---**"))
    print(df.isnull().sum() / len(df) * 100)
    print('\n')

    # Check for exact duplicates (True means every row is unique)
    no_duplicates = df.shape[0] == df.drop_duplicates().shape[0]
    display(Markdown(f"**No duplicate rows: {no_duplicates}**\n"))

    # Check for empty (whitespace-only) transcript strings
    empty_count = (df['transcript'].str.strip() == '').sum()
    display(Markdown(f"**Empty Transcripts: {empty_count}**\n"))

    # Same check for the wiki descriptions
    empty_count = (df['wiki_desc'].str.strip() == '').sum()
    display(Markdown(f"**Empty wiki_desc: {empty_count}**\n"))
    print('\n')

    # Transcript length stats (these two columns are written onto df itself)
    df['cc_transcript'] = df['transcript'].str.len()
    df['wc_transcript'] = df['transcript'].str.split().str.len()
    display(Markdown("**--- Transcript Length Stats ---**"))
    print(df[['cc_transcript', 'wc_transcript']].describe())
    print('\n')
    
    # Wiki_desc stats (also written onto df itself)
    df['cc_wiki'] = df['wiki_desc'].str.len()
    df['wc_wiki'] = df['wiki_desc'].str.split().str.len()
    display(Markdown("**--- wiki_desc Stats ---**"))
    print(df[['cc_wiki', 'wc_wiki']].describe())
    print('\n')

    # Episodes count check
    display(Markdown("**--- Episode Counts ---**"))
    print(df['episode_name'].value_counts())
    print('\n')

    # Non-ASCII character check (transcript): count rows with at least one
    # run of characters outside \x00-\x7F.
    # NOTE(review): re.findall needs a str, so a missing value here would
    # presumably raise — fine for this dataset, which has no nulls.
    weird_chars = df['transcript'].apply(lambda x: re.findall(r'[^\x00-\x7F]+', x))
    weird_lines = weird_chars[weird_chars.str.len() > 0]
    display(Markdown(f"**Transcripts with non-ASCII characters: {len(weird_lines)}**\n"))

    # Non-ASCII character check (wiki_desc)
    weird_chars = df['wiki_desc'].apply(lambda x: re.findall(r'[^\x00-\x7F]+', x))
    weird_lines = weird_chars[weird_chars.str.len() > 0]
    display(Markdown(f"**wiki_desc with non-ASCII characters: {len(weird_lines)}**\n"))
    

    # Check for leftover timecodes like 00:01:23,456
    timecode_mask = df['transcript'].str.contains(r'\d{2}:\d{2}:\d{2},\d{3}')
    timecode_count = timecode_mask.sum()
    display(Markdown(f"**Transcripts with subtitle timecodes: {timecode_count}**\n"))


    # Longest & shortest episode names by transcript word count
    longest_ep = df.loc[df['wc_transcript'].idxmax(), 'episode_name']
    shortest_ep = df.loc[df['wc_transcript'].idxmin(), 'episode_name']
    display(Markdown(f"**Longest episode: {longest_ep}**\n"))
    display(Markdown(f"**Shortest episode: {shortest_ep}**\n"))
    print('\n')

    ## drop 'Unnamed: 0.1', 'Unnamed: 0'
    
    #df = df.drop(columns = ['Unnamed: 0.2', 'Unnamed: 0.1','Unnamed: 0'])

    return df  # same object, now carrying cc_transcript, wc_transcript, cc_wiki, wc_wiki

# check the data and also add word counts
df = sanity_check_transcripts(df)

**--- .info Overview ---**

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   episode_name  19 non-null     object
 1   english_name  19 non-null     object
 2   latin_name    19 non-null     object
 3   transcript    19 non-null     object
 4   wiki_desc     19 non-null     object
dtypes: object(5)
memory usage: 892.0+ bytes
None




**--- % Missing Values ---**

episode_name    0.0
english_name    0.0
latin_name      0.0
transcript      0.0
wiki_desc       0.0
dtype: float64




**No duplicate rows: True**


**Empty Transcripts: 0**


**Empty wiki_desc: 0**






**--- Transcript Length Stats ---**

       cc_transcript  wc_transcript
count      19.000000      19.000000
mean    29975.526316    5496.473684
std      3535.963367     679.511701
min     24742.000000    4564.000000
25%     27520.000000    5056.500000
50%     28962.000000    5279.000000
75%     32615.000000    5955.500000
max     36801.000000    6889.000000




**--- wiki_desc Stats ---**

            cc_wiki      wc_wiki
count     19.000000    19.000000
mean   14701.631579  2360.000000
std     6566.906300  1078.232144
min     5235.000000   839.000000
25%     8654.000000  1366.500000
50%    14552.000000  2298.000000
75%    19146.500000  3044.500000
max    27322.000000  4409.000000




**--- Episode Counts ---**

episode_name
Demon Fish              1
Amazon Assassins        1
Silent Assassin         1
Jungle Killer           1
Hidden Predator         1
Flesh Ripper            1
Electric Executioner    1
Chainsaw Predator       1
Amazon Flesh Eaters     1
European Maneater       1
Death Ray               1
Alligator Gar           1
Killer Catfish          1
Piranha                 1
Rift Valley Killer      1
Alaskan Horror          1
Congo Killer            1
Killer Snakehead        1
The Mutilator           1
Name: count, dtype: int64




**Transcripts with non-ASCII characters: 11**


**wiki_desc with non-ASCII characters: 19**


**Transcripts with subtitle timecodes: 0**


**Longest episode: Alaskan Horror**


**Shortest episode: Chainsaw Predator**






## Step 2: Light Cleaning - preparing for embedded semantic
* check for non ascii and sort out
* cast text cols (objects) to strings
* prepare for embedded semantic filtering
  * stopwords not removed **yet** bc can be important for context at this stage
  
 **regex remove**:
* '\n'
* '...'
* lowercase all
* remove trailing spaces
* Strip timecodes, HTML tags, captions
    * import unicodedata - for dealing w ascii like a smart person # df['transcript'] = df['transcript'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode())
* Optional: Lemmatize .. maybe do that later !


In [14]:
# check for non ascii characters
def ascii_check(df):
    """
    Display how many rows of 'transcript' and 'wiki_desc' contain non-ASCII text.

    Parameters
    ----------
    df : DataFrame with string columns 'transcript' and 'wiki_desc'.

    Returns
    -------
    None. One Markdown line per column is displayed.
    """
    # One or more consecutive characters outside the 7-bit ASCII range.
    non_ascii_run = r'[^\x00-\x7F]+'

    for col, label in (('transcript', 'Transcripts'), ('wiki_desc', 'wiki_desc')):
        # Uses the stdlib `re` module the notebook imports; the previous
        # `regex.findall` referred to a name that is never imported.
        matches = df[col].apply(lambda x: re.findall(non_ascii_run, x))
        # keep only the rows where at least one run was found
        flagged = matches[matches.str.len() > 0]
        display(Markdown(f"**{label} with non-ASCII characters: {len(flagged)}**\n"))

In [15]:
def clean_df(df):
    """
    Strip non-ASCII characters from the 'transcript' and 'wiki_desc' columns.

    Each value is NFKD-normalised first, so accented letters decompose into a
    base letter plus combining marks; the ASCII encode with errors='ignore'
    then drops whatever cannot be represented. Columns that are absent are
    skipped. df is modified in place and also returned.
    """

    def remove_non_ascii(text):
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    present = [name for name in ('transcript', 'wiki_desc') if name in df.columns]
    for name in present:
        df[name] = df[name].apply(remove_non_ascii)
        print(f"Processed column: {name}")  # debug

    # re-run the non-ASCII report so the effect of the cleaning is visible
    ascii_check(df)
    return df

#df = clean_df(df)

In [16]:
df.info() ## all objects to to string 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   episode_name   19 non-null     object
 1   english_name   19 non-null     object
 2   latin_name     19 non-null     object
 3   transcript     19 non-null     object
 4   wiki_desc      19 non-null     object
 5   cc_transcript  19 non-null     int64 
 6   wc_transcript  19 non-null     int64 
 7   cc_wiki        19 non-null     int64 
 8   wc_wiki        19 non-null     int64 
dtypes: int64(4), object(5)
memory usage: 1.5+ KB


In [17]:
# cast to string
def enforce_string_dtype(df, text_cols=('episode_name', 'english_name','latin_name', 'transcript', 'wiki_desc')):
    """
    Cast the listed text columns to the pandas "string" dtype.

    Columns named in text_cols that df does not have are skipped silently.
    The cast is written back onto df itself, and the same object is returned.
    One confirmation line is printed per converted column.
    """
    present = [name for name in text_cols if name in df.columns]
    for name in present:
        df[name] = df[name].astype("string")
        print(f"Casted '{name}' to string dtype.")
    return df

# usage
df = enforce_string_dtype(df)

Casted 'episode_name' to string dtype.
Casted 'english_name' to string dtype.
Casted 'latin_name' to string dtype.
Casted 'transcript' to string dtype.
Casted 'wiki_desc' to string dtype.


In [18]:
# punctuation for regex
# Characters stripped by embedprep_data: all of string.punctuation except '-',
# so double-barrelled (hyphenated) words survive for the semantic step.
# May have to be removed later.
punc = string.punctuation.replace("-", "")


# function for cleaning
def regex_clean(txt, pattern_str):
    """Replace every match of pattern_str in txt with a space, then collapse
    all whitespace runs to single spaces and trim both ends."""
    return " ".join(re.sub(pattern_str, " ", txt).split())

# function to prep text data for embedding
def embedprep_data(text):
    """ Cleans up text!

    Lower-cases the text, removes standalone integers, decimal numbers and
    percentages, removes '...' ellipses, and finally deletes every character
    listed in `punc` (all punctuation except '-').

    Parameters
    ----------
    text : string
        A text string that you want to parse and remove regex pattern matches 

    Returns
    -------
    Cleaned up string, ready for semantic filtering
    """

    text = text.lower()                                 # all text to lowercase

    # Standalone integers. Lookarounds are used instead of consuming the
    # surrounding whitespace: with r'\s\d+\s' the space after one number was
    # eaten by the match, so the next number in a run ("1 2 3") no longer had
    # a leading space and was skipped, and numbers at the very start or end
    # of the text were never matched.
    text = regex_clean(text, r'(?<!\S)\d+(?!\S)')
    # Decimal numbers (also covers amounts such as 3.50; any currency symbol
    # is dropped with the punctuation below).
    text = regex_clean(text, r'\s?(\d*\.\d+)\s?')
    text = regex_clean(text, r'\s?(\d+(?:\.\d+)?)\s?%') # handle percentages
    text = regex_clean(text, r'\.\.\.')                 # handles the ellipses

    # remove all punctuation, except '-'; the table is built at call time so
    # a later change to `punc` is picked up
    return text.translate(str.maketrans("", "", punc))


def prep_embed(df, text_cols=('episode_name', 'english_name', 'latin_name', 'transcript', 'wiki_desc')):
    """
    Return a copy of df whose text columns have been run through embedprep_data.

    The input frame is left untouched, so the raw text stays available after
    the call (previously the columns were overwritten on the caller's frame
    and the returned object was that same frame).

    Parameters
    ----------
    df : DataFrame containing every column named in text_cols.
    text_cols : column names to clean; defaults to the five text columns.

    Raises
    ------
    KeyError if one of text_cols is missing from df.
    """
    cleaned = df.copy()
    for col in text_cols:
        cleaned[col] = cleaned[col].apply(embedprep_data)
    return cleaned

In [19]:
# lightly cleans, prepares for embedded semantic filtering

df_embed = prep_embed(df)

In [21]:
#df_embed.head()

## Step 3: Embedded semantic filtering -> keep only relevant chunks

### embedding based semantic filtering
* instantiate embedder from sentence-transformer
1) have a lightly cleaned df, hyphens preserved
2) handle variants for english_name and latin_name
3) use variants of names to create a semantic pool, also w prompts for 'concept' of species ** go back and add to concept of fish!
4) chunk the text to maintain context when 'reading'
5) cosine similarity of each chunk to the embedded queries/prompts; also uses a keyword_gate so important chunks aren't missed just because they don't meet the threshold -> keeps only important chunks !
6) stitch the chunks together to be more coherent
7) 8) get em working !!!!! done it to test df, apply to the rest !

In [29]:
test_df = df_embed.copy()

In [24]:
# embed filtering
# using sentence-transformer model !


# instatiate
class Embedder:
    """Callable wrapper around a sentence-transformers model.

    Instantiate once, then call the instance with a list of strings to get
    their embeddings.
    """

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        # imported here rather than in the import cell, so the package is only
        # required when an Embedder is actually created
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)

    def __call__(self, texts):
        """Encode texts; requests numpy output with L2-normalised embeddings
        (presumably one row per input text)."""
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

embed = Embedder()  # instantiate once

In [26]:
# step 1: df_embed = ready and lightly cleaned ! 


################################################



# step 2: english_name variants ------ make sure to do to latin name for wiki_desc
def expand_name_variants(english_name: str, latin_name: str) -> set:
    """
    Build the set of spellings under which a species may appear in text.

    English name (lower-cased): as given, fully hyphenated, hyphenated at the
    first space only, and with internal whitespace collapsed.
    Latin name (lower-cased): as given, hyphenated, and — for two-word names —
    the abbreviated-genus forms "c. gariepinus" and "c.gariepinus".
    Empty or non-string names contribute nothing.

    NOTE(review): a later cell defines another function with this same name
    (it additionally returns the individual words); once that cell has run,
    it replaces this definition for every caller.
    """
    names = set()

    if english_name and isinstance(english_name, str):
        common = english_name.strip().lower()          # red bellied piranha
        names.update((
            common,
            common.replace(" ", "-"),                  # red-bellied-piranha
            common.replace(" ", "-", 1),               # red-bellied piranha
            " ".join(common.split()),                  # whitespace collapsed
        ))

    if latin_name and isinstance(latin_name, str):
        binomial = latin_name.strip()
        names.add(binomial.lower())                    # clarias gariepinus
        names.add(binomial.replace(" ", "-").lower())  # clarias-gariepinus

        tokens = binomial.split()
        if len(tokens) == 2:
            initial, epithet = tokens[0][0], tokens[1]
            names.add(f"{initial}. {epithet}".lower())  # c. gariepinus
            names.add(f"{initial}.{epithet}".lower())   # c.gariepinus

    return names




################################################


# step 3 : 
def build_queries(english_name: str, latin_name: str = "") -> list[str]:
    """
    Build the short query strings that chunks are compared against.

    The pool is the union of the name variants from expand_name_variants and
    a handful of "concept" prompts (cleaned with embedprep_data). Queries
    longer than six words are dropped to keep the signal sharp.

    Parameters
    ----------
    english_name, latin_name : species names; None is treated as "".

    Returns
    -------
    Sorted list of unique, non-empty query strings.
    """
    # Normalise once, up front: previously only the variant pool got the
    # `or ""` treatment, so a None name ended up as the literal word "None"
    # inside the prompts below.
    english_name = english_name or ""
    latin_name = latin_name or ""

    pool = set(expand_name_variants(english_name, latin_name))

    # add short “concept” prompts to anchor meaning
    prompts = [
        # core
        f"{english_name} {latin_name}".strip(),
        f"{english_name} species".strip() if english_name else "",
        f"{latin_name} fish".strip() if latin_name else "",
        'fish',

        # include
        'jaws', 'razor-sharp', 'muscle', "markings", "predator",
        'use its',
    ]
    # clean the prompts the same way the corpus text was cleaned
    pool |= {embedprep_data(p) for p in prompts if p}

    # keep very short queries for sharper signal
    return sorted({q for q in pool if q and len(q.split()) <= 6})




################################################



# step 4: 
def chunk_text(text: str, words_per_chunk=100, overlap=20): # chunks , w overlap to maintain context
    """
    Cut text into overlapping windows of words.

    The text is first normalised with embedprep_data (None/empty becomes "").
    Returns a list of (chunk_text, start_idx, end_idx) tuples where the
    indices are word positions and end_idx is exclusive; an empty list is
    returned when there are no words.
    """
    tokens = embedprep_data(text or "").split()
    total = len(tokens)
    if total == 0:
        return []

    # never advance by less than one word, otherwise the walk would not terminate
    stride = max(1, words_per_chunk - overlap)

    pieces = []
    begin = 0
    while begin < total:
        stop = min(total, begin + words_per_chunk)
        pieces.append((" ".join(tokens[begin:stop]), begin, stop))
        if stop == total:
            # the window already reaches the last word; later windows would
            # only repeat the tail
            break
        begin += stride
    return pieces




################################################





# step5 : score chunks vs queries (cosine on normalised vectors) 

# Default keyword list: a chunk containing any of these would be kept.
# NOTE(review): this module-level list is not referenced by the code visible in
# this notebook — select_relevant_chunks has its own `keyword_gate` parameter
# and filter_3 passes the query set instead. Confirm whether it is still needed.
keyword_gate = ['fish', 'teeth', 'fins', 'scales', 'body'] # keep chunk that has any 


# chunk_emb = vector embedding, query embs = vectors for queries/prompts 
# function calc cosine similarity between chunks and query
def cosine_max_score(chunk_emb: np.ndarray, query_embs: np.ndarray) -> float:
    """Best similarity between one chunk vector and a matrix of query vectors.

    Computes the dot product of every query row with the chunk and returns
    the largest value as a plain float. With L2-normalised inputs the dot
    product is the cosine similarity.
    """
    similarities = np.matmul(query_embs, chunk_emb)
    return float(similarities.max())



# func for selecting the relevant chunks !!. threshold can be changed !, 
# should be changed, esp if we have such a long list of prompts
def select_relevant_chunks(chunks,        #text chnks
                           query_embs,    # embeddings of q/prompts
                           embed_fn,      # text to embeddings
                           threshold=0.5, # mess about w this one
                           top_k=None,    # keep stop scoring chunks, may not need bc very lil data
                           keyword_gate: set | None = None): # if appear, keep regardless fo score
    """
    - Embed all chunks once
    - Keep chunk if score >= threshold
    - Optionally keep top_k chunks regardless of threshold
    - Optional cheap keyword gate: if any keyword appears in the chunk text, keep it
    """
    if not chunks:
        return [] # handle in case empty
    texts = [c[0] for c in chunks]
    chunk_embs = embed_fn(texts)  # embed all chunks using func
    
    selected = []
    
    for i, (text, s, e) in enumerate(chunks): #loop thru chunks, calc max cosine sim
        score = cosine_max_score(chunk_embs[i], query_embs) # using cosine_max_score
        keep = score >= threshold     # keep chunk ?
        if not keep and keyword_gate:
            # lowercase containment check (text already normalized)
            if any(k in text for k in keyword_gate): # check keyword_gate = keep regardless of threshold value
                keep = True 
        if keep:
            selected.append((text, s, e, score)) ### chunk, start, end, similarity score

    return selected




################################################



# step 6: merge adjacent selected chunks into coherent sections
def merge_adjacent_chunks(selected, gap_words=20):  #### can change gapwords
    """
    Stitch selected chunks into sections.

    Consecutive chunks whose start lies within gap_words of the running
    section end are joined into one section; otherwise a new section starts.
    Words that two overlapping chunks share are emitted only once
    (previously the overlapping words appeared twice in the section).

    Parameters
    ----------
    selected : list of (text, start, end, score); start/end are word indices
        with end exclusive, in ascending order.
    gap_words : largest start-to-previous-end distance still treated as adjacent.

    Returns
    -------
    Sections joined by a blank line; "" when selected is empty.
    """
    if not selected:
        return "" # if empty

    sections = []
    cur_text, _, cur_e, _ = selected[0]  # open the first section
    for text, s, e, _score in selected[1:]:
        if s - cur_e <= gap_words:  # treat as adjacent, extend the section
            shared = cur_e - s  # > 0 only when the chunks overlap
            if shared > 0:
                text = " ".join(text.split()[shared:])
            if text:
                cur_text = cur_text + " " + text
            cur_e = max(cur_e, e)
        else:
            sections.append(cur_text)  # close the section, start a new one
            cur_text, cur_e = text, e
    sections.append(cur_text)  # save the final section
    return "\n\n".join(sections)





################################################




## step 7) filter the text ,, add dynamic settings to handle range of text length, third filter text, trial
def filter_3(
    text, english_name, latin_name, embed
):
    """
    Filter text using embeddings with dynamic chunk size, overlap, and threshold.

    The text is chunked, each chunk is scored against the species queries,
    and the kept chunks are joined back into one string.

    Parameters
    ----------
    text : document to filter; non-string or blank input yields "".
    english_name, latin_name : species names used to build the queries.
    embed : callable mapping a list of strings to embedding vectors.

    Returns
    -------
    The kept chunks joined by a row of asterisks, or "" when nothing is kept.
    """
    # nothing to chunk: skip the (comparatively expensive) embedding calls
    if not isinstance(text, str) or not text.strip():
        return ""

    # dynamic settings based on word_count
    word_count = len(text.split())
    if word_count < 300: # for shorter text eg some of the wikis are just 200 ish
        chunk_size, overlap, threshold = 40, 10, 0.48
    elif word_count < 2000: ## mid size
        chunk_size, overlap, threshold = 80, 15, 0.52
    else: # for longer text
        chunk_size, overlap, threshold = 150, 30, 0.57

    # queries and embeddings
    queries = build_queries(english_name, latin_name)
    if not queries:
        return ""
    query_embs = embed(queries)
    # every query string doubles as a keyword: a chunk containing one is kept
    # regardless of its similarity score
    gate = set(queries)

    # chunk text using func, break text up w some overlap for context
    chunks = chunk_text(text, words_per_chunk=chunk_size, overlap=overlap)
    if not chunks:
        return ""

    # relevant chunks, calcs cosine similarity or uses keywords
    selected = select_relevant_chunks(
        chunks,
        query_embs,
        embed_fn=embed,
        threshold=threshold,
        keyword_gate=gate
    )
    if not selected:
        return "" ## only 'relevant' chunks remain

    # Merge without duplicating the words consecutive chunks share.
    # Chunk `end` indices are exclusive, so the shared span is exactly
    # last_end - start words. (The earlier `+ 1` removed one extra, unshared
    # word at every junction, and `<=` trimmed a word even when the chunks
    # merely touched.)
    merged_texts = []
    last_end = 0
    for text_chunk, start, end, _ in selected:
        if start < last_end:
            text_chunk = " ".join(text_chunk.split()[last_end - start:])
        if text_chunk:
            merged_texts.append(text_chunk)
        last_end = max(last_end, end)

    # NOTE(review): the asterisk separator ends up inside the returned text;
    # confirm downstream steps expect it.
    merged = "*****************".join(merged_texts)

    # Remove repeated sentences, keeping first occurrences.
    # NOTE(review): chunks come out of embedprep_data, which strips '.', so
    # ". " presumably never occurs here and this pass changes nothing.
    alr = set()
    final_text = []
    for sentence in merged.split(". "): # splits into sentences
        if sentence not in alr:
            alr.add(sentence) # no duplicate texts
            final_text.append(sentence)
    return ". ".join(final_text) # clean text final




# add new counts
def add_counts(df, transcript_col="transcript_embed", wiki_col="wiki_desc_embed"):
    """
    Add character- and word-count columns for the filtered texts.

    Writes 'cc_transcript_embed' / 'wc_transcript_embed' from transcript_col
    and 'cc_wiki_desc_embed' / 'wc_wiki_desc_embed' from wiki_col onto df
    itself (cc = len of the string, wc = number of whitespace-split words)
    and returns that same frame.
    """
    plan = (
        ("cc_transcript_embed", "wc_transcript_embed", transcript_col),
        ("cc_wiki_desc_embed", "wc_wiki_desc_embed", wiki_col),
    )
    for char_name, word_name, source in plan:
        df[char_name] = df[source].apply(len)
        df[word_name] = df[source].apply(lambda value: len(value.split()))

    return df





In [28]:
## wrapper func !! 

def filter_df_dynamic(
    df: pd.DataFrame,
    english_col="english_name",
    latin_col="latin_name",
    transcript_col="transcript",
    wiki_col="wiki_desc"
) -> pd.DataFrame:
    """
    Run filter_3 over every row's transcript and wiki text.

    A fresh Embedder is created for the call and shared by all rows. The
    input frame is copied; the copy gains 'transcript_embed' and
    'wiki_desc_embed' holding the filtered texts and is returned.
    A missing transcript/wiki column is treated as empty text per row.
    """
    embed = Embedder()  # one model instance for the whole frame
    df = df.copy()

    filtered = {"transcript_embed": [], "wiki_desc_embed": []}
    sources = (("transcript_embed", transcript_col), ("wiki_desc_embed", wiki_col))

    for _, row in df.iterrows():
        english = row[english_col]
        latin = row[latin_col]
        for target, source_col in sources:
            raw_text = row.get(source_col, "")
            filtered[target].append(filter_3(raw_text, english, latin, embed=embed))

    df["transcript_embed"] = filtered["transcript_embed"]
    df["wiki_desc_embed"] = filtered["wiki_desc_embed"]
    return df

#test_new = filter_df_dynamic(test_df)
#test_new = add_counts(test_new)

## Step 4: more cleaning !
* now we have the relevant chunks, let's extract the descriptors !

### POS and word extraction
* switched to this approach after standalone adjectives were not powerful enough 


**using spacy POS tagging but this time expanding extraction to include**
* Nouns, nouns+adj, chunks around nouns,
* specified anatomic words that we wanted to keep eg fins, teeth
* excluded name variants ! no leakage !
* reduced noise further by having an extensive list of unwanted words
* boosted the weightings of words specific to fish, and multiple word phrases that included them = more informative


**top descriptors have been extracted! ready for imagen/vertex**


In [30]:
# using lemmatiser bc stemmers produce nonsense words not useful for prompts

# NOTE(review): neither `lem` nor `stpwrd` is referenced by the later code
# visible in this notebook — confirm they are still needed.
lem = WordNetLemmatizer() # lemmatiser
stpwrd = nltk.corpus.stopwords.words('english')  ## from nltk, common english stopwords (a list)
stpwrd.extend(string.punctuation)    # string.punctuation is a str from the stdlib `string` module, so extend() appends each punctuation character as its own entry

In [None]:
# load nlp , df

# spaCy pipeline used as the default model by the descriptor functions below
nlp = spacy.load("en_core_web_sm")

# NOTE(review): `test_new` is only assigned in the commented-out lines at the
# end of the filtering cell (test_new = filter_df_dynamic(test_df)), so on a
# fresh kernel this line presumably raises NameError until those are re-enabled.
df = test_new.copy()

In [35]:
# 


# --- helpers ---
## built from words that stood out as uninformative
# Hand-curated words and phrases judged uninformative as descriptors.
# It is a set literal, so the repeated entries collapse; the list also contains
# one empty-string entry. Some spellings (e.g. 'aiiigator', 'fishs') look like
# artefacts of the source text — presumably kept on purpose to match it.
STOP_DESCRIPTORS = {
    "species", "animal", "organism", "creature",
    "head", "water", "sea", "ocean", "river", "deep",
    "african", "european", "asian", "southern", "northern",
    "jungle", 'little','strain','a','treble','crocodile','girl',
    'father','fish','ft','ufo','agh','my','wife','biologist','child',
    'fishery','frogs','dog','amazon', 'environmental','earth','aiiigator',
    'fisherman','scientists','pets','','carp','daddy','dad','snail','fishing',
    'your','village','his','jeremy','wade','lake','alaskan', 'hippopotamus','river',
    'a', 'this', 'the', 'that', 'some unfinished business', 'an','abandoned','could','its','somebody',
    'final','leg','weight','certainly','information','their','fishing','nobody','certain','sockeye','industrial',
    'detailed','continental','next','oldest','such','hoo','fingers','upper','lower','catfish','snakehead',
    'looking','like','human','anal','culling', 'anal spines', 'anybody','adjoining','bull','and','then','another',
    'meters','yards','word', 'reabsorption','overharvesting','lica','vegrandis','b','issue','data','reproduction',
    'assumption','directions','air', 'k','reference','setting','any','signs','academy','award',
    'actual','stocks','schooling','fisheries','days','conservation','air','aquaculture','aquarium',
    'cocktail','fishs','these','teeth word','these','those','transformation','transform','details',
    'quarters','retail','fortunes','lakes','economy','cargo','belgian','artificial',
    'central illinois','fillets','cited examples','commercial importance', 
    'adult nile','adult nile perch','adult perch', 'anatomical', 'assistant', 'curator', 'bus',
    'hoo', 'tech', 'bear', 'alligator', 'journalist', 'engine','physiology','few','aerated','birds mammals','flatworm dermopristis',
    'brazil morphometrics','action malcolm douglas','bare hands', 
    'basket francisco','boys','brazils rugged frontier','brazilian rainforest','catching manner',
    'anything fernando','bamboo rod'
}
# normalise every entry (trim + lower-case) so membership tests can use cleaned text
STOP_DESCRIPTORS = {w.strip().lower() for w in STOP_DESCRIPTORS}


## words that need to be included
FISH_FEATURE_KEYWORDS = {
    "teeth", "tooth", "mouth", "jaw", "jaws",
    "colour", "color", "hue", "colouration"
    "pattern", "marking","markings", "stripe","striped", "spot", "band",
    "body", "shape", "form","shaped","-like", 'caudal',
    "fin", "spine", "spines", "barbels", "barbel", "whisker","whiskers", "scale","scales", 
    "plate", "head", 'flat', "snout", "tail", 'torpedo-shaped', 'missile', 'elongated', 'flank',
    'spiny', 'greyish', 'silver' 
}


# Temperament words.
# NOTE(review): not referenced by the code visible in this notebook.
PERSONALITY_TERMS = {
    "elusive", "mysterious", "fearsome", "aggressive", "shy",
    "timid", "curious", "bold", "hostile", "gentle", "peaceful"
}


# words that need to be included, grouped by theme; the grouping is only for
# readability — the groups are flattened into LEXICON_TERMS right below
FISH_LEXICON = {
    "anatomy": ["spines", "barbels","whiskers", 'flat', 'round',
    "plate", "armor", "snout", "tail", 'torpedo-shaped', 'missile', 'elongated',
        'flat', 'villiform teeth'
        ],
    "ecology": [
        "ambush", "bottom-dweller", "nocturnal", "schooling",
        "migratory", "territorial", "camouflage"
    ]
}
# flat, lower-cased set of every lexicon term (the repeated 'flat' collapses)
LEXICON_TERMS = {term.lower() for group in FISH_LEXICON.values() for term in group}

# NOTE(review): not referenced by the code visible in this notebook.
GENERIC_SINGLE_WORDS = {"fish", "adult", "large", "big", "small",'predator','predators','cut'}


# examples of physical terms, grouped by the kind of trait they describe
PHYSICAL_KEYWORDS = {
    "colour": {"grey", "silver", "olive", "golden", "blue", "dark", "light", "brown", "yellow", "green", "black", "white", "reddish"},
    "pattern": {"striped", "spotted", "mottled", "banded", "dappled", "speckled", "marbled"},
    "mouth_teeth": {'sharp', 'needle-like', 'villiform','fangs',},
    "scales": {"scales", "scaly", "scaleless", "armour", "armor"},
    "features": {"spines", "barbels", "whiskers", "snout", "plate", "tail", "fin", "fins", "dorsal", "caudal", "pectoral"},
    "size": {"large", "giant", "small", "elongated", "torpedo", "flat", "round", "slender", "massive", "huge",'long'},
}
# union of all the groups above, for simple membership tests
PHYSICAL_TERMS = set().union(*PHYSICAL_KEYWORDS.values())




def expand_name_variants(english_name: str, latin_name: str) -> set:
    """
    Collect every spelling of a species name that should be blocked.

    For each name (lower-cased and trimmed): the name itself, the hyphenated
    form, the form with spaces removed, and every individual word. Two-word
    Latin names also get the abbreviated-genus forms ("c. gariepinus",
    "c.gariepinus"). Empty or non-string names contribute nothing, and empty
    results are discarded.

    NOTE(review): an earlier cell defines a function with this same name
    that returns a narrower set (no individual words); running this cell
    replaces that definition for all callers, including the query builder.
    """

    def spellings(name):
        parts = name.split()
        return {name, name.replace(" ", "-"), name.replace(" ", ""), *parts}, parts

    collected = set()

    # English name variants
    if english_name and isinstance(english_name, str):
        forms, _ = spellings(english_name.strip().lower())
        collected.update(forms)

    # Latin name variants
    if latin_name and isinstance(latin_name, str):
        forms, parts = spellings(latin_name.strip().lower())
        collected.update(forms)

        if len(parts) == 2:
            initial, epithet = parts[0][0], parts[1]
            collected.add(f"{initial}. {epithet}")
            collected.add(f"{initial}.{epithet}")

    # remove any accidental empties
    return {entry.strip().lower() for entry in collected if entry.strip()}






def maybe_add(phrase, descriptors, blocked, force=False):
    """
    Add phrase to descriptors if it survives cleaning and is not blocked.

    The phrase is cleaned with clean_descriptor; it is then dropped when it
    equals a blocked entry, contains a blocked entry as one of its words,
    contains a multi-word blocked entry as a substring, or matches a blocked
    entry on word boundaries. Single-word phrases are only added when force
    is True. descriptors (a set) is updated in place; nothing is returned.
    """
    candidate = clean_descriptor(phrase)
    if not candidate:
        return

    parts = candidate.split()

    def is_blocked():
        if candidate in blocked:
            return True
        if any(part in blocked for part in parts):
            return True
        if any(" " in variant and variant in candidate for variant in blocked):
            return True
        return any(
            re.search(rf"\b{re.escape(variant)}\b", candidate)
            for variant in blocked
        )

    if is_blocked():
        return

    # single words only get through when explicitly forced (lexicon keep)
    if len(parts) > 1 or force:
        descriptors.add(candidate)


# --- cleaning -- max 3 words, and 35 chars
def clean_descriptor(phrase: str, max_words=3, max_chars=35, strict=True) -> str | None:
    phrase = phrase.strip().lower()
    words = phrase.split()

    # too long
    if len(words) > max_words:
        return None
    if len(phrase) > max_chars:
        return None

    # reject if contains non-letters (except spaces/underscores/hyphens)
    if re.search(r"[^a-z _-]", phrase):
        return None

    # --- stopword logic ---
    if strict:
        # nuke phrase if ANY word is a stopword
        if any(w in STOP_DESCRIPTORS for w in words):
            return None
    else:
        # softer: only reject if whole phrase or first word is stopword
        if phrase in STOP_DESCRIPTORS or words[0] in STOP_DESCRIPTORS:
            return None

    return phrase


def extract_descriptors(text: str, english_name=None, latin_name=None, nlp_model=None):
    """
    Pull candidate descriptor phrases out of text with spaCy.

    Collected patterns: adjective+noun and compound-noun pairs,
    "X-looking Y" / "X-like Y" constructions, multi-word noun chunks, and any
    LEXICON_TERMS entry that occurs in the text (single words allowed).
    Every candidate goes through maybe_add, which cleans it and drops
    anything matching the species-name variants.

    Parameters
    ----------
    text : document to analyse.
    english_name, latin_name : names whose variants must not leak into the
        descriptors; None skips that name.
    nlp_model : spaCy pipeline; defaults to the global `nlp`.

    Returns
    -------
    Sorted list of unique descriptor strings.
    """
    if nlp_model is None:
        nlp_model = nlp

    blocked = expand_name_variants(english_name, latin_name)
    doc = nlp_model(text)
    descriptors = set()

    for token in doc:
        tok_lower = token.text.lower()
        head_lower = token.head.text.lower()

        # adj+noun pairs and compound nouns (a token carries one dependency
        # label, so the two cases share one branch)
        if token.dep_ in ("amod", "compound") and token.head.pos_ == "NOUN":
            maybe_add(f"{tok_lower} {head_lower}", descriptors, blocked)

        # "X-looking Y" / "X-like Y"
        if tok_lower in {"looking", "like"} and token.i > 0:
            prev_i = token.i - 1
            # When the tokenizer splits a hyphenated word ("snake-like" ->
            # "snake", "-", "like") the previous token is the hyphen itself;
            # step over it so X is the real word and not "-" (which used to
            # yield phrases such as "--like body").
            if doc[prev_i].text == "-" and prev_i > 0:
                prev_i -= 1
            prev = doc[prev_i].text.lower()

            nxt = token.i + 1
            if nxt < len(doc) and doc[nxt].pos_ == "NOUN":
                head = doc[nxt].text.lower()
            else:
                head = head_lower
            maybe_add(f"{prev}-{tok_lower} {head}", descriptors, blocked)

    # noun chunks (multi-word only)
    for chunk in doc.noun_chunks:
        phrase = chunk.text.lower().strip()
        if len(phrase.split()) > 1:
            maybe_add(phrase, descriptors, blocked)

    # lexicon force-keep (single words allowed); plain substring test,
    # lower-casing the text once instead of once per term
    lowered_text = text.lower()
    for kw in LEXICON_TERMS:
        if kw in lowered_text:
            maybe_add(kw, descriptors, blocked, force=True)

    return sorted(descriptors)



### tfidf scoring, boosting physical, and fish specific terms
## keep species specifc terms
def score_descriptors(descriptor_lists, top_n=30, nlp_model=None):
    """
    Rank each row's descriptors by TF-IDF with lexicon boosting.

    Every list in descriptor_lists is treated as one document (one descriptor
    = one token, spaces replaced by "_"). The TF-IDF score of each descriptor
    is multiplied by a boost: 6.0 for PHYSICAL_TERMS entries, 2.0 for exact
    LEXICON_TERMS entries, 3.0 for phrases containing a lexicon term, 0.5
    otherwise.

    Parameters
    ----------
    descriptor_lists : list of lists of descriptor strings, one list per row.
    top_n : maximum number of descriptors returned per row.
    nlp_model : spaCy pipeline handed to normalize_descriptor; defaults to
        the global `nlp`.

    Returns
    -------
    One list per input row with at most top_n descriptors, best first. Only
    descriptors that actually occur in that row are returned.
    """
    if nlp_model is None:
        nlp_model = nlp

    # normalize all descriptors, remembering the first original spelling
    # NOTE(review): normalize_descriptor is not defined in the part of the
    # notebook visible here — confirm it is defined before this is called.
    normalized_lists = []
    mapping = {}
    for lst in descriptor_lists:
        normed = []
        for d in lst:
            norm = normalize_descriptor(d, nlp_model)
            normed.append(norm)
            if norm not in mapping:
                mapping[norm] = d
        normalized_lists.append(normed)

    docs = [" ".join([d.replace(" ", "_") for d in lst]) for lst in normalized_lists]

    # with no descriptors at all the vectorizer has no vocabulary to fit
    if not any(d.strip() for d in docs):
        return [[] for _ in docs]

    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", min_df=1)
    tfidf_matrix = vectorizer.fit_transform(docs)
    feature_names = vectorizer.get_feature_names_out()

    results = []
    for doc_idx in range(len(docs)):
        row_scores = tfidf_matrix[doc_idx].toarray().flatten()

        boosted = {}
        for desc, score in zip(feature_names, row_scores):
            # A zero score means the descriptor belongs to another row's
            # document. Such entries used to stay in the ranking and padded
            # short rows with other species' descriptors.
            if score <= 0:
                continue

            desc_clean = desc.replace("_", " ")
            representative = mapping.get(desc_clean, desc_clean)

            # --- scoring logic ---
            if representative in PHYSICAL_TERMS:
                boost = 6.0   #  strong bias toward physical traits
            elif representative in LEXICON_TERMS:
                boost = 2.0
            elif any(lex in representative for lex in LEXICON_TERMS):
                boost = 3.0
            else:
                boost = 0.5   # downweight generic/behavioral stuff

            boosted[representative] = score * boost

        top = sorted(boosted.items(), key=lambda x: x[1], reverse=True)[:top_n]
        results.append([desc for desc, _ in top])

    return results



## top 25 descriptors , new column

def add_descriptor_columns(df: pd.DataFrame, top_n=25, nlp_model=None) -> pd.DataFrame:
    """
    Process transcript + wiki columns:
    - Extract descriptors from 'transcript_embed' (English name blocked)
      and 'wiki_desc_embed' (Latin name blocked)
    - Score with TF-IDF + lexicon boosting
    - Return top N per row

    Parameters
    ----------
    df : frame with 'english_name', 'latin_name', 'transcript_embed' and
        'wiki_desc_embed'; missing cells are treated as empty.
    top_n : number of descriptors kept per row and source.
    nlp_model : spaCy pipeline used for both extraction and scoring;
        defaults to the global `nlp`.

    Returns
    -------
    A copy of df with list-valued columns 'transcript_top' and 'wiki_top'.
    """

    if nlp_model is None:
        nlp_model = nlp  # assumes you loaded spaCy globally

    df = df.copy()

    # 1) Extract raw descriptors (with blocking)
    transcript_descs = []
    wiki_descs = []

    for _, row in df.iterrows():
        eng = row.get("english_name", "")
        lat = row.get("latin_name", "")

        t_desc = extract_descriptors(
            str(row.get("transcript_embed", "")),
            english_name=eng,     # block English name
            latin_name=None,
            nlp_model=nlp_model,
        )
        w_desc = extract_descriptors(
            str(row.get("wiki_desc_embed", "")),
            english_name=None,
            latin_name=lat,       # block Latin name
            nlp_model=nlp_model,
        )

        transcript_descs.append(t_desc)
        wiki_descs.append(w_desc)

    # 2) Score with TF-IDF + lexicon boost. The model is forwarded so a
    #    caller-supplied pipeline is used for scoring too (it used to be
    #    dropped here, silently falling back to the global one).
    transcript_top = score_descriptors(transcript_descs, top_n=top_n, nlp_model=nlp_model)
    wiki_top = score_descriptors(wiki_descs, top_n=top_n, nlp_model=nlp_model)

    # 3) Add back to DataFrame
    df["transcript_top"] = transcript_top
    df["wiki_top"] = wiki_top

    return df

#df1 = df.copy()
#df1 = add_descriptor_columns(df1, top_n=25)

In [None]:
#df1.to_csv('extracted_descriptors3.csv', index=False)