<a href="https://colab.research.google.com/github/gl0bsec/Documents/blob/main/community_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dependencies

In [None]:
!pip -q install semchunk
!pip -q install pacmap
!pip -q install hdbscan
!pip -q install pyMuPDF
!pip install -q bertopic
!pip -q install -U  'spacy[cuda12x]'
!python -m spacy download en_core_web_trf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m272.8/272.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.6/150.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Load program (NER and parsing)

In [None]:
import re
import json
import spacy
import pandas as pd

def parse_message(message):
    """
    Parse a Discord message dict into a flat record.

    Args:
        message: A dict for one exported message. The keys read are
            'content', 'embeds', 'id' and 'timestamp'; all are optional.

    Returns:
        A dict with:
          - 'id': the message ID, or None if absent
          - 'timestamp': the message timestamp, or None if absent
          - 'content': the message text with URLs removed and outer
            whitespace stripped
          - 'urls': list of URLs found in the content
          - 'embeds': list of dicts with 'url', 'title', 'description',
            'thumbnail_url' and 'author_name' for each embed
    """
    url_pattern = r'https?://\S+'

    # A null 'content' is treated the same as an empty one.
    content = message.get('content', '') or ''
    urls = re.findall(url_pattern, content)
    content_without_urls = re.sub(url_pattern, '', content).strip()

    embeds_metadata = []
    # 'embeds' may be missing or explicitly null; `.get('embeds', [])` would
    # return None in the latter case and break the loop.
    for embed in message.get('embeds') or []:
        # Nested objects can be absent or null, so fall back to an empty dict.
        thumbnail = embed.get('thumbnail') or {}
        author = embed.get('author') or {}
        embeds_metadata.append({
            'url': embed.get('url'),
            'title': embed.get('title'),
            'description': embed.get('description'),
            'thumbnail_url': thumbnail.get('url'),
            'author_name': author.get('name'),
        })

    return {
        'id': message.get('id'),
        'timestamp': message.get('timestamp'),
        'content': content_without_urls,
        'urls': urls,
        'embeds': embeds_metadata,
    }



def add_ner_columns(df, field, model="en_core_web_trf", use_gpu=False):
    """
    Run spaCy named-entity recognition over one column of a DataFrame.

    Each value in `df[field]` is flattened to a single string first: strings
    are used as-is, dicts contribute their 'title', 'description' and 'text'
    values, other iterables are flattened recursively, and anything else is
    passed through `str()`.

    Args:
        df: Input DataFrame (not modified).
        field: Name of the column to analyse.
        model: Name of the spaCy pipeline to load.
        use_gpu: If True, call `spacy.require_gpu()` before loading the model.

    Returns:
        A copy of `df` with two extra columns:
          - `<field>_entities`: list of entity texts per row
          - `<field>_entity_types`: list of matching entity labels per row
    """
    if use_gpu and not spacy.require_gpu():
        raise RuntimeError("spaCy GPU support is not available.")
    # The model is loaded the same way whether or not the GPU was requested.
    nlp = spacy.load(model)

    def _as_text(value):
        # Collapse strings, dicts and nested iterables into one text blob.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, dict):
            found = [
                value[key]
                for key in ("title", "description", "text")
                if key in value and value[key]
            ]
            return " ".join(found)
        try:
            pieces = [_as_text(item) for item in value]
        except TypeError:
            # Not iterable (or an item could not be joined): stringify it.
            return str(value)
        return " ".join(piece for piece in pieces if piece)

    texts = [_as_text(cell) for cell in df[field]]

    entities = []
    labels = []
    for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]):
        entities.append([ent.text for ent in doc.ents])
        labels.append([ent.label_ for ent in doc.ents])

    out = df.copy()
    out[f"{field}_entities"] = entities
    out[f"{field}_entity_types"] = labels
    return out


def add_nounchunk_columns(df, field, model="en_core_web_sm", use_gpu=False):
    """
    Extract spaCy noun chunks from one column of a DataFrame.

    Each value in `df[field]` is flattened to a single string first: strings
    are used as-is, dicts contribute their 'title', 'description' and 'text'
    values, other iterables are flattened recursively, and anything else is
    passed through `str()`.

    Args:
        df: Input DataFrame (not modified).
        field: Name of the column to analyse.
        model: Name of the spaCy pipeline to load.
        use_gpu: If True, call `spacy.require_gpu()` before loading the model.

    Returns:
        A copy of `df` with an extra column `<field>_noun_chunks` holding the
        list of noun chunk strings for each row.
    """
    if use_gpu and not spacy.require_gpu():
        raise RuntimeError("spaCy GPU support is not available.")
    nlp = spacy.load(model)

    def extract_text(value):
        # Collapse strings, dicts and nested iterables into one text blob.
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, dict):
            parts = []
            for k in ("title", "description", "text"):
                if k in value and value[k]:
                    parts.append(value[k])
            return " ".join(parts)
        try:
            parts = [extract_text(item) for item in value]
            return " ".join(p for p in parts if p)
        except TypeError:
            return str(value)

    texts = [extract_text(val) for val in df[field]]

    # `doc.noun_chunks` is derived from the dependency parse and POS tags, so
    # the tagger and parser must stay enabled. Only the components that noun
    # chunking does not need are switched off.
    noun_chunks = [
        [chunk.text for chunk in doc.noun_chunks]
        for doc in nlp.pipe(texts, disable=["ner", "lemmatizer"])
    ]

    out = df.copy()
    out[f"{field}_noun_chunks"] = noun_chunks
    return out

# Example usage:
# df_nc = add_nounchunk_columns(df_parsed, 'content')
# df_nc = add_nounchunk_columns(df_nc, 'embeds', use_gpu=True)

# Define the function to combine 'embeds' and 'content' into a new column
def create_combined_content(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with an added 'combined_content' column.

    The new column holds the readable text of each row's 'embeds' (the
    'title' and 'description' of every embed dict) followed by the row's
    'content', separated by single spaces. Empty parts are skipped, so a row
    with no embeds yields just its content rather than a "[] " prefix.

    Embeds that are not a list/tuple/dict (for example a string read back
    from a CSV) are included via `str()`; None/NaN values become empty text.

    Args:
        df: DataFrame with 'embeds' and 'content' columns (not modified).

    Returns:
        A new DataFrame with the extra 'combined_content' column.
    """
    def _scalar_text(value) -> str:
        # Missing values contribute nothing instead of the text "None"/"nan".
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return ""
        return str(value)

    def _embed_text(value) -> str:
        if isinstance(value, dict):
            value = [value]
        if isinstance(value, (list, tuple)):
            parts = []
            for embed in value:
                if isinstance(embed, dict):
                    # Only the human-readable fields; URLs and nulls are noise.
                    parts.extend(
                        str(embed[key])
                        for key in ("title", "description")
                        if embed.get(key)
                    )
                else:
                    parts.append(_scalar_text(embed))
            return " ".join(part for part in parts if part)
        return _scalar_text(value)

    # Work on a copy so the caller's DataFrame is left untouched.
    out = df.copy()
    out['combined_content'] = [
        " ".join(part for part in (_embed_text(embeds), _scalar_text(content)) if part)
        for embeds, content in zip(out['embeds'], out['content'])
    ]
    return out



## Execute

In [None]:
# Load the exported channel JSON from a hard-coded path under /content
file_path = '/content/Project Owl: The OSINT Community - Africa - 🟡-west-africa-sahel [746879202133016576].json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Parse all messages; a missing 'messages' key yields an empty list
parsed_messages = [parse_message(msg) for msg in data.get('messages', [])]

# Build a DataFrame from the parsed records and write it to messages.csv
df_parsed = pd.DataFrame(parsed_messages)
df_parsed.to_csv('messages.csv')

In [None]:
# Demonstration: require a GPU for spaCy, add the combined_content column to
# the in-memory df_parsed (the CSV is not re-read here), and preview the first rows
spacy.require_gpu()
df_combined = create_combined_content(df_parsed)
df_combined.head()

Unnamed: 0,id,timestamp,content,urls,embeds,combined_content
0,746882535476428890,2020-08-23T02:04:32.864+02:00,,[],[],[]
1,746887776791822407,2020-08-23T02:25:22.491+02:00,Is this not just West Africa?,[],[],[] Is this not just West Africa?
2,746887855137357975,2020-08-23T02:25:41.17+02:00,At least according to the UN.,[],[],[] At least according to the UN.
3,746888317198401536,2020-08-23T02:27:31.334+02:00,Well it's also lumping in a few nations that a...,[],[],[] Well it's also lumping in a few nations tha...
4,746939925793996810,2020-08-23T05:52:35.782+02:00,So yes @Moto Send-It Man its the sahel region ...,[],[],[] So yes @Moto Send-It Man its the sahel regi...


In [None]:
# Run NER over the combined text with the GPU required, then save the
# annotated DataFrame to messages_processed.csv
spacy.require_gpu()
ner_processed = add_ner_columns(df_combined, 'combined_content', use_gpu=True)
ner_processed.to_csv('messages_processed.csv')
# df_gpu = add_ner_columns(df_gpu, 'embeds', use_gpu=True)

In [None]:
# Also export the NER-annotated DataFrame as JSON
ner_processed.to_json('messages_processed.json')

## Load program (Embedding and topic modelling)

### Experimental

In [None]:
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from umap import UMAP
from hdbscan import HDBSCAN
import nltk
from nltk.corpus import stopwords
import re

def filter_vectors_by_text_name(doc_topics_df, chunk_vectors_df):
    """
    Keep only the chunk-vector rows whose 'text_name' also occurs in
    `doc_topics_df['text_name']`.

    Args:
        doc_topics_df: pandas DataFrame with a 'text_name' column listing the
            texts to keep.
        chunk_vectors_df: pandas DataFrame with a 'text_name' column.

    Returns:
        pandas DataFrame: an independent copy of the matching rows of
        `chunk_vectors_df`, with their original index labels.
    """
    wanted_names = doc_topics_df['text_name'].unique()
    keep_mask = chunk_vectors_df['text_name'].isin(wanted_names)
    return chunk_vectors_df[keep_mask].copy()


def filter_topics_sequentially(chunk_df: pd.DataFrame,
                               doc_cluster_df: pd.DataFrame,
                               chunk_text_col: str = 'text_name',
                               chunk_topic_col: str = 'topic_name',
                               doc_text_col: str = 'text_name',
                               doc_cluster_col: str = 'Cluster') -> tuple[pd.DataFrame, dict]:
    """
    Drop low-coverage topics in three successive passes and attach clusters.

    A topic is removed by the first pass that matches it:
      1. it occurs in exactly one text;
      2. it occurs in fewer than 2% of all unique texts;
      3. its texts fall into exactly one document cluster (clusters with a
         null label are ignored when counting).

    Args:
        chunk_df: Chunk-level rows holding a text identifier and a topic.
        doc_cluster_df: Document-level rows holding a text identifier and a
            cluster label.
        chunk_text_col: Text identifier column in `chunk_df`.
        chunk_topic_col: Topic column in `chunk_df`.
        doc_text_col: Text identifier column in `doc_cluster_df`.
        doc_cluster_col: Cluster column in `doc_cluster_df`.

    Returns:
        A tuple `(df_final, stats)`. `df_final` holds the rows of `chunk_df`
        whose topic survived, left-merged with the cluster of each text.
        `stats` has the keys 'initial', 'removed1', 'removed2', 'removed3'
        and 'final_remaining' (all topic counts).
    """
    internal_cluster_col = '_Cluster_ID_Internal'

    # --- 1. Metrics and filter sets ---
    print("Step 1: Calculating metrics and defining filter sets...")
    texts_per_topic = chunk_df.groupby(chunk_topic_col)[chunk_text_col].nunique()
    n_unique_texts = chunk_df[chunk_text_col].nunique()

    # Nothing to filter: return an empty frame that still has a cluster column.
    if n_unique_texts == 0 or chunk_df.empty:
        print("Warning: Input chunk_df is empty or contains no unique texts.")
        empty_result = chunk_df.head(0).copy()
        if doc_cluster_col not in empty_result.columns:
            # Reuse the cluster dtype when the column exists, else plain object.
            if doc_cluster_col in doc_cluster_df.columns:
                cluster_dtype = doc_cluster_df[doc_cluster_col].dtype
            else:
                cluster_dtype = object
            empty_result[doc_cluster_col] = pd.Series(dtype=cluster_dtype)
        zero_stats = {'initial': 0, 'removed1': 0, 'removed2': 0, 'removed3': 0, 'final_remaining': 0}
        return empty_result, zero_stats

    pct_texts_per_topic = (texts_per_topic / n_unique_texts) * 100

    # Unique (text, topic) pairs joined with unique (text, cluster) pairs.
    topic_text_pairs = chunk_df[[chunk_text_col, chunk_topic_col]].drop_duplicates()
    text_cluster_pairs = (
        doc_cluster_df[[doc_text_col, doc_cluster_col]]
        .dropna(subset=[doc_cluster_col])
        .drop_duplicates()
        # Align the text column name and use an internal cluster name so the
        # merge cannot clash with columns already present in chunk_df.
        .rename(columns={doc_text_col: chunk_text_col, doc_cluster_col: internal_cluster_col})
    )
    topics_with_clusters = topic_text_pairs.merge(text_cluster_pairs, on=chunk_text_col, how='inner')
    clusters_per_topic = topics_with_clusters.groupby(chunk_topic_col)[internal_cluster_col].nunique()

    all_topics = set(chunk_df[chunk_topic_col].unique())
    in_one_text = set(texts_per_topic[texts_per_topic == 1].index)
    under_two_percent = set(pct_texts_per_topic[pct_texts_per_topic < 2].index)
    # Topics absent from the clustered documents count as zero clusters.
    in_one_cluster = {
        topic for topic in all_topics if clusters_per_topic.get(topic, 0) == 1
    }

    # --- 2. Sequential filtering ---
    print("Step 2: Performing sequential filtering...")
    filtering_stats = {'initial': len(all_topics)}

    dropped1 = all_topics & in_one_text
    remaining_topics1 = all_topics - dropped1
    removed1_count = len(dropped1)
    filtering_stats['removed1'] = removed1_count
    print(f"  Filter 1 (One Text) removed: {removed1_count}")

    dropped2 = remaining_topics1 & under_two_percent
    remaining_topics2 = remaining_topics1 - dropped2
    removed2_count = len(dropped2)
    filtering_stats['removed2'] = removed2_count
    print(f"  Filter 2 (< 2% Texts) removed: {removed2_count}")

    dropped3 = remaining_topics2 & in_one_cluster
    remaining_topics3 = remaining_topics2 - dropped3  # final surviving topics
    removed3_count = len(dropped3)
    filtering_stats['removed3'] = removed3_count
    filtering_stats['final_remaining'] = len(remaining_topics3)
    print(f"  Filter 3 (One Doc Cluster) removed: {removed3_count}")
    print(f"  Final remaining topic count: {len(remaining_topics3)}")

    # --- 3. Final DataFrame ---
    print("Step 3: Generating final filtered DataFrame...")
    surviving_rows = chunk_df[chunk_df[chunk_topic_col].isin(remaining_topics3)].copy()

    # One cluster row per text; rename its text column to match chunk_df.
    cluster_lookup = (
        doc_cluster_df[[doc_text_col, doc_cluster_col]]
        .drop_duplicates(subset=[doc_text_col])
        .rename(columns={doc_text_col: chunk_text_col})
    )
    df_final = pd.merge(surviving_rows, cluster_lookup, on=chunk_text_col, how='left')
    print("Step 4: Process complete.")

    # --- 4. Return ---
    return df_final, filtering_stats

def json_to_dataframe(json_path):
    """
    Convert a JSON file of clustered files into a pandas DataFrame.

    The file is expected to hold an object mapping each cluster name to a
    list of entries; entries that are dicts must have 'filename' and
    'content' keys, and entries that are not dicts are skipped.

    Parameters:
        - json_path (str): Path to the input JSON file.

    Returns:
        - DataFrame with columns: 'cluster', 'filename', 'content'. The
          columns are present even when the file contains no usable entries.

    Raises:
        KeyError: If a dict entry lacks 'filename' or 'content'.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = [
        {
            "cluster": cluster,
            "filename": file_entry["filename"],
            "content": file_entry["content"],
        }
        for cluster, file_entries in data.items()
        for file_entry in file_entries
        if isinstance(file_entry, dict)  # ignore malformed (non-dict) entries
    ]

    # Naming the columns keeps the schema stable when `records` is empty;
    # without it an empty input would yield a DataFrame with no columns.
    return pd.DataFrame(records, columns=["cluster", "filename", "content"])


def use_bertopic_with_custom_vectors(chunk_vectors, documents, n_topics=7):
    """
    Fit BERTopic on precomputed embeddings with every pipeline stage spelled out.

    Documents are lowercased and stripped of punctuation before fitting; the
    embeddings are used as given.

    Args:
        chunk_vectors: One embedding per document, convertible with `np.array`.
        documents: Iterable of document strings, in the same order as the vectors.
        n_topics: Passed to BERTopic as `nr_topics`.

    Returns:
        Tuple `(topic_model, topics)`: the fitted BERTopic model and the topic
        assigned to each document.
    """
    # Fetch the NLTK stopword corpus only when it is not installed yet.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    # English stopwords plus a few filler words, de-duplicated.
    filler_words = [
        'maximum', 'characters', 'said', 'also', 'would', 'could', 'may',
        'might', 'like', 'many', 'much', 'get', 'well', 'even', 'still',
        'back', 'see', 'way', 'thing', 'make', 'made', 'got', 'go', 'going'
    ]
    extended_stopwords = list(set(stopwords.words('english') + filler_words))

    # Lowercase, replace non-word characters with spaces, collapse whitespace.
    preprocessed_documents = [
        re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', doc.lower())).strip()
        for doc in documents
    ]

    embeddings = np.array(chunk_vectors)

    # Dimensionality reduction before clustering.
    reducer = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,  # fixed seed for reproducibility
    )

    # Density-based clustering of the reduced vectors.
    clusterer = HDBSCAN(
        min_cluster_size=27,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
        min_samples=2,
    )

    # Bag-of-words counts (unigrams and bigrams) used to describe each topic.
    vectorizer = CountVectorizer(
        stop_words=extended_stopwords,
        ngram_range=(1, 2),
        min_df=0.10,
        max_df=0.90,
    )

    topic_model = BERTopic(
        embedding_model=None,  # embeddings are supplied to fit_transform
        umap_model=reducer,
        hdbscan_model=clusterer,
        vectorizer_model=vectorizer,
        nr_topics=n_topics,
        min_topic_size=5,
        top_n_words=10,
        calculate_probabilities=True,
        verbose=True,
    )

    # The per-document probabilities are computed but not returned.
    topics, _probabilities = topic_model.fit_transform(
        preprocessed_documents, embeddings=embeddings
    )

    return topic_model, topics

### Embedding + Dim Reduction

In [None]:
import semchunk
import torch
import itertools
import pandas as pd
from tqdm import tqdm
from typing import Iterable, Generator
from transformers import AutoTokenizer, AutoModel
from transformers.models.bert.modeling_bert import BertModel
from pacmap import PaCMAP
import zipfile
import fitz
import os
import re

import re

def clean_text(text: str) -> str:
    """
    Strip URLs and digit runs from `text`, then collapse all whitespace runs
    to single spaces and trim the ends.
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", text)
    without_digits = re.sub(r"\d+", "", without_urls)
    return re.sub(r"\s+", " ", without_digits).strip()

def load_and_filter_input(file_path: str, text_column: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Load a CSV and return the rows with usable text plus the cleaned texts.

    Every value in `text_column` is cleaned with `clean_text` (URLs and
    numbers removed, whitespace collapsed). Rows whose value is null, or
    whose cleaned text is empty, are dropped.

    Args:
        file_path: Path of the CSV file to read.
        text_column: Name of the column holding the text.

    Returns:
        Tuple `(filtered_df, texts)`: a copy of the kept rows, with
        `text_column` replaced by the cleaned text, and that column as a list.
    """
    df = pd.read_csv(file_path)

    # Map nulls to "" explicitly: `astype(str)` would turn NaN into the
    # literal text "nan", which would then survive the emptiness filter.
    cleaned = pd.Series(
        ["" if pd.isna(value) else clean_text(str(value)) for value in df[text_column]],
        index=df.index,
        dtype=object,
    )
    df[text_column] = cleaned

    # Keep only rows that still have text after cleaning.
    has_text = cleaned.str.strip().astype(bool)
    filtered_df = df[has_text].copy()

    texts = filtered_df[text_column].tolist()

    return filtered_df, texts


def chunk(
    texts: list[str],
    model: str = 'avsolatorio/GIST-large-Embedding-v0',
    # Alternative: 'intfloat/multilingual-e5-large-instruct'
    size: int = None,
    max_token_chars: int = None,  # None lets semchunk determine it
    processes: int = 1,
) -> list[list[str]]:
    """
    Split each text into chunks with semchunk, sized by the model's tokenizer.

    The tokenizer is loaded with `model_max_length=512`. When `size` is None
    the chunk size is 90% of the tokenizer's `model_max_length`.

    Returns one list of chunks per input text, in input order.
    """
    tokenizer = AutoTokenizer.from_pretrained(model, model_max_length=512)
    chunk_size = int(tokenizer.model_max_length * 0.9) if size is None else size

    chunker = semchunk.chunkerify(tokenizer, chunk_size=chunk_size, max_token_chars=max_token_chars)

    chunked_texts = []
    for text in texts:
        chunked_texts.append(chunker(text, processes=processes))
    return chunked_texts

def batch_generator(
    iterable: Iterable,
    batch_size: int
) -> Generator[list, None, None]:
    """Yield consecutive lists of up to `batch_size` items taken from `iterable`."""
    source = iter(iterable)
    while True:
        try:
            head = next(source)
        except StopIteration:
            return
        # The head item is already consumed, so take at most batch_size - 1 more.
        batch = [head]
        batch.extend(itertools.islice(source, batch_size - 1))
        yield batch

def vectorise(
    texts: list[list[str]],
    model: str = 'avsolatorio/GIST-large-Embedding-v0',
    # Alternative: 'intfloat/multilingual-e5-large-instruct'
    normalise: bool = True,
    batch_size: int = 192,
    gpu: bool = True,
    progress: bool = True
) -> list[list[float]]:
    """
    Embed chunked texts and return one vector per text.

    Each chunk is embedded with the first-token output of the model
    (optionally L2-normalised); a text's vector is the mean of its chunk
    vectors.

    Args:
        texts: One list of chunk strings per text.
        model: Hugging Face model name used for both tokenizer and model.
        normalise: Whether to L2-normalise each chunk vector.
        batch_size: Number of chunks embedded per forward pass.
        gpu: Use CUDA when it is available; falls back to CPU otherwise.
        progress: Whether to show a tqdm progress bar.

    Returns:
        One embedding (list of floats) per entry of `texts`, in order.

    NOTE(review): a text with no chunks reaches `torch.stack` with an empty
    list, which is expected to raise — confirm callers never pass one.
    """
    encoder: BertModel = AutoModel.from_pretrained(model)
    tokeniser = AutoTokenizer.from_pretrained(model)

    # Only move to CUDA when a device actually exists, so the default
    # gpu=True does not crash on CPU-only machines.
    use_cuda = gpu and torch.cuda.is_available()
    if gpu and not use_cuda:
        print("GPU not available, using CPU instead")
    if use_cuda:
        encoder = encoder.to('cuda')

    # Flatten all chunks, remembering which slice belongs to which text.
    chunks = []
    boundaries = []
    for text in texts:
        begin = len(chunks)
        chunks.extend(text)
        boundaries.append((begin, len(chunks)))

    vectors = []

    with tqdm(total=len(chunks), disable=not progress, unit=' text') as bar:
        for batch in batch_generator(chunks, batch_size):
            encoded = tokeniser(batch, padding=True, truncation=True, return_tensors='pt')
            if use_cuda:
                encoded = encoded.to('cuda')
            with torch.no_grad():
                # First-token ([CLS]) hidden state as the chunk embedding.
                embedded = encoder(**encoded)[0][:, 0]
                if normalise:
                    embedded = torch.nn.functional.normalize(embedded, p=2, dim=1)
            # Extending with a 2-D tensor appends one row tensor per chunk.
            vectors.extend(embedded.cpu())
            bar.update(len(batch))

    # Average each text's chunk vectors into a single vector.
    return [
        torch.mean(torch.stack(vectors[start:end]), dim=0).tolist()
        for start, end in boundaries
    ]


def reduce(
    vectors: list[list[float]],
    clusterisable_dimensions: int = 80,
    map_dimensions: int = 2,
) -> tuple[list[list[float]], list[list[float]]]:
    """
    Project `vectors` twice with PaCMAP: once to `clusterisable_dimensions`
    for clustering and once to `map_dimensions` for plotting.

    Returns `(clusterisable_vectors, map_vectors)` as nested lists.
    """
    shared_options = {
        "n_neighbors": None,
        "apply_pca": False,
        "save_tree": True,
        "verbose": True,
    }

    cluster_projector = PaCMAP(n_components=clusterisable_dimensions, **shared_options)
    map_projector = PaCMAP(n_components=map_dimensions, **shared_options)

    clusterisable_vectors = cluster_projector.fit_transform(vectors).tolist()
    map_vectors = map_projector.fit_transform(vectors).tolist()

    return clusterisable_vectors, map_vectors

def csv_column_to_list(file_path: str, column_name: str) -> list[str]:
    """Read one column of a CSV file and return its non-null values as strings."""
    column = pd.read_csv(file_path)[column_name]
    present_values = column.dropna()
    return present_values.astype(str).tolist()

def save_reduced_vectors(clusterisable_vectors: list[list[float]], map_vectors: list[list[float]], clusterisable_filename: str, map_filename: str):
    """Write each set of reduced vectors to its own CSV file, without an index column."""
    outputs = (
        (clusterisable_vectors, clusterisable_filename),
        (map_vectors, map_filename),
    )
    for rows, destination in outputs:
        pd.DataFrame(rows).to_csv(destination, index=False)

def preprocess_text(text):
    """
    Return `text` unchanged.

    The cleaning steps (lowercasing, removing punctuation and special
    characters, collapsing whitespace) are commented out below, so this
    function currently performs no preprocessing.
    """
    # Disabled cleaning steps, kept for reference:
    # text = text.lower()  # Convert to lowercase
    # text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation and special characters
    # text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    return text

def load_documents(folder_path):
    """
    Load the TXT and PDF files of a folder into a DataFrame.

    Files are visited in sorted name order and matched by extension
    case-insensitively (so '.PDF' and '.TXT' are included). Text files are
    read as UTF-8; PDF text is extracted page by page with PyMuPDF and joined
    with spaces. Each text is passed through `preprocess_text`. A PDF that
    cannot be processed is reported with a printed message and skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        DataFrame with columns 'filename' and 'content', one row per loaded file.
    """
    data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        lowered_name = file.lower()

        if lowered_name.endswith(".txt"):
            with open(file_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            data.append((file, preprocess_text(raw_text)))

        elif lowered_name.endswith(".pdf"):
            try:
                with fitz.open(file_path) as pdf:
                    raw_text = " ".join(page.get_text() for page in pdf)
                preprocessed_text = preprocess_text(raw_text)
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the whole load.
                print(f"Error processing PDF {file}: {e}")
                continue
            data.append((file, preprocessed_text))

    return pd.DataFrame(data, columns=["filename", "content"])

def unzip_files(zip_filepath, extract_to_path):
    """
    Extract every member of a zip archive into `extract_to_path`.

    Prints a success message, or an error message when the archive is missing
    or is not a valid zip file; those two errors are not raised to the caller.
    """
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as archive:
            archive.extractall(extract_to_path)
    except FileNotFoundError:
        print(f"Error: Zip file not found at {zip_filepath}")
    except zipfile.BadZipFile:
        print(f"Error: Invalid zip file at {zip_filepath}")
    else:
        print(f"Files extracted successfully to: {extract_to_path}")


### Experimental pipe

In [None]:
def improved_vectorise(
    texts: list[list[str]],
    model: str = 'intfloat/multilingual-e5-large-instruct',
    normalise: bool = True,
    batch_size: int = 192,
    gpu: bool = True,
    progress: bool = True,
    return_chunk_vectors: bool = True,
    instruction_prefix: str = "Identify the stance or topic of Position papers based content of the segment"
) -> tuple[list[list[float]], list[list[float]]]:
    """
    Embed chunked texts, returning document vectors and optionally chunk vectors.

    Every chunk is prefixed with `instruction_prefix`, embedded with the
    first-token output of the model and optionally L2-normalised. A
    document's vector is the plain (unweighted) mean of its chunk vectors.

    Args:
        texts: List of lists of text chunks, one inner list per document
        model: Model name to use for embeddings
        normalise: Whether to L2-normalise each chunk vector
        batch_size: Number of chunks per forward pass
        gpu: Use CUDA when available; falls back to CPU otherwise
        progress: Whether to show a progress bar
        return_chunk_vectors: If True, returns both document and chunk vectors
        instruction_prefix: Text prepended to every chunk before embedding

    Returns:
        Tuple of (document_vectors, chunk_vectors) if return_chunk_vectors=True
        Otherwise just document_vectors

    NOTE(review): the prefix is concatenated directly onto the chunk with no
    separator — confirm this is the prompt format the chosen model expects.
    """
    from transformers import AutoModel, AutoTokenizer
    import torch
    from tqdm import tqdm

    def batch_generator(items, batch_size):
        for i in range(0, len(items), batch_size):
            yield items[i:i + batch_size]

    # Load model and tokenizer
    model_instance = AutoModel.from_pretrained(model)
    tokeniser = AutoTokenizer.from_pretrained(model)

    # Warn only when a GPU was requested but is missing; an explicit
    # gpu=False is a deliberate choice and needs no message.
    use_cuda = gpu and torch.cuda.is_available()
    if gpu and not use_cuda:
        print("GPU not available, using CPU instead")
    if use_cuda:
        model_instance = model_instance.to('cuda')

    # Flatten all chunks, remembering which slice belongs to which document.
    all_chunks = []
    chunk_boundaries = []
    for text_chunks in texts:
        begin = len(all_chunks)
        all_chunks.extend(instruction_prefix + chunk for chunk in text_chunks)
        chunk_boundaries.append((begin, len(all_chunks)))

    chunk_vectors = []

    with tqdm(total=len(all_chunks), disable=not progress, unit=' chunk') as bar:
        for batch in batch_generator(all_chunks, batch_size):
            # Pad to the longest sequence in the batch rather than always to
            # 512 tokens; padded positions are masked out either way, so this
            # only avoids wasted computation on short batches.
            inputs = tokeniser(
                batch,
                padding=True,
                truncation=True,
                return_tensors='pt',
                max_length=512
            )

            if use_cuda:
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model_instance(**inputs)
                # First-token hidden state as the chunk embedding.
                batch_vectors = outputs[0][:, 0]

                if normalise:
                    batch_vectors = torch.nn.functional.normalize(batch_vectors, p=2, dim=1)

            chunk_vectors.extend(batch_vectors.cpu().tolist())
            bar.update(len(batch))

    # Document vector = unweighted mean of the document's chunk vectors.
    document_vectors = [
        torch.mean(torch.tensor(chunk_vectors[start:end]), dim=0).tolist()
        for start, end in chunk_boundaries
    ]

    if return_chunk_vectors:
        return document_vectors, chunk_vectors
    return document_vectors


def preprocess_dataframe_columns(df: pd.DataFrame, content_column: str, filename_column: str, cluster_column: str = None) -> tuple[list[str], list[str], list[str]]:
    """
    Extract cleaned content, filenames, and cluster labels from a dataframe.

    Only rows that have both a content value and a filename are used, and all
    three returned lists are taken from those same rows, so position i in
    each list refers to the same document. Content is reduced to ASCII and
    stripped of URLs, digits and extra whitespace.

    Args:
        df: Input DataFrame
        content_column: Name of the column containing text content
        filename_column: Name of the column containing filenames
        cluster_column: Optional name of the column containing cluster information

    Returns:
        Tuple `(processed_content, filenames, clusters)`. `clusters` is an
        empty list when `cluster_column` is None or not in `df`; otherwise it
        has one entry per kept row, with "" for a missing cluster.
    """
    import re

    def preprocess_text(text):
        # Drop every non-ASCII character.
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Clean extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    # Select rows once, jointly: dropping nulls per column independently
    # would shift the lists against each other whenever the columns have
    # nulls in different rows.
    usable = df[content_column].notna() & df[filename_column].notna()
    rows = df.loc[usable]

    processed_content = rows[content_column].astype(str).apply(preprocess_text).tolist()
    filenames = rows[filename_column].astype(str).tolist()

    clusters = []
    if cluster_column is not None and cluster_column in df.columns:
        # Keep a placeholder for missing clusters so alignment is preserved.
        clusters = ["" if pd.isna(value) else str(value) for value in rows[cluster_column]]

    return (processed_content, filenames, clusters)

def chunk_with_metadata(
    texts: list[str],
    filenames: list[str],
    clusters: list[str] = None,
    model: str = 'intfloat/multilingual-e5-large-instruct',
    size: int = None,
    max_token_chars: int = None,
    processes: int = 1,
) -> tuple[list[list[str]], list[list[str]], list[list[str]]]:
    """
    Chunk each text with semchunk and repeat its metadata for every chunk.

    The tokenizer is loaded with `model_max_length=512`; when `size` is None
    the chunk size is 60% of the tokenizer's `model_max_length`. A missing or
    empty `clusters` list is replaced by "" placeholders, and a list shorter
    than `texts` is padded with "".

    Returns `(chunked_texts, chunk_filenames, chunk_clusters)`: three parallel
    lists with one inner list per processed text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model, model_max_length=512)
    chunk_size = int(tokenizer.model_max_length * 0.6) if size is None else size
    chunker = semchunk.chunkerify(tokenizer, chunk_size=chunk_size, max_token_chars=max_token_chars)

    # Make sure there is a cluster value (possibly "") for every text.
    if not clusters:
        clusters = [""] * len(texts)
    elif len(clusters) < len(texts):
        missing = len(texts) - len(clusters)
        clusters = clusters + [""] * missing

    chunked_texts, chunk_filenames, chunk_clusters = [], [], []
    for text, filename, cluster in zip(texts, filenames, clusters):
        pieces = chunker(text, processes=processes)
        n_pieces = len(pieces)
        chunked_texts.append(pieces)
        # One metadata entry per chunk of this text.
        chunk_filenames.append([filename] * n_pieces)
        chunk_clusters.append([cluster] * n_pieces)

    return chunked_texts, chunk_filenames, chunk_clusters

def save_chunk_embeddings_json(
    vectors: list,
    chunks: list[str],
    filenames: list[str],
    clusters: list[str],
    output_path: str
):
    """
    Save chunk-level embeddings with metadata to a JSON file.

    Args:
        vectors: Embedding vectors, one per chunk. May be a 2-D numpy array,
            a list of plain lists, or a list of array-like rows exposing
            ``tolist()`` (e.g. one numpy array per chunk).
        chunks: List of text chunks
        filenames: List of source filenames for each chunk
        clusters: List of cluster assignments for each chunk
        output_path: Path to save the JSON file

    Returns:
        The list of per-chunk records that was written, each with the keys
        ``chunk``, ``filename``, ``cluster``, ``length`` (character length of
        the chunk) and ``embedding``.

    Raises:
        IndexError: If ``filenames``, ``clusters`` or ``vectors`` has fewer
            entries than ``chunks``.
    """
    import json
    import numpy as np

    # Convert a whole numpy matrix to nested lists in one go.
    if isinstance(vectors, np.ndarray):
        vectors = vectors.tolist()

    def _as_json_vector(vector):
        # A plain list of per-chunk arrays is not caught by the check above,
        # and json cannot serialise ndarray rows, so convert them one by one.
        return vector.tolist() if hasattr(vector, "tolist") else vector

    # One record per chunk; metadata is looked up by position.
    data = [
        {
            "chunk": chunk,
            "filename": filenames[i],
            "cluster": clusters[i],
            "length": len(chunk),
            "embedding": _as_json_vector(vectors[i]),
        }
        for i, chunk in enumerate(chunks)
    ]

    # Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    print(f"Saved {len(data)} chunk embeddings to {output_path}")

    return data

def reduce(
    vectors: list[list[float]],
    clusterisable_dimensions: int = 80,
    map_dimensions: int = 2,
) -> tuple[list[list[float]], list[list[float]]]:
    """Project `vectors` with PaCMAP into a clustering space and a map space.

    Two separate PaCMAP models sharing the same settings are fitted on the
    same input: one with `clusterisable_dimensions` components and one with
    `map_dimensions` components.

    Returns:
        `(clusterisable_vectors, map_vectors)`, each converted to nested
        Python lists via `.tolist()`.
    """
    shared_settings = {
        "n_neighbors": None,
        "apply_pca": False,
        "save_tree": True,
        "verbose": False,
    }

    # Build both models first, then fit them in the same order as before.
    cluster_model, map_model = [
        PaCMAP(n_components=n_components, **shared_settings)
        for n_components in (clusterisable_dimensions, map_dimensions)
    ]

    clusterisable_vectors = cluster_model.fit_transform(vectors).tolist()
    map_vectors = map_model.fit_transform(vectors).tolist()
    return clusterisable_vectors, map_vectors

## Execute

In [None]:
# 1. Preprocess the data: returns the cleaned contents, the filenames and
#    the (possibly empty) cluster labels as three lists.
texts, filenames, clusters = preprocess_dataframe_columns(
    df,
    content_column,
    filename_column,
    cluster_column,
)

# 2. Chunk the texts; filenames and clusters are repeated once per chunk.
chunked_texts, chunk_filenames, chunk_clusters = chunk_with_metadata(
    texts=texts,
    filenames=filenames,
    clusters=clusters,
)


In [None]:
# 3. Vectorize the chunks (yields document-level and chunk-level vectors)
doc_vectors, chunk_vectors = improved_vectorise(
    chunked_texts,
    model='intfloat/multilingual-e5-large-instruct',
    instruction_prefix="Identify the subject or theme of the given post",
)

#### Chunk reduction

In [None]:
# 4. Reduce to 2D coordinates (the clustering-space projection is discarded)
_, map_vectors = reduce(chunk_vectors)

# 5. Flatten the per-document metadata so there is one entry per chunk
flat_filenames = [name for doc_filenames in chunk_filenames for name in doc_filenames]
flat_clusters = [label for doc_clusters in chunk_clusters for label in doc_clusters]

# Flatten the chunk texts the same way, keeping the pre-vectorisation strings
original_flat_chunks = [chunk for doc_chunks in chunked_texts for chunk in doc_chunks]

# 6. Split the 2D map vectors into x and y coordinate lists
x_values = [coords[0] for coords in map_vectors]
y_values = [coords[1] for coords in map_vectors]

# 7. Assemble one row per chunk (the cluster column is currently disabled)
result_df = pd.DataFrame({
    'text_name': flat_filenames,
    'chunk_content': original_flat_chunks,
    # 'cluster': flat_clusters,
    'x': x_values,
    'y': y_values,
})

#### Doc reduction

In [None]:
# Pair every document vector with the filename in the same position of df.
doc_vecs_unfiltered = [*zip(doc_vectors, df['filename'])]

In [None]:
# (filename, content) pairs straight from df.
# NOTE: this rebinds `texts`, which previously held the preprocessed texts.
texts = [*zip(df['filename'].tolist(), df['content'].tolist())]

In [None]:
# Keep only documents whose vector is not a bare float
# (presumably a NaN placeholder for documents without a vector; confirm
# against improved_vectorise).
filtered_doc_vecs = [item for item in doc_vecs_unfiltered if not isinstance(item[0], float)]
doc_vecs_toprocess = [item[0] for item in filtered_doc_vecs]
filtered_filenames = [item[1] for item in filtered_doc_vecs]

# Build the lookup once: testing membership against the list inside the
# comprehension made this step quadratic in the number of documents.
kept_filenames = set(filtered_filenames)
doc_texts_toprocess = [text[1] for text in texts if text[0] in kept_filenames]


In [None]:
# One row per retained document: the embedding dimensions as columns,
# followed by the source filename and the original text.
df_doc_vecs = pd.DataFrame(np.array(doc_vecs_toprocess))
df_doc_vecs['filtered_filenames'] = filtered_filenames
df_doc_vecs['doc_texts_toprocess'] = doc_texts_toprocess

# df_doc_vecs.to_csv('doc_vecs_originals.csv')

In [None]:
# Project the document-level vectors.
# NOTE: this rebinds `map_vectors`, which held the chunk-level 2D
# coordinates after the chunk reduction cell.
clusterisable_vectors, map_vectors = reduce(doc_vecs_toprocess)
# Presumably writes both projections to the two CSV paths given here.
save_reduced_vectors(
    clusterisable_vectors,
    map_vectors,
    "clusterisable_vectors.csv",
    "map_vectors.csv",
)

In [None]:
# Tabulate the document-level map coordinates together with the original
# text and the source filename, then export them.
map_vectors_ = np.array(map_vectors)
vector_columns = ["map_vector_" + str(i) for i in range(map_vectors_.shape[1])]

df_map_vectors = pd.DataFrame(map_vectors, columns=vector_columns)
df_map_vectors.insert(0, "original_text", doc_texts_toprocess)  # text goes first
df_map_vectors["text_name"] = [name for _, name in filtered_doc_vecs]
# df_map_vectors["cluster"] = df['cluster'].astype(str).tolist()

df_map_vectors.to_csv('doc_vectors_original_text.csv')

In [None]:
# Rebuild the coordinate table without the original text column and export
# it under a separate name. This overwrites `df_map_vectors`.
map_vectors_ = np.array(map_vectors)
vector_columns = ["map_vector_" + str(i) for i in range(map_vectors_.shape[1])]

df_map_vectors = pd.DataFrame(map_vectors, columns=vector_columns)
# df_map_vectors.insert(0, "original_text", texts)  # Add original texts
df_map_vectors["text_name"] = [name for _, name in filtered_doc_vecs]
# df_map_vectors["cluster"] = df['cluster'].astype(str).tolist()

df_map_vectors.to_csv('doc_vectors.csv')

In [None]:
# Preview the first rows of the document-level map coordinates table.
df_map_vectors.head()

Unnamed: 0,map_vector_0,map_vector_1,text_name
0,-26.568062,-3.525297,Public_ECSA Response to the public consulation...
1,-26.656979,-3.598467,Public_Cosmetics_Europe__contribution_DSA___FI...
2,-26.57781,-3.531961,public_Wettbewerbeszentrale_200908_DSA_Konsult...
3,-26.709795,-3.640888,Public_ACV-CSC Doc02_Adopted - ETUC resolution...
4,-26.549475,-3.509785,Public_ZVEI_20200907_-_Stellungnahme_zum_DSA_f...


## Topic modelling

In [None]:
# Preview the chunk-level table (filename, chunk text, x/y) before topics are added.
result_df.head()

Unnamed: 0,text_name,chunk_content,x,y
0,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Annex to the Digital Services Act Consultation...,0.252874,-1.689888
1,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,As Nordic PSM companies we want to operate on ...,0.221081,-1.200811
2,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Terms and conditions and community standards a...,-6.485656,-1.41136
3,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,"After negotiations, followed by an intense pub...",-6.697905,-2.220328
4,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,DR disagreed with this as that the app clearly...,-6.631811,-2.24375


In [None]:
# Fit BERTopic on the precomputed chunk vectors; `topics` holds one topic
# id per entry of original_flat_chunks.
topic_model, topics = use_bertopic_with_custom_vectors(
    chunk_vectors,
    original_flat_chunks,
    n_topics=77,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
2025-05-08 10:20:55,220 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-08 10:21:33,984 - BERTopic - Dimensionality - Completed ✓
2025-05-08 10:21:33,986 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-08 10:21:36,399 - BERTopic - Cluster - Completed ✓
2025-05-08 10:21:36,403 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-08 10:21:38,974 - BERTopic - Representation - Completed ✓
2025-05-08 10:21:38,975 - BERTopic - Topic reduction - Reducing number of topics
2025-05-08 10:21:38,975 - BERTopic - Topic reduction - Number of topics (77) is equal or higher than the clustered topics(73).
2025-05-08 10:21:38,977 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-08 10:21:40,893 - BERTopic - Representation - Completed ✓


In [None]:
# One row per chunk: the embedding dimensions as columns, followed by the
# chunk text and the filename it came from.
df_chunk_vecs = pd.DataFrame(np.array(chunk_vectors))
df_chunk_vecs['original_flat_chunks'] = original_flat_chunks
df_chunk_vecs['text_name'] = result_df['text_name']

# df_chunk_vecs.head()

# Export the chunk-level table built above. This cell previously wrote
# df_doc_vecs (the document-level table) under the chunk filename.
df_chunk_vecs.to_csv('chunk_vecs_originals.csv')

In [None]:
# Add topics to your dataframe
# `topics` is assigned positionally, so it must have one entry per row of
# result_df (i.e. per chunk).
result_df['bertopic_cluster'] = topics
result_df.head()

Unnamed: 0,text_name,chunk_content,x,y,bertopic_cluster
0,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Annex to the Digital Services Act Consultation...,0.252874,-1.689888,53
1,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,As Nordic PSM companies we want to operate on ...,0.221081,-1.200811,53
2,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Terms and conditions and community standards a...,-6.485656,-1.41136,26
3,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,"After negotiations, followed by an intense pub...",-6.697905,-2.220328,26
4,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,DR disagreed with this as that the app clearly...,-6.631811,-2.24375,26


In [None]:
# Render BERTopic's built-in topic visualisation as the cell output.
topic_model.visualize_topics()

In [None]:
# Fetch the per-topic summary table from the fitted model
topic_info = topic_model.get_topic_info()


def _label_for(topic_id):
    """Return a readable label: "Outlier" for -1, else the id plus its top three words."""
    if topic_id == -1:
        return "Outlier"
    leading_terms = [term for term, _ in topic_model.get_topic(topic_id)][:3]
    return f"Topic {topic_id}: {' '.join(leading_terms)}"


# Map every topic id listed in the summary table to its label
topic_names = {topic_id: _label_for(topic_id) for topic_id in topic_info['Topic']}

# Attach the label to each chunk through its assigned topic id
result_df['topic_name'] = result_df['bertopic_cluster'].map(topic_names)

In [None]:
# Export the topic summary table, then display it as the cell output.
topic_info.to_csv('topic_library.csv')
topic_info

In [None]:
# Preview the chunk-level table now that topic ids and topic names are attached.
result_df.head()

Unnamed: 0,text_name,chunk_content,x,y,bertopic_cluster,topic_name
0,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Annex to the Digital Services Act Consultation...,0.252874,-1.689888,53,Topic 53: dsa body independent
1,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,As Nordic PSM companies we want to operate on ...,0.221081,-1.200811,53,Topic 53: dsa body independent
2,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,Terms and conditions and community standards a...,-6.485656,-1.41136,26,Topic 26: app prohibited mega
3,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,"After negotiations, followed by an intense pub...",-6.697905,-2.220328,26,Topic 26: app prohibited mega
4,Public_NordVision Doc02_Annex_to_the_DSA_Consu...,DR disagreed with this as that the app clearly...,-6.631811,-2.24375,26,Topic 26: app prohibited mega


In [None]:
# Export the chunk-level table (filename, chunk text, x/y, topic id, topic name).
result_df.to_csv('chunk_vectors.csv')

## Data manipulation