In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from transformers import pipeline, BertModel, BertTokenizerFast
import pandas as pd
import re
import torch

import os

In [None]:
# Regex for the heading that opens the body of a verdict: the Hebrew letters of
# "psak-din" with optional whitespace between letters and an optional hyphen,
# followed by a line break. (An earlier variant did not require the line break.)
psak = r'פ\s*ס\s*ק\s*-*\s*ד\s*י\s*ן\s*\n'


def extract_middle_content(text):
    """Return the body of a verdict, stripped of its header and signature part.

    Three patterns are tried in order; the first one that matches decides the
    result:

    1. everything between the first `psak` heading and the last closing
       formula ("nitan hayom");
    2. everything after the last `psak` heading;
    3. everything before the last closing formula.

    Arguments:
        text: The raw document. ``None`` is accepted.

    Returns:
        ``""`` when `text` is ``None``; the stripped capture of the first
        matching pattern; or `text` itself when nothing matches, when the
        capture is empty after stripping, or when `text` is not a string.
    """
    if text is None:
        return ""

    # Built on every call so that a later re-assignment of `psak` in the
    # notebook is picked up (the `re` module caches compiled patterns).
    patterns = (
        fr'{psak}([\s\S]*)[\s-]*ניתן[\s-]*היום',
        fr'.*{psak}([\s\S]*)',
        r'^([\s\S]*)(ניתן[\s-]*היום)',
    )

    for pattern in patterns:
        try:
            match = re.search(pattern, text, re.DOTALL)
        except TypeError:
            # Non-string input (e.g. a NaN cell): hand it back untouched
            # instead of hiding every possible error behind a bare `except`.
            return text
        if match:
            extracted_text = match.group(1).strip()
            # An empty capture carries no information; keep the full text.
            return extracted_text if extracted_text else text

    # Neither marker is present: keep the document as is.
    return text

In [None]:
# Folder that holds one plain-text verdict per file.
directory_path = "/mnt/local/mikehash/Data/Nevo/NevoVerdicts"
data = []

# Sorted so that row order (and therefore every document index used further
# down) is the same on each run; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    try:
        # NOTE(review): UTF-8 is assumed for the verdict files — confirm.
        with open(file_path, 'r', encoding='utf-8') as file:
            data.append(file.read())
    except (OSError, UnicodeDecodeError) as error:
        # Best effort: report the entry that could not be opened or decoded
        # (sub-directory, permissions, bad bytes) and carry on with the rest.
        print(f"{file_path}: {error}")

# One row per readable file: the raw text plus the extracted verdict body.
df = pd.DataFrame(data, columns=['text'])
df['extracted_content'] = df['text'].apply(extract_middle_content)

In [None]:
# Rebind `data` from the raw file contents to the extracted verdict bodies,
# as a plain Python list.
data = df['extracted_content'].tolist()

In [None]:
# Load the AlephBERT tokenizer and encoder directly (the pipeline-based
# alternative is kept commented out below).
# NOTE(review): `device` is passed to the tokenizer's from_pretrained; this
# presumably has no effect on where anything runs — confirm. The model itself
# is not moved to the GPU here, and both names are rebound in later cells.
# embedding = pipeline("feature-extraction", model="onlplab/alephbert-base", device='cuda:0')
tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base', device='cuda:0')
model = BertModel.from_pretrained('onlplab/alephbert-base')

In [None]:
# Feature-extraction pipeline for AlephBERT on the first CUDA device.
# `pipe` is rebound to a different model in later cells.
pipe = pipeline('feature-extraction', model='onlplab/alephbert-base', device='cuda:0')

In [None]:
# Interactive check of the concrete pipeline class that was created.
type(pipe)

In [None]:
# Smoke test: run the pipeline on the first document and show the first five
# values of the innermost list at index [0][0]
# (presumably batch item 0, token 0 — TODO confirm the output nesting).
pipe(data[0], truncation=True, padding=True)[0][0][:5]

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Rebind `tokenizer` / `model` to Legal-heBERT (masked-LM head) and move the
# model to the first CUDA device.
# NOTE(review): `max_length`, `device` and `truncation` are given to
# from_pretrained rather than to the tokenizer call; whether they influence
# later tokenization is not visible here — confirm.
tokenizer = AutoTokenizer.from_pretrained("avichr/Legal-heBERT", max_length=512, device='cuda:0', truncation=True)
model = AutoModelForMaskedLM.from_pretrained("avichr/Legal-heBERT")
# tokenizer.to('cuda:0')
model.to('cuda:0')

In [None]:
from transformers import AutoTokenizer, AutoModel

model_name = 'avichr/Legal-heBERT_ft' # the fine-tuned Legal-heBERT checkpoint

# Rebind `tokenizer` / `model` to the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# The pipeline is built from `model_name`, not from the two objects above.
pipe = pipeline('feature-extraction', model=model_name, device='cuda:0')

In [None]:
import matplotlib.pyplot as plt

def plot_word_vs_token_counts(documents, tokenizer, min_chars=1000, max_chars=3000,
                              step=300, token_limit=512):
    """Scatter-plot token count against character-prefix length, per document.

    Every document is cut to its first ``min_chars``, ``min_chars + step``, ...
    characters (staying below ``max_chars``), each prefix is tokenized, and the
    number of input ids is plotted against the prefix length. A dashed
    horizontal line marks ``token_limit``.

    Despite the function name, prefixes are measured in characters, not words;
    the axis labels say so. For a document shorter than a prefix length the
    nominal length is still used as the x value.

    Arguments:
        documents: Iterable of strings to analyse.
        tokenizer: Callable used as ``tokenizer(text, add_special_tokens=True)``
                   whose result exposes ``['input_ids']``.
        min_chars: First prefix length.
        max_chars: Exclusive upper bound for the prefix lengths.
        step: Distance between consecutive prefix lengths.
        token_limit: Height of the horizontal reference line.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    prefix_lengths = list(range(min_chars, max_chars, step))

    # enumerate() gives each document its own label even when two documents
    # have identical text, and avoids a linear search per document.
    for position, document in enumerate(documents, start=1):
        token_counts = [
            len(tokenizer(document[:length], add_special_tokens=True)['input_ids'])
            for length in prefix_lengths
        ]
        plt.scatter(prefix_lengths, token_counts, label=f'Document {position}')

    # Reference line for the model's input limit.
    plt.axhline(y=token_limit, color='red', linestyle='--', label=f'{token_limit} Tokens Limit')

    plt.xlabel('Number of Characters')
    plt.ylabel('Number of Tokens')
    plt.title('Character Count vs Token Count')

    # The legend is intentionally not drawn; call plt.legend() before
    # plt.show() to enable it.
    plt.show()

# Example usage
# NOTE(review): the sample `documents` list below is not read by the call
# underneath, which plots a slice of `data` instead.
documents = [
    "This is document 1 with some text.",
    "Another document, a bit longer this time.",
    "A very long document with many words to test the token limit.",
]

# Plot items 20..39 of `data` using whichever `tokenizer` is currently bound.
plot_word_vs_token_counts(data[20:40], tokenizer)


In [None]:
import numpy as np

from tqdm import tqdm
from typing import List
from torch.utils.data import Dataset
from sklearn.preprocessing import normalize
from transformers.pipelines import Pipeline

from bertopic.backend import BaseEmbedder


class CustomEmbedder(BaseEmbedder):
    """BERTopic embedding backend that embeds long documents chunk by chunk.

    A document is tokenized, its token ids are cut into fixed-size slices,
    every slice is decoded back to text and embedded with the wrapped
    feature-extraction pipeline (masked mean pooling + normalisation), and the
    chunk vectors are averaged into one vector per document.
    """

    # Head-room subtracted from `max_tokens` when slicing the token ids, so a
    # decoded chunk has some slack when the pipeline tokenizes it again.
    _CHUNK_MARGIN = 7
    _DEFAULT_MAX_TOKENS = 512

    def __init__(self, embedding_model: Pipeline):
        """Wrap a transformers pipeline.

        Arguments:
            embedding_model: A `transformers` pipeline instance.

        Raises:
            ValueError: If `embedding_model` is not a `Pipeline`.
        """
        super().__init__()

        if isinstance(embedding_model, Pipeline):
            self.embedding_model = embedding_model
        else:
            raise ValueError("Please select a correct transformers pipeline. For example: "
                             "pipeline('feature-extraction', model='distilbert-base-cased', device=0)")

    def embed(self,
              documents: List[str],
              max_tokens=512,
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            max_tokens: Maximum number of tokens per chunk
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # BERTopic's BaseEmbedder helpers appear to call
        # `embed(documents, verbose)` positionally, which lands the flag in the
        # `max_tokens` slot (TODO confirm against the installed version).
        # A bool is never a valid token budget, so treat it as `verbose`.
        if isinstance(max_tokens, bool):
            verbose = max_tokens
            max_tokens = self._DEFAULT_MAX_TOKENS

        embeddings = []
        for document in tqdm(documents, total=len(documents), disable=not verbose):
            chunks = self._split_document(document, max_tokens)
            chunk_embeddings = [self._embed(chunk) for chunk in chunks]
            # Unweighted average of the chunk vectors.
            embeddings.append(np.mean(chunk_embeddings, axis=0))

        return np.array(embeddings)

    def _split_document(self, document: str, max_tokens: int) -> List[str]:
        """Cut `document` into text chunks of at most `max_tokens - 7` token ids.

        Arguments:
            document: The text to split.
            max_tokens: Token budget per chunk before the margin is subtracted.

        Returns:
            The decoded chunks, in document order, with special tokens removed.

        Raises:
            ValueError: If `max_tokens` leaves no room after the margin.
        """
        chunk_size = max_tokens - self._CHUNK_MARGIN
        if chunk_size <= 0:
            raise ValueError(f"max_tokens must be greater than {self._CHUNK_MARGIN}, got {max_tokens}")

        tokenizer = self.embedding_model.tokenizer
        # The ids include the tokenizer's special tokens; they count towards
        # the slices here and are dropped again when decoding.
        token_ids = tokenizer(document)["input_ids"]
        return [
            tokenizer.decode(token_ids[start:start + chunk_size], skip_special_tokens=True)
            for start in range(0, len(token_ids), chunk_size)
        ]

    def _embed(self, chunk) -> np.ndarray:
        """ Mean pooling

        Arguments:
            chunk: The document chunk to embed

        Returns:
            The normalised, attention-mask-weighted mean of the token
            embeddings that the pipeline produces for `chunk`
        """
        features = self.embedding_model(chunk, truncation=True, padding=True)
        token_embeddings = np.array(features)
        # Tokenize again with the same options to obtain the matching mask.
        attention_mask = self.embedding_model.tokenizer(
            chunk, truncation=True, padding=True, return_tensors="np")["attention_mask"]
        input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
        mask_totals = input_mask_expanded.sum(1)
        # Lower bound guards the division against an all-zero mask.
        sum_mask = np.clip(mask_totals, a_min=1e-9, a_max=mask_totals.max())
        embedding = normalize(sum_embeddings / sum_mask)[0]
        return embedding


class MyDataset(Dataset):
    """ Dataset to pass to `transformers.pipelines.pipeline`

    Thin wrapper that exposes an indexable collection of documents through the
    `Dataset` interface. NOTE(review): not referenced by the cells visible in
    this notebook.
    """
    def __init__(self, docs):
        """Keep a reference to `docs`; no copy is made."""
        self.docs = docs

    def __len__(self):
        """Number of wrapped documents."""
        return len(self.docs)

    def __getitem__(self, idx):
        """Return the document stored at position `idx`."""
        return self.docs[idx]

In [None]:
# Build the feature-extraction pipeline for `model_name` on the first CUDA
# device and wrap it in the chunking embedder that BERTopic will use.
pipe = pipeline('feature-extraction', model=model_name, device='cuda:0')
custom_embedder = CustomEmbedder(embedding_model=pipe)

In [None]:
# data_truncated = [d[:1400] for d in data]

In [None]:
# Whitespace-separated stop words, read from a file next to the notebook.
# The context manager closes the handle; the encoding is stated explicitly
# because the content is Hebrew (NOTE(review): UTF-8 assumed — confirm).
with open('heb_stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stop_words = stopwords_file.read().split()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
# Bag-of-words vectorizer that drops the stop words loaded from file, and the
# c-TF-IDF transformer with `reduce_frequent_words` switched on; both are
# handed to BERTopic for building the topic representations.
vectorizer_model = CountVectorizer(stop_words=stop_words)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Fit BERTopic on the extracted verdict bodies using the custom chunking
# embedder, then display the per-topic summary table.
# NOTE(review): no random seed is set anywhere visible, so topic assignments
# may differ between runs — confirm whether that matters here.
topic_model = BERTopic(embedding_model=custom_embedder, ctfidf_model=ctfidf_model, vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(data)
topic_model.get_topic_info()

In [None]:
# Sanity check on the size of `probs` returned by fit_transform
# (presumably one entry per document — TODO confirm).
len(probs)

In [None]:
# get_topics() returns a single mapping (topic id -> list of (word, score)
# pairs). Unpacking it into two names iterates its keys and raises ValueError
# unless there happen to be exactly two topics, so keep the mapping whole.
t = topic_model.get_topics()

In [None]:
# Display the first extracted document for a manual look.
data[0]

In [None]:
# Ask PyTorch to release GPU memory it has cached but is not currently using.
torch.cuda.empty_cache()

In [None]:
# Show the index of the CUDA device PyTorch currently has selected.
torch.cuda.current_device()