In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
import re
import os

## Get Data

In [None]:
# psak = r'פ\s*ס\s*ק\s*-*\s*ד\s*י\s*ן'
# Heading "פסק-דין": optional whitespace between the letters, an optional run
# of hyphens between the two words, then trailing whitespace ending in a newline.
psak = r'פ\s*ס\s*ק\s*-*\s*ד\s*י\s*ן\s*\n'

# The patterns are compiled once when the cell runs instead of on every call.
# 1) Text between the first heading and the last "ניתן היום" sign-off.
_HEADING_TO_SIGNOFF_RE = re.compile(fr'{psak}([\s\S]*)[\s-]*ניתן[\s-]*היום', re.DOTALL)
# 2) Text after the last heading (the greedy `.*` skips to the last occurrence).
_AFTER_LAST_HEADING_RE = re.compile(fr'.*{psak}([\s\S]*)', re.DOTALL)
# 3) Text from the start of the document up to the last sign-off.
_BEFORE_SIGNOFF_RE = re.compile(r'^([\s\S]*)(ניתן[\s-]*היום)', re.DOTALL)


def extract_middle_content(text):
    """Return the body of a verdict, stripped of its header and sign-off.

    The extraction strategies are tried in order and the first one whose
    pattern matches decides the result:

    1. text between the first "פסק-דין" heading and the last "ניתן היום";
    2. text after the last "פסק-דין" heading;
    3. text before the last "ניתן היום".

    Args:
        text: The full document text, or ``None``.

    Returns:
        ``""`` when ``text`` is ``None``; the stripped extracted span when a
        strategy matches and the span is non-empty; otherwise ``text`` itself
        (no strategy matched, the matched span was empty, or ``text`` is not a
        string).
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # Non-string values (e.g. NaN) cannot be searched; hand them back
        # unchanged, which is what the previous catch-all handler did.
        return text

    attempts = (
        _HEADING_TO_SIGNOFF_RE.search,
        # Patterns 2 and 3 can only ever match from offset 0 (leading `.*` with
        # DOTALL / leading `^`), so `match` gives the same result as `search`
        # without re-scanning from every start position when nothing matches.
        _AFTER_LAST_HEADING_RE.match,
        _BEFORE_SIGNOFF_RE.match,
    )
    for attempt in attempts:
        found = attempt(text)
        if found is None:
            continue
        extracted_text = found.group(1).strip()
        # An empty span means the markers wrapped nothing useful: keep the
        # whole document rather than returning an empty string.
        return extracted_text if extracted_text else text

    # Neither marker is present: the whole document is the content.
    return text

In [None]:
# Years captured from d/m/y and d.m.y dates; the year may have 2 or 4 digits.
_DATE_YEAR_PATTERNS = (
    re.compile(r'\b\d{1,2}/\d{1,2}/(\d{2}|\d{4})\b'),
    re.compile(r'\b\d{1,2}\.\d{1,2}\.(\d{2}|\d{4})\b'),
)
# Fallback: any stand-alone 4-digit number.
_BARE_YEAR_RE = re.compile(r'\b(\d{4})\b')


def find_years(text, century_pivot=23):
    """Collect the years mentioned in ``text`` as strings.

    Years are taken from dates written as ``d/m/y`` or ``d.m.y``. Two-digit
    years are expanded to four digits: values up to ``century_pivot`` become
    ``20yy``, larger ones ``19yy``. Only when no such date is found does the
    function fall back to every stand-alone 4-digit number in the text.

    Args:
        text: Document text. Non-string values yield an empty list.
        century_pivot: Largest two-digit year still read as 20yy (default 23,
            the value that used to be hard-coded).

    Returns:
        A list of year strings: all slash-dates first, then all dot-dates, each
        group in order of appearance; may contain duplicates.
    """
    if not isinstance(text, str):
        return []

    years = []
    for pattern in _DATE_YEAR_PATTERNS:
        for year in pattern.findall(text):
            if len(year) == 2:
                century = '20' if int(year) <= century_pivot else '19'
                year = century + year
            years.append(year)

    if not years:
        years.extend(_BARE_YEAR_RE.findall(text))
    return years


def max_year(years):
    """Return the latest plausible year from ``years`` as a string.

    Args:
        years: Iterable of year strings (empty strings are ignored).

    Returns:
        The largest year within 1900-2023 as a ``str``, or ``''`` when no
        entry falls in that range. The result is always a string, so a column
        built from it has a single, consistent type.
    """
    plausible = [int(year) for year in years if year != '' and 1900 <= int(year) <= 2023]
    if not plausible:
        # Previously this branch returned the tuple ('', 0), unlike the string
        # returned on the success path.
        return ''
    return str(max(plausible))


def run_years(df):
    """Add a ``max_year`` column to ``df`` in place.

    For every row, the years found in the ``text`` column by ``find_years``
    are reduced to a single value by ``max_year``.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        None.
    """
    # Element-wise Series.apply replaces the row-wise DataFrame.apply(axis=1):
    # it avoids building a Series per row, also handles an empty frame, and
    # no longer needs (or clobbers) a temporary 'years' column.
    df['max_year'] = df['text'].apply(find_years).apply(max_year)

In [None]:
# Read every verdict file in the corpus directory into one DataFrame row each.
directory_path = "/mnt/local/mikehash/Data/Nevo/NevoVerdicts"
data = []

# NOTE(review): documents are kept in os.listdir order; later cells pass the
# texts to the loaded topic model, which presumably expects the same order and
# count as at fit time — confirm before reordering or filtering here.
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    try:
        # open() sits inside the try so that sub-directories and unreadable
        # entries are reported and skipped instead of aborting the whole load.
        # The encoding is explicit because the locale default is
        # platform-dependent. NOTE(review): UTF-8 is assumed — confirm.
        with open(file_path, 'r', encoding='utf-8') as file:
            data.append(file.read())
    except (OSError, UnicodeDecodeError):
        # Print the path of every file that could not be read.
        print(file_path)
df = pd.DataFrame(data, columns=['text'])
df['extracted_content'] = df['text'].apply(extract_middle_content)
run_years(df)

In [None]:
# From here on `data` holds the extracted verdict bodies (one string per
# document) rather than the raw file contents it held in the loading cell.
data = df['extracted_content'].tolist()

## Topic Model

In [None]:
# Load a previously saved BERTopic model from Results/<model_name>/.
model_name = "avichr_Legal-heBERT_customTruncatingEmbedder_ctfidfTrue_vectorTrue_minsize20"
topic_model = BERTopic.load(f"Results/{model_name}/")
# Number of rows in the topic-info table, then its first 20 rows as cell output.
print(len(topic_model.get_topic_info()))
topic_model.get_topic_info().head(20)

In [None]:
# Disabled: export of each topic's "prob*word" terms to an Excel sheet.
# NOTE(review): the loop below grows the frame with DataFrame.append, which
# newer pandas releases no longer provide — collect the rows in a list and
# build the DataFrame once if this cell is re-enabled.
# topics_dict = topic_model.get_topics()
# df_topic = pd.DataFrame(columns=['Topic', 'Words'])

# # Iterate through the dictionary and concatenate the values
# for key, value in topics_dict.items():
#     topic = key
#     words_probs = [f"{round(prob, 4)}*{word}" for word, prob in value]
#     concatenated = '+'.join(words_probs)
#     df_topic = df_topic.append({'Topic': topic, 'Words': concatenated}, ignore_index=True)

# df_topic.to_excel("BERTopicTopicsProbs.xlsx")

## Analyze Model

In [None]:
# Build the model's topic visualisation, save it as a PNG under Plots/ and
# show it inline.
# NOTE(review): write_image presumably requires the Plots/ directory to exist
# and an image-export backend to be installed — confirm before "Run All".
fig = topic_model.visualize_topics()
fig.write_image('Plots/BERTopicIDM.png')
fig.show()

In [None]:
# Document-level visualisation over the extracted verdict texts in `data`.
# NOTE(review): presumably `data` must match the documents the model was
# fitted on (same order and count) — confirm.
topic_model.visualize_documents(data)

In [None]:
# Bar charts of the top 9 words for 12 topics; saved as a PNG under Plots/
# and shown inline.
# NOTE(review): write_image presumably requires the Plots/ directory to exist
# and an image-export backend to be installed — confirm.
fig = topic_model.visualize_barchart(n_words=9, top_n_topics=12)
fig.write_image('Plots/BERTopicBarchart.png')
fig.show()

In [None]:
# Same call and arguments as the bar-chart cell that writes
# Plots/BERTopicBarchart.png; here the figure is only the cell's output.
topic_model.visualize_barchart(n_words=9, top_n_topics=12)

In [None]:
# Term-rank plot for the model's topics, drawn with log_scale=True.
topic_model.visualize_term_rank(log_scale=True)

In [None]:
# Topic representation per timestamp; each document's timestamp is its
# `max_year` value computed by run_years.
# NOTE(review): documents with no detected year carry whatever max_year
# returns for that case, and it is passed on as a timestamp — confirm that is
# intended, or filter those rows out first.
topics_over_time = topic_model.topics_over_time(data, df['max_year'].values.tolist())

In [None]:
# Inspect the over-time rows of a single topic (Topic == 3).
topics_over_time[topics_over_time['Topic'] == 3]

In [None]:
# Plot the topics-over-time table, save it as a PNG under Plots/ and show it.
# NOTE(review): write_image presumably requires the Plots/ directory to exist
# and an image-export backend to be installed — confirm.
fig = topic_model.visualize_topics_over_time(topics_over_time)
fig.write_image('Plots/BERTopicDTM.png')
fig.show()

In [None]:
# Compute the topic hierarchy from the extracted texts; the result is reused
# by the tree and hierarchy-plot cells.
hierarchical_topics = topic_model.hierarchical_topics(data)

In [None]:
# Build the topic tree from the hierarchy and print it.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Plot the topic hierarchy, save it as a PNG under Plots/ and show it inline.
# NOTE(review): write_image presumably requires the Plots/ directory to exist
# and an image-export backend to be installed — confirm.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.write_image('Plots/BERTopicHC.png')
fig.show()