# In [None]:
import os
import string
import re
import torch
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BartForConditionalGeneration, BartTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the text data
def preprocess_text(text):
    """Normalise raw text before it is handed to the summarisation model.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    removed, and every run of digits is replaced by the literal token
    ``NUM``.

    Earlier revisions also tokenised, stemmed and lemmatised the input, but
    those results were never used in the return value; that dead work has
    been removed. The returned string is unchanged.

    Args:
        text: The raw input string.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Punctuation is stripped before the digit substitution, so a value such
    # as "3.5" first becomes "35" and then collapses into a single NUM token.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\d+', 'NUM', text)

# Load the BART tokenizer and model used for summarization; both come from
# the same checkpoint, so the identifier is spelled out only once.
_BART_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(_BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_BART_CHECKPOINT)

# Directory containing your legal case documents
directory = r"C:\Users\91938\Desktop\dataset\Object_casedocs"

# Parallel lists: texts[i] is the preprocessed content of the i-th document
# and labels[i] is its label.
texts = []
labels = []

# Visit the .txt files in sorted order: os.listdir returns entries in
# arbitrary order, which would make document indices differ between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue

    filepath = os.path.join(directory, filename)

    # Decode explicitly instead of relying on the platform default encoding.
    # errors='replace' keeps one stray byte from aborting the whole load.
    # NOTE(review): assumes the case documents are UTF-8 - confirm against
    # the dataset.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
        data = file.read()

    preprocessed_text = preprocess_text(data)
    texts.append(preprocessed_text)

    # Placeholder label (0 for non-relevant); define real labels as needed.
    labels.append(0)

# Summarise every document and the user's query with BART.
# No fine-tuning happens here; the model is only used for inference.
def _summarize(text, max_input_tokens=128, max_summary_tokens=150):
    """Return a BART-generated summary of ``text``.

    Args:
        text: The (already preprocessed) text to summarise.
        max_input_tokens: Inputs longer than this many tokens are truncated.
        max_summary_tokens: Upper bound on the generated sequence length.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # NOTE(review): with the default limit only the first 128 tokens of a
    # document reach the model, which is shorter than the 150-token summary
    # budget - consider raising max_input_tokens.
    # A single sequence needs no padding, so none is requested; the attention
    # mask is still passed explicitly rather than left for generate() to infer.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_input_tokens,
        return_tensors='pt',
    )
    with torch.no_grad():
        generated = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_summary_tokens,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(generated[0], skip_special_tokens=True)


model.eval()

# Despite its name, this list holds one summary *string* per document; the
# name is kept because the ranking code further down refers to it.
document_embeddings = [_summarize(text) for text in texts]

# Prompt the user to enter a query
query_text = input("Enter your query: ")

# The query goes through the same preprocessing and summarisation path as
# the documents so that both sides are compared in the same form.
preprocessed_query = preprocess_text(query_text)
query_summary = _summarize(preprocessed_query)

# Compute cosine similarity between the query summary and document summaries.
def _embed_summaries(summaries, batch_size=8, max_tokens=256):
    """Turn summary strings into fixed-size vectors.

    Each summary is run through the BART encoder and its token states are
    averaged (padding positions excluded), so every summary yields a vector
    of the same width regardless of its length. Raw token-id lists cannot be
    used for this: they differ in length from summary to summary and token
    ids carry no geometric meaning.

    Args:
        summaries: Non-empty list of strings to embed.
        batch_size: Number of summaries encoded per forward pass.
        max_tokens: Summaries longer than this many tokens are truncated.

    Returns:
        A 2-D NumPy array with one row per summary.
    """
    encoder = model.get_encoder()
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(summaries), batch_size):
            batch = tokenizer(
                summaries[start:start + batch_size],
                truncation=True,
                padding=True,
                max_length=max_tokens,
                return_tensors='pt',
            ).to(model.device)
            hidden = encoder(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            ).last_hidden_state
            # Zero out padding positions before averaging over the sequence.
            mask = batch["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(pooled.float().cpu())
    return torch.cat(pooled_batches).numpy()


# Rank and retrieve top similar/relevant documents
top_k = 5  # Number of top documents to retrieve

if document_embeddings:
    similarity_scores = cosine_similarity(
        _embed_summaries([query_summary]),
        _embed_summaries(document_embeddings),
    )
    # argsort is ascending: keep the last top_k entries and reverse them so
    # the best match comes first. Fewer than top_k documents is fine.
    top_documents_indices = similarity_scores.argsort()[0][-top_k:][::-1]
else:
    # No documents were loaded, so there is nothing to rank.
    similarity_scores = [[]]
    top_documents_indices = []

# Print top similar documents
print("Top similar documents:")
for idx in top_documents_indices:
    print(f"Similarity Score: {similarity_scores[0][idx]}, Document Summary: {document_embeddings[idx]}")

