In [1]:
app_py_content = """
from flask import Flask, request, jsonify
import os
import string
import re
import torch
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BertModel, BertTokenizer, BartForConditionalGeneration, BartTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from preprocess import preprocess_text

app = Flask(__name__)

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the BART model and tokenizer for summarization
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Directory containing your legal case documents
directory = r"C:\\Users\\91938\\Desktop\\dataset\\Object_casedocs"

# List to store preprocessed texts and labels
texts = []
document_embeddings = []

# Loop through each file in the directory and preprocess
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            data = file.read()
            preprocessed_text = preprocess_text(data)
            texts.append(preprocessed_text)
            encoded_text = tokenizer(preprocessed_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**encoded_text)
                pooled_output = outputs.pooler_output
                document_embeddings.append(pooled_output.squeeze().cpu().detach().numpy())

@app.route('/query', methods=['POST'])
def handle_query():
    query_text = request.json.get('query')
    if not query_text:
        return jsonify({'error': 'Query text is required.'}), 400

    preprocessed_query = preprocess_text(query_text)
    encoded_query = tokenizer(preprocessed_query, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**encoded_query)
        query_embedding = outputs.pooler_output.cpu().detach().numpy()

    similarity_scores = cosine_similarity(query_embedding, document_embeddings)
    top_k = 5
    top_documents_indices = similarity_scores.argsort()[0][-top_k:][::-1]

    similar_documents = []
    for idx in top_documents_indices:
        document = texts[idx]
        inputs = bart_tokenizer([document], max_length=1024, return_tensors='pt', truncation=True)
        summary_ids = bart_model.generate(inputs['input_ids'], max_length=150, num_beams=4, early_stopping=True)
        summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        similarity_score = similarity_scores[0][idx]
        similar_documents.append({'similarity_score': similarity_score, 'summary': summary})

    return jsonify({'similar_documents': similar_documents})

if __name__ == '__main__':
    app.run(debug=True)
"""

with open('app.py', 'w') as file:
    file.write(app_py_content)

preprocess_py_content = """
import re
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\\d+', 'NUM', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_words = [stemmer.stem(word) for word in tokens]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(stemmed_words)
"""

with open('preprocess.py', 'w') as file:
    file.write(preprocess_py_content)


In [2]:
%run app.py

 * Serving Flask app 'app'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1