In [4]:
import os
import string
import re
import torch
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', 'NUM', text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_words = [stemmer.stem(word) for word in tokens]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return text

# Define your dataset class
class LegalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming binary classification

# Directory containing your legal case documents
directory = r"C:\Users\91938\Desktop\dataset\Object_casedocs"

# List to store preprocessed texts and labels
texts = []
labels = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        # Construct the full path to the file
        filepath = os.path.join(directory, filename)
        
        # Open and read the file
        with open(filepath, 'r') as file:
            # Read and preprocess the file contents
            data = file.read()
            preprocessed_text = preprocess_text(data)
            
            # Append preprocessed text to the list
            texts.append(preprocessed_text)
            
            # Append label (you need to define how you determine labels for each text sample)
            labels.append(0)  # Example label (0 for non-relevant, adjust as needed)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create datasets for training and validation
train_dataset = LegalDataset(train_texts, train_labels, tokenizer, max_length=128)  # Adjust max_length as needed
val_dataset = LegalDataset(val_texts, val_labels, tokenizer, max_length=128)

# Define batch size and create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tune the BERT model
num_epochs = 3  # Adjust as needed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Avg Loss: {avg_loss}')

# After fine-tuning, you can use the model to provide similar and relevant data based on user queries
# For example, given a query text, encode it using the BERT tokenizer and compute similarity with other documents

# Example query text
query_text = "Legal case about contract disputes"

# Preprocess


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Avg Loss: 0.030186341253782254
Epoch 2/3, Avg Loss: 0.00045935776142869145
Epoch 3/3, Avg Loss: 0.000202812615136954


In [2]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.4.2-cp311-cp311-win_amd64.whl (10.6 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.4.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: C:\Users\91938\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [1]:
pip list

Package                  VersionNote: you may need to restart the kernel to use updated packages.

------------------------ ---------
anyio                    3.7.0
argon2-cffi              21.3.0
argon2-cffi-bindings     21.2.0
arrow                    1.2.3
asttokens                2.2.1
async-lru                2.0.2
attrs                    23.1.0
Babel                    2.12.1
backcall                 0.2.0
beautifulsoup4           4.12.2
bleach                   6.0.0
certifi                  2023.5.7
cffi                     1.15.1
charset-normalizer       3.1.0
click                    8.1.7
colorama                 0.4.6
comm                     0.1.3
debugpy                  1.6.7
decorator                5.1.1
defusedxml               0.7.1
dronekit                 2.9.2
dronekit-sitl            3.3.0
executing                1.2.0
fastjsonschema           2.17.1
filelock                 3.13.4
fqdn                     1.5.1
fsspec                   2024.3.1
future         

In [2]:
import os
import string
import re
import torch
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the text data
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', 'NUM', text)
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_words = [stemmer.stem(word) for word in tokens]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    return text

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Directory containing your legal case documents
directory = r"C:\Users\91938\Desktop\dataset\Object_casedocs"

# List to store preprocessed texts and labels
texts = []
labels = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith('.txt'):
        # Construct the full path to the file
        filepath = os.path.join(directory, filename)
        
        # Open and read the file
        with open(filepath, 'r') as file:
            # Read and preprocess the file contents
            data = file.read()
            preprocessed_text = preprocess_text(data)
            
            # Append preprocessed text to the list
            texts.append(preprocessed_text)
            
            # Append label (you need to define how you determine labels for each text sample)
            labels.append(0)  # Example label (0 for non-relevant, adjust as needed)

# Fine-tune the BERT model to obtain document embeddings
document_embeddings = []  # List to store document embeddings

model.eval()

for text in texts:
    # Tokenize and process each text to obtain document embeddings
    encoded_text = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**encoded_text)
        pooled_output = outputs.pooler_output
        
        # Append the document embeddings to the list
        document_embeddings.append(pooled_output.squeeze().cpu().detach().numpy())

# Prompt the user to enter a query
query_text = input("Enter your query: ")

# Preprocess and tokenize the query
preprocessed_query = preprocess_text(query_text)
encoded_query = tokenizer(preprocessed_query, truncation=True, padding='max_length', max_length=128, return_tensors='pt')

# Process the query with the fine-tuned BERT model to obtain query embedding
with torch.no_grad():
    outputs = model(**encoded_query)
    query_embedding = outputs.pooler_output.cpu().detach().numpy()  # Get the query embedding

# Compute cosine similarity between query and document embeddings
similarity_scores = cosine_similarity(query_embedding, document_embeddings)

# Rank and retrieve top similar/relevant documents
top_k = 5  # Number of top documents to retrieve
top_documents_indices = similarity_scores.argsort()[0][-top_k:][::-1]

# Print top similar documents
print("Top similar documents:")
for idx in top_documents_indices:
    print(f"Similarity Score: {similarity_scores[0][idx]}, Document: {texts[idx]}")


Enter your query: kidnap\
Top similar documents:
Similarity Score: 0.7871500253677368, Document: narendra singh  anr v state of mp
supreme court of india

NUM april NUM
appeal crl NUM of NUM
the judgment was delivered by  s b sinha j
NUM  the appellant no NUM herein by reason of the impugned judgment reversing a judgment of acquittal passed by learned sessions judge dhar on NUM was found guilty of commission of an offence under section NUM of the indian penal code for having committed murder of bimlabai by throttling on NUM at about NUM pm at dhanmandi dhar at house no NUM dhanmandi dhar as also under section NUM of indian penal code for causing disappearance of evidence by setting her on fire after causing her death whereas the appellant no NUM was found guilty of commission of an offence under section NUM of the indian penal code
NUM  the relationship between the appellants herein are son and mother along with them the husband of appellant no NUM hari singh and their daughter kusum w