In [1]:
# Attach Google Drive to the Colab VM; the dataset path used later lives under this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [4]:
# Law Assistance Bot using LSTM Seq2Seq Model

import json
import numpy as np
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import streamlit as st

In [5]:
# Load dataset: a JSON file of records read from the mounted Drive folder.
dataset_file = "/content/drive/MyDrive/Colab Notebooks/GEN_AI_Fine_Tunning_llms/ipc_law_assistant_1000_entries.json"
# Pass the encoding explicitly so decoding does not depend on the runtime's
# locale default (JSON text is UTF-8).
with open(dataset_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Build the two parallel training lists from the loaded records:
#   inputs  -> lower-cased crime descriptions
#   outputs -> the matching IPC section strings, untouched
inputs = list(map(lambda record: record['crime_description'].lower(), data))
outputs = list(map(lambda record: record['ipc_section'], data))

In [7]:
# Preprocess outputs to format as a sequence: wrap each target in start/end tokens.
# The list is rebuilt from `data` (not from `outputs` itself) so that re-running
# this cell cannot stack a second "<s> ... </s>" pair onto already-wrapped strings.
outputs = ["<s> " + entry['ipc_section'] + " </s>" for entry in data]

In [8]:
# Encoder-side text pipeline: fit a word index on the descriptions, map every
# description to a list of word ids, then pad each list at the end up to the
# length of the longest one.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(inputs)
input_sequences = input_tokenizer.texts_to_sequences(inputs)
input_max_len = max(map(len, input_sequences))
encoder_input_data = pad_sequences(
    input_sequences,
    maxlen=input_max_len,
    padding='post',
)

In [9]:
# Decoder-side text pipeline. filters='' turns off the tokenizer's character
# filtering so the "<s>" and "</s>" markers survive as vocabulary entries.
output_tokenizer = Tokenizer(filters='')
output_tokenizer.fit_on_texts(outputs)
output_token_ids = output_tokenizer.texts_to_sequences(outputs)
output_max_len = max(map(len, output_token_ids))
# Pad at the end so every target has the same length.
output_sequences = pad_sequences(
    output_token_ids,
    maxlen=output_max_len,
    padding='post',
)

In [10]:
# The decoder is fed every token except the last and is trained to emit every
# token except the first, i.e. the same sequence shifted by one position.
decoder_input_data, decoder_target_data = output_sequences[:, :-1], output_sequences[:, 1:]

# Vocabulary sizes: one more than the number of indexed words on each side.
vocab_size_input = 1 + len(input_tokenizer.word_index)
vocab_size_output = 1 + len(output_tokenizer.word_index)

In [11]:
# One-hot encode the decoder targets into (samples, steps, vocab).
# Only non-zero token ids are written, so padding positions (id 0) stay as
# all-zero vectors.
decoder_target_data_cat = np.zeros(
    (len(inputs), output_max_len - 1, vocab_size_output),
    dtype='float32',
)
sample_idx, step_idx = np.nonzero(decoder_target_data)
decoder_target_data_cat[sample_idx, step_idx, decoder_target_data[sample_idx, step_idx]] = 1

In [12]:
# Model parameters
# latent_dim is used below as both the Embedding output size and the LSTM unit
# count, for the encoder and the decoder alike.
latent_dim = 256

In [13]:
# Encoder: embed the padded token ids and run them through an LSTM. Only the
# final hidden and cell states are kept; they initialise the decoder LSTM.
encoder_inputs = Input(shape=(input_max_len,))
enc_emb = Embedding(input_dim=vocab_size_input, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
encoder_lstm_result = encoder_lstm(enc_emb)
state_h, state_c = encoder_lstm_result[1], encoder_lstm_result[2]
encoder_states = [state_h, state_c]

In [14]:
# Decoder
# The time dimension is left unspecified (None) instead of being fixed to
# output_max_len - 1: training batches of that length still fit, and the same
# input can also be fed one token at a time during step-by-step inference,
# where the arrays have shape (1, 1).
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_size_output, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
# return_sequences=True yields one output per time step (needed for the
# per-token softmax); the LSTM starts from the encoder's final states.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Per-step probability distribution over the output vocabulary.
decoder_dense = Dense(vocab_size_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [15]:
# Assemble the trainable encoder-decoder graph, configure its optimiser, loss
# and metric, then print the layer table.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [16]:
# Train the model
# - restore_best_weights=True: when early stopping fires, roll the model back to
#   the epoch with the lowest val_loss instead of keeping the weights from the
#   last (up to `patience` epochs worse) epoch, since the model is saved next.
# - The History object is kept in `history` so the curves can be inspected
#   later instead of only being echoed as a repr.
history = model.fit(
    [encoder_input_data, decoder_input_data], decoder_target_data_cat,
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
)

Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 230ms/step - accuracy: 0.1935 - loss: 1.3463 - val_accuracy: 0.2500 - val_loss: 0.9994
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 138ms/step - accuracy: 0.2500 - loss: 0.9621 - val_accuracy: 0.2500 - val_loss: 0.7369
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 136ms/step - accuracy: 0.2585 - loss: 0.7022 - val_accuracy: 0.2663 - val_loss: 0.6292
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 140ms/step - accuracy: 0.2711 - loss: 0.6414 - val_accuracy: 0.2663 - val_loss: 0.6149
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 138ms/step - accuracy: 0.2723 - loss: 0.6221 - val_accuracy: 0.2700 - val_loss: 0.6156
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 189ms/step - accuracy: 0.2821 - loss: 0.6173 - val_accuracy: 0.2663 - val_loss: 0.6145
Epoch 7/20
[1m13/13[0m [3

<keras.src.callbacks.history.History at 0x780de5608d90>

In [17]:
# Persist the trained network, plus what inference needs to rebuild its inputs:
# both tokenizers and the two padded sequence lengths, pickled as one tuple.
model.save('law_bot_seq2seq.h5')

tokenizer_bundle = (input_tokenizer, output_tokenizer, input_max_len, output_max_len)
with open('tokenizers.pkl', 'wb') as bundle_file:
    pickle.dump(tokenizer_bundle, bundle_file)

print("✅ Law Assistance Bot trained and saved successfully!")



✅ Law Assistance Bot trained and saved successfully!


In [18]:
# ------------------------------
# Inference Code
# ------------------------------

# Load model and tokenizers
def load_model_and_tokenizers():
    """Reload the saved model and the pickled tokenizer bundle from the working directory.

    Reads 'law_bot_seq2seq.h5' and 'tokenizers.pkl'.

    Returns:
        tuple: (model, input_tokenizer, output_tokenizer, input_max_len, output_max_len)
    """
    restored_model = load_model('law_bot_seq2seq.h5')
    with open('tokenizers.pkl', 'rb') as bundle_file:
        tokenizer_bundle = pickle.load(bundle_file)
    # The bundle is the 4-tuple written by the save step.
    in_tok, out_tok, in_len, out_len = tokenizer_bundle
    return restored_model, in_tok, out_tok, in_len, out_len

In [19]:
# Decode sequence
def decode_sequence(input_seq, model, input_tokenizer, output_tokenizer, input_max_len, output_max_len):
    """Greedily decode the IPC section text predicted for one crime description.

    The trained encoder-decoder is split into two inference models: an encoder
    that maps the padded description to LSTM states, and a single-step decoder
    that is fed one token at a time together with the previous states.

    Args:
        input_seq: Raw crime description text; it is lower-cased before tokenizing.
        model: Trained seq2seq model. ``model.input[0]`` is used as the encoder
            input and ``model.layers[4]`` is taken to be the encoder LSTM (all
            of its outputs after the first are used as the states).
            NOTE(review): the index 4 depends on the layer order of the built
            model -- confirm against ``model.summary()``.
        input_tokenizer: Tokenizer used to turn ``input_seq`` into word ids.
        output_tokenizer: Tokenizer whose vocabulary contains ``<s>`` and ``</s>``.
        input_max_len: Length the encoder input is padded to.
        output_max_len: Upper bound on the number of decoding steps.

    Returns:
        str: The decoded tokens joined by single spaces, without the start/end
        markers. Empty when the first predicted token is ``</s>``.

    Note:
        The decoder layers (``dec_emb_layer``, ``decoder_lstm``,
        ``decoder_dense``) and ``latent_dim`` are read from the notebook's
        global scope, so the cells that build the training model must have been
        run in the same session.
    """
    token_ids = input_tokenizer.texts_to_sequences([input_seq.lower()])
    padded = pad_sequences(token_ids, maxlen=input_max_len, padding='post')

    # Encoder inference model: description -> [state_h, state_c].
    encoder_model = Model(model.input[0], model.layers[4].output[1:])

    # Single-step decoder. A fresh variable-length Input is used for the token
    # stream because decoding feeds arrays of shape (1, 1), which does not match
    # a training input that was declared with a fixed sequence length.
    decoder_token_input = Input(shape=(None,))
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    dec_emb2 = dec_emb_layer(decoder_token_input)
    decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
    decoder_outputs2 = decoder_dense(decoder_outputs2)
    decoder_model = Model(
        [decoder_token_input] + decoder_states_inputs,
        [decoder_outputs2, state_h2, state_c2],
    )

    # Normalise to a list so the concatenation below works whether predict()
    # hands the two states back as a list or as a tuple. verbose=0 keeps the
    # per-step progress bars out of the output.
    states_value = list(encoder_model.predict(padded, verbose=0))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = output_tokenizer.word_index['<s>']

    decoded_words = []
    # Bound the loop by step count rather than by the number of words produced:
    # a predicted padding id (0) maps to an empty word, which would never
    # advance a word-count based limit and could loop forever.
    for _ in range(output_max_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = output_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '</s>':
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

In [20]:
# ------------------------------
# Streamlit UI
# ------------------------------

def run_streamlit():
    """Render the Streamlit page: a description box, a predict button, and the result.

    When the button is pressed with a non-empty description, the saved model
    and tokenizers are loaded via ``load_model_and_tokenizers`` and the decoded
    prediction from ``decode_sequence`` is shown. Empty or whitespace-only
    input produces a warning instead of running the model.
    """
    st.title("📜 Law Assistance Bot")
    st.markdown("Enter a crime description and get the predicted IPC section(s).")

    input_text = st.text_area("Crime Description")
    if st.button("Predict IPC Section"):
        # Nothing to tokenize: skip the model load and decoding entirely.
        if not (input_text or "").strip():
            st.warning("Please enter a crime description first.")
            return
        model, input_tok, output_tok, max_input, max_output = load_model_and_tokenizers()
        result = decode_sequence(input_text, model, input_tok, output_tok, max_input, max_output)
        st.success(f"Predicted IPC Section(s): {result}")

# Entry point. When this guard fires inside the notebook kernel rather than
# under `streamlit run <script>`, Streamlit logs a warning that session state
# does not function without `streamlit run`.
if __name__ == '__main__':
    run_streamlit()

2025-04-11 15:16:51.680 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-11 15:16:51.693 Session state does not function when running a script without `streamlit run`
