In [1]:
# Lookup table: party code -> lower-case keywords whose presence in a user
# query identifies that party. identify_party() walks this dict in insertion
# order and returns the first party with a hit, so the ordering of entries
# decides which party wins when a query mentions more than one.
party_keywords = {
    "bjp": ["bjp", "bharatiya janata party", "narendra", "modi"],
    "inc": ["congress", "indian national congress", "rahul", "gandhi"],
    "aap": ["aap", "aam aadmi party", "arvind", "kejriwal"],
    "sp": ["sp", "samajwadi party", "akhilesh", "yadav"],
    "bsp": ["bsp", "bahujan samaj party", "mayawati"],
    # "banerjee" is the usual spelling of the surname; the earlier
    # "bannerjee" variant is kept so anything that matched before still does.
    "aitc": ["trinamool", "aitc", "mamata", "bannerjee", "banerjee"],
    "cpim": ["cpim", "communist"],
    "dmk": ["dmk", "south"]
}

In [2]:
import re

def identify_party(user_input):
    """Return the code of the first party whose keyword appears in ``user_input``.

    Parties are tried in the iteration order of the module-level
    ``party_keywords`` dict, and each party's keywords in list order; the
    first hit wins. Matching is case-insensitive and restricted to whole
    words/phrases, so a short keyword such as "sp" no longer fires inside
    "bsp" or "speak".

    Args:
        user_input: Free-text query from the user. ``None`` or an empty
            string is treated as "no party mentioned".

    Returns:
        The matching party key from ``party_keywords``, or ``None`` when no
        keyword is found.
    """
    if not user_input:
        return None

    for party, keywords in party_keywords.items():
        for keyword in keywords:
            # Lookarounds instead of a plain substring search: the keyword
            # must not be glued to a word character on either side.
            pattern = re.compile(
                r"(?<!\w)" + re.escape(keyword) + r"(?!\w)",
                re.IGNORECASE,
            )
            if pattern.search(user_input):
                return party
    return None

In [3]:
def load_manifesto_by_party(party):
    """Read a party's manifesto from the knowledge base.

    The file is looked up at ``../knowledge_base/<party, lower-cased>_manifesto.txt``
    and decoded as UTF-8.

    Args:
        party: Party code; it is lower-cased to build the file name.

    Returns:
        The full file contents, or an empty string when the file is missing
        or cannot be read (a message is printed in both failure cases).
    """
    manifesto_file_path = f"../knowledge_base/{party.lower()}_manifesto.txt"

    # Stays empty unless the read below completes.
    contents = ""
    try:
        with open(manifesto_file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Manifesto for {party} not found.")
    except Exception as err:
        # Any other failure is reported and treated like a missing file.
        print(f"Error loading manifesto for {party}: {str(err)}")
    return contents


In [4]:
import nltk

# Tokenizer data needed by nltk.word_tokenize / nltk.sent_tokenize.
nltk.download('punkt')
# Newer NLTK releases load the 'punkt_tab' resource instead of 'punkt';
# fetching both keeps this cell working across NLTK versions.
nltk.download('punkt_tab')

def answer_query_based_on_manifesto(user_query, party_name, threshold=0.5):
    """Return the manifesto sentences that best overlap with ``user_query``.

    The manifesto is split into sentences and each sentence is scored
    against the query with the Jaccard similarity of their token sets
    (both lower-cased, so the comparison is case-insensitive). Sentences
    scoring strictly above ``threshold`` are joined with single spaces.

    Args:
        user_query: The user's question.
        party_name: Party code passed on to ``load_manifesto_by_party``.
        threshold: Minimum (exclusive) Jaccard similarity for a sentence to
            be included. Defaults to 0.5.

    Returns:
        The concatenated relevant sentences, or a fixed explanatory message
        when the manifesto could not be loaded or nothing matched.
    """
    manifesto_text = load_manifesto_by_party(party_name)

    # load_manifesto_by_party returns an empty string on failure, so test
    # for emptiness rather than for a sentinel message.
    if not manifesto_text:
        return "Manifesto for the given party not found."

    no_match_message = "I couldn't find relevant information in the manifesto for the given query."

    # Build the query token set once; it is the same for every sentence.
    query_tokens = set(nltk.word_tokenize((user_query or "").lower()))
    if not query_tokens:
        return no_match_message

    relevant_sentences = []
    for sentence in nltk.sent_tokenize(manifesto_text):
        # Lower-case the sentence as well, otherwise capitalised manifesto
        # words never match the lower-cased query tokens.
        sentence_tokens = set(nltk.word_tokenize(sentence.lower()))

        # Jaccard similarity; the union is non-empty because query_tokens is.
        overlap = query_tokens & sentence_tokens
        combined = query_tokens | sentence_tokens
        similarity = len(overlap) / len(combined)

        if similarity > threshold:
            relevant_sentences.append(sentence)

    if not relevant_sentences:
        return no_match_message

    return " ".join(relevant_sentences)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\debac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

import os

# Directory scanned for manifesto text files.
manifesto_directory = "../knowledge_base/"

# Maps party name (file stem with "_manifesto" removed) -> manifesto text.
party_manifestos = {}

# os.listdir() yields entries in arbitrary order; sorting keeps the dict's
# insertion order, and therefore the dataset's index -> party mapping,
# identical from run to run.
for filename in sorted(os.listdir(manifesto_directory)):
    if filename.endswith(".txt"):
        party_name = os.path.splitext(filename)[0].replace("_manifesto", "")
        with open(os.path.join(manifesto_directory, filename), "r", encoding='utf_8') as file:
            manifesto_text = file.read()
        party_manifestos[party_name] = manifesto_text

# Tokenizer handed to PartyManifestoDataset below.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

class PartyManifestoDataset(torch.utils.data.Dataset):
    """Dataset with one item per party manifesto.

    Each item is ``(inputs, labels, party)``: the same manifesto text encoded
    twice by the tokenizer, once capped at 512 tokens (inputs) and once at
    128 tokens (labels), plus the party key.
    """

    # Token budgets for the two encodings of each manifesto.
    _INPUT_MAX_LENGTH = 512
    _LABEL_MAX_LENGTH = 128

    def __init__(self, party_manifestos, tokenizer):
        """Store the ``{party: manifesto text}`` mapping and the tokenizer."""
        self.party_manifestos = party_manifestos
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of manifestos in the mapping."""
        return len(self.party_manifestos)

    def _encode(self, text, max_length):
        """Encode ``text`` padded/truncated to exactly ``max_length`` tokens."""
        # NOTE(review): with return_tensors="pt" the tokenizer presumably
        # returns tensors that already carry a leading batch axis - confirm
        # that downstream batching expects that.
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``(inputs, labels, party)`` for the ``idx``-th party key."""
        # Index into the mapping's keys in their current iteration order.
        party = list(self.party_manifestos)[idx]
        text = self.party_manifestos[party]
        inputs = self._encode(text, self._INPUT_MAX_LENGTH)
        labels = self._encode(text, self._LABEL_MAX_LENGTH)
        return inputs, labels, party

# Wrap the loaded manifestos in a Dataset and iterate them one at a time,
# in dataset order (batch_size=1, shuffle=False).
dataset = PartyManifestoDataset(party_manifestos, tokenizer)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

# Load the pretrained "t5-small" checkpoint and place it on the CPU.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cpu")
# Last expression of the cell: the notebook displays the value returned by .to().
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the language model and tokenizer from the same checkpoint name so the
# two stay matched. This rebinds the notebook-level names `tokenizer` and
# `model`, which the previous cell also assigns (there through the
# T5-specific classes).
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def chat_with_party_manifestos(user_query):
    """Answer ``user_query`` with text generated from the matching party manifesto.

    The party is detected with ``identify_party``, its manifesto is loaded
    with ``load_manifesto_by_party``, and the module-level ``tokenizer`` and
    ``model`` generate a reply by top-k sampling.

    Args:
        user_query: The user's message; it must mention a known party.

    Returns:
        A string prefixed with ``"Chatbot: "``: the generated reply, a
        request to name a party when none is detected, or an apology when
        the manifesto cannot be loaded.
    """
    no_party_reply = "Chatbot: Please specify which party you are referring to."

    if not user_query:
        return no_party_reply

    # Determine the party mentioned in the user's query.
    party = identify_party(user_query)
    if not party:
        return no_party_reply

    # Nothing is remembered between calls, so the manifesto is loaded each time.
    manifesto_text = load_manifesto_by_party(party)
    if not manifesto_text:
        return "Chatbot: Sorry, I couldn't load the manifesto for that party."

    # Put the question before the manifesto: the encoder input is truncated
    # to 512 tokens, so with the manifesto first a long document pushes the
    # question out of the window. The "question: ... context: ..." layout
    # follows the question-answering prompt convention described for T5.
    input_prompt = f"question: {user_query} context: {manifesto_text}"
    encoder_inputs = tokenizer.encode_plus(
        input_prompt,
        add_special_tokens=True,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Number of highest-probability tokens considered at each sampling step.
    top_k = 50

    # Top-k sampling. `early_stopping` is a beam-search option and is not
    # passed because this call does not use beams.
    outputs = model.generate(
        input_ids=encoder_inputs["input_ids"],
        attention_mask=encoder_inputs["attention_mask"],
        max_length=128,
        do_sample=True,
        top_k=top_k,
    )

    # Decode the first (only) generated sequence.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return f"Chatbot: {response}"
