In [59]:
!pip install transformers
!pip install wikipedia-api
!pip install chromadb
!pip install gradio




In [60]:
import wikipediaapi

# Module-level Wikipedia API client shared by the scraping helpers
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='cmbhatt99@gmail.com',
    language='en',
)

def scrape_page(page_name):
    """
    Fetch a Wikipedia page by title and return its full text on a single line.

    Args:
        page_name: Title of the page to look up through the shared
            ``wiki_wiki`` client.

    Returns:
        The page text with every run of whitespace (newlines included)
        collapsed to a single space, or None if the page does not exist.
    """
    page = wiki_wiki.page(page_name)

    # A missing page yields None so callers can skip it
    if not page.exists():
        return None

    # Newlines separate headings and paragraphs. Joining on a space keeps the
    # words on either side apart; deleting the newline outright would fuse them
    # (e.g. "conflict.History"), which hurts both retrieval and generation.
    return " ".join(page.text.split())

def scrape_category_pages(category_name):
    """
    Collect the summaries of the pages listed in a Wikipedia category.

    Args:
        category_name: Category title without the "Category:" prefix.

    Returns:
        A list with one single-line summary per main-namespace member of the
        category. Members in other namespaces, and members whose summary is
        empty after whitespace normalisation, are skipped.
    """
    category_page = wiki_wiki.page("Category:" + category_name)

    summaries = []
    for member in category_page.categorymembers.values():
        # Keep only members that live in the main namespace
        if member.ns != wikipediaapi.Namespace.MAIN:
            continue

        summary = wiki_wiki.page(member.title).summary
        # Collapse newlines to spaces so adjacent words are not fused together
        summary_cleaned = " ".join(summary.split())

        # An empty summary adds nothing to the knowledge base; this mirrors
        # the page loop, which also skips pages that yield no text.
        if summary_cleaned:
            summaries.append(summary_cleaned)

    return summaries

# Pages whose full text goes into the knowledge base
list_of_pages = [
    'History of the Israeli–Palestinian conflict',
    '1948 Arab–Israeli War',
    'Six-Day War',
    'Yom Kippur War',
    'Arab–Israeli conflict',
    'Israel–Hamas war',
    'Israeli–Palestinian conflict',
    'Al-Shifa Hospital siege',
    'Gaza–Israel conflict',
    'Nakba',
    '1948 Palestine war'
]

# Categories whose member summaries go into the knowledge base
list_of_categories = ['History of the Middle East', 'History of the Jews in the Middle East', 'Israel–Hamas war', 'Timeline of the Israel–Hamas war']

# Gather every candidate document: full page texts first, then category summaries
collected_documents = []
for page in list_of_pages:
    full_text = scrape_page(page)
    # scrape_page returns None for a missing page; skip those and empty texts
    if full_text:
        collected_documents.append(full_text)

for category in list_of_categories:
    category_summaries = scrape_category_pages(category)
    collected_documents.extend(category_summaries)

# The same article can be a member of several categories. Storing it more than
# once would let identical passages occupy several of the retrieved slots, so
# drop repeats (and empty strings) while keeping first-seen order.
knowledge_base = list(dict.fromkeys(doc for doc in collected_documents if doc))



In [61]:
import chromadb

# Initialize Chroma client
client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises if the collection already exists from an earlier run of the cell.
collection = client.get_or_create_collection("ChinmaysIsraelPalestineChatBot")

# upsert (rather than add) so that re-running the cell overwrites the documents
# stored under the same IDs instead of colliding with them.
collection.upsert(
    ids=[str(i) for i in range(len(knowledge_base))],  # IDs are just the list positions as strings
    documents=knowledge_base,
    metadatas=[{"type": "support"} for _ in knowledge_base],
)



In [62]:

from transformers import BartTokenizer, BartForConditionalGeneration

# Checkpoint name shared by the tokenizer and the generator
MODEL_NAME = 'vblagoje/bart_lfqa'

tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
generator = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

def respond(message):
    """
    Answer a question using documents retrieved from the Chroma collection.

    The collection is queried with ``message`` for up to five documents. They
    are joined into a "<P>"-delimited context, combined with the question into
    a single prompt, tokenized, and passed to the generator.

    Args:
        message: The user's question.

    Returns:
        The first decoded sequence produced by the generator.
    """
    retrieved = collection.query(query_texts=message, n_results=5)
    passages = retrieved['documents'][0]

    # Prompt layout: "question: ... context: <P> doc1 <P> doc2 ..."
    context = "<P> " + " <P> ".join(passages)
    prompt = "question: {} context: {}".format(message, context)

    encoded = tokenizer(prompt, truncation=True, padding=True, return_tensors="pt")

    # Beam search without sampling; a single sequence is requested
    generation_options = dict(
        min_length=20,
        max_length=256,
        do_sample=False,
        early_stopping=True,
        num_beams=8,
        temperature=1.0,
        top_k=None,
        top_p=None,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        num_return_sequences=1,
    )
    output_ids = generator.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options,
    )

    decoded = tokenizer.batch_decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]


In [None]:
import gradio as gr
def chat_with_bot(message, history):
    """Chat callback: return ``respond(message)``; ``history`` is accepted but not used."""
    return respond(message)

# Build the chat UI components, assemble the interface, then launch it
chat_window = gr.Chatbot(height=300)
question_box = gr.Textbox(
    placeholder="Ask me anything about the conflict",
    container=False,
    scale=7,
)

demo = gr.ChatInterface(
    chat_with_bot,
    chatbot=chat_window,
    textbox=question_box,
    title="Chinmay's Israel-Palestine Bot",
    description="Abstractive question answering system",
    theme="soft",
)
demo.launch(debug=True)

In [64]:
# Write requirements.txt with each package pinned to the version installed in
# this environment, so the file reproduces the setup the notebook ran with.
from importlib import metadata

required_packages = [
    "transformers",
    "torch",  # required by BartForConditionalGeneration and return_tensors="pt"
    "wikipedia-api",
    "chromadb",
    "gradio"
]


def _pinned(package):
    """Return 'name==version' for an installed package, or the bare name if it is not installed."""
    try:
        installed = metadata.version(package)
    except metadata.PackageNotFoundError:
        return package
    # Drop any local build tag (the part after '+'); such versions are not
    # published on PyPI, so pinning to them would make the file uninstallable.
    return f"{package}=={installed.split('+')[0]}"


# Create and write to requirements.txt
with open("requirements.txt", "w") as f:
    for package in required_packages:
        f.write(f"{_pinned(package)}\n")

# Download the requirements.txt file (Colab only)
from google.colab import files
files.download('requirements.txt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>