In [8]:
#!pip install langchain

In [None]:
import os
import openai
import sys

import panel as pn  # GUI
pn.extension()

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [29]:
from langchain_huggingface import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [65]:
hf_pipeline = pipeline("text-generation", model="distilgpt2")
llm_name = HuggingFacePipeline(pipeline=hf_pipeline)
#llm_name = "gpt-3.5-turbo-instruct"

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [78]:
def load_db(file, chain_type, k, llm_name="distilgpt2"):
    # Load documents
    loader = PyPDFLoader(file)
    documents = loader.load()

    # Split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)

    # Define embedding
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)

    # Define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    # Load tokenizer and model from the same checkpoint
    tokenizer = AutoTokenizer.from_pretrained(llm_name)
    model = AutoModelForCausalLM.from_pretrained(llm_name)

    # Configure the pipeline
    hf_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=2000,  # Adjust as needed
        temperature=0.1,     # Adjust for creativity
    )
    llm = HuggingFacePipeline(pipeline=hf_pipeline)

    # Create a chatbot chain
    qa = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa

In [79]:
import panel as pn
import param
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

class cbfs(param.Parameterized):
    chat_history = param.List([])
    answer = param.String("")
    db_query = param.String("")
    db_response = param.List([])
    
    def __init__(self, **params):
        super(cbfs, self).__init__(**params)
        self.panels = []
        self.loaded_file = "references/daminov-2024-relationship-between-the-type-of-media-consumption-and-political-trust-in-the-european-union-evidence-from.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)
    
    def call_load_db(self, count):
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        else:
            file_input.save("temp.pdf")  # local copy
            self.loaded_file = file_input.filename
            button_load.button_style = "outline"
            self.qa = load_db("temp.pdf", "stuff", 4)
            button_load.button_style = "solid"
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.extend([(query, result["answer"])])
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer'] 
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, style={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    @param.depends('db_query', )
    def get_lquest(self):
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown(f"Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown(f"DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response', )
    def get_sources(self):
        if not self.db_response:
            return 
        rlist = [pn.Row(pn.pane.Markdown(f"Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history') 
    def get_chats(self):
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown(f"Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        self.chat_history = []
        return

In [80]:
# Istanza della classe cbfs
cb = cbfs()

# Widget per l'input del file PDF
file_input = pn.widgets.FileInput(accept='.pdf')

# Pulsanti per caricare il DB e cancellare la cronologia
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')

# Collegamento del pulsante "Clear History" alla funzione di cancellazione della cronologia
button_clearhistory.on_click(cb.clr_history)

# Campo di input per l'utente
inp = pn.widgets.TextInput(placeholder='Enter text here…')

# Collegamento del pulsante "Load DB" alla funzione di caricamento del DB
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)

# Collegamento della conversazione all'input dell'utente
conversation = pn.bind(cb.convchain, inp)

# Creazione delle schede (tabs) per l'interfaccia utente
tab1 = pn.Column(
    pn.Row(inp),  # Campo di input per l'utente
    pn.layout.Divider(),
    pn.panel(conversation, loading_indicator=True, height=300),  # Conversazione
    pn.layout.Divider(),
)

tab2 = pn.Column(
    pn.panel(cb.get_lquest),  # Ultima query al database
    pn.layout.Divider(),
    pn.panel(cb.get_sources),  # Risultati del database
)

tab3 = pn.Column(
    pn.panel(cb.get_chats),  # Cronologia della chat
    pn.layout.Divider(),
)

tab4 = pn.Column(
    pn.Row(file_input, button_load, bound_button_load),  # Caricamento del file e del DB
    pn.Row(button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic")),  # Pulsante per cancellare la cronologia
    pn.layout.Divider(),
    # pn.Row(jpg_pane.clone(width=400))  # Eventuale immagine (commentata)
)

# Creazione del dashboard
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# SocialResearch Bot! Ask me something!')),  # Titolo del dashboard
    pn.Tabs(
        ('Conversation', tab1),  # Scheda per la conversazione
        ('Database', tab2),      # Scheda per le query al database
        ('Chat History', tab3),  # Scheda per la cronologia della chat
        ('Configure', tab4)      # Scheda per la configurazione
    )
)

# Avvio del dashboard
dashboard.servable()

BokehModel(combine_events=True, render_bundle={'docs_json': {'7b2b15e5-1272-4426-8dc5-8b380a4b5ef7': {'version…

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, style={'background-color': '#F6F6F6'}))
Token indices sequence length is longer than the specified maximum sequence length for this model (1085 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2025-01-24 18:01:46,322 ERROR: panel.reactive - Callback failed for object named "" changing property {'value': 'what is a social media?'} 
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/panel/reactive.py", line 385, in _process_events
    self.param.update(**self_events)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2278, in update
    restore = self_._update(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2318, in _update
    self_._batch_call_watchers()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2509, in _batch_call_watchers
    self_._execute_watcher(watcher, events)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/pyth

IndexError: index out of range in self

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2025-01-24 18:02:16,142 ERROR: panel.reactive - Callback failed for object named "" changing property {'value': 'is it safe?'} 
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/panel/reactive.py", line 385, in _process_events
    self.param.update(**self_events)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2278, in update
    restore = self_._update(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2318, in _update
    self_._batch_call_watchers()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/param/parameterized.py", line 2509, in _batch_call_watchers
    self_._execute_watcher(watcher, events)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-

IndexError: index out of range in self