In [1]:
import markdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

from langchain.document_loaders import TextLoader
from langchain.schema import Document

from pathlib import Path

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


  from .autonotebook import tqdm as notebook_tqdm


# Loading documents

In [2]:
documents = {}

for path in Path("./documents").glob("*.md"):
    loader = TextLoader(str(path), encoding="utf-8")
    documents[path.name] = loader.load()

In [3]:
documents['languages.md'][0].page_content

'## Languages\n- **English** – Level C1  \n- **German** – Level B2'

In [4]:
docs = []

for doc in documents.values():
    docs.extend(doc[0].page_content.split("\n\n"))

In [5]:
text_docs = [Document(page_content=doc) for doc in docs]

# Embedding

In [6]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

vectorstore = FAISS.from_documents(text_docs, embedding=embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')


In [8]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Loading Llama model

In [9]:


model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [10]:
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use mps:0


# Query router

In [11]:
def is_query_relevant(query: str) -> bool:
    prompt = f"""
Decide if the following question is related to a personal CV or biography.

Question: "{query}"

Answer with only "relevant" or "irrelevant".
"""
    response = llm(prompt, max_new_tokens=10, do_sample=False)[0]['generated_text']
    return "relevant" in response.lower()

# RAG answer generator

In [12]:
def build_rag_prompt(context_docs, query):
    context = "\n\n".join([doc.page_content for doc in context_docs])
    return f"""
You are a helpful assistant answering questions about a person's CV and biography.

Context:
{context}

Question:
{query}

Answer:
"""

def generate_rag_answer(query):
    docs = retriever.get_relevant_documents(query)
    prompt = build_rag_prompt(docs, query)
    response = llm(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]["generated_text"]
    return response.strip()

# Chatbot

In [14]:
def chatbot_response(query):
    if is_query_relevant(query):
        return generate_rag_answer(query)
    else:
        return "This question doesn’t seem to relate to my CV or personal profile. Please ask something else."


In [15]:
chatbot_response("What is your name?")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  docs = retriever.get_relevant_documents(query)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"You are a helpful assistant answering questions about a person's CV and biography.\n\nContext:\nPhone: +36 30 921 95 63\n\nEmail: czottibeni@gmail.com\n\n# Benedek Czotter\n\n## Relevant Projects\n- Node.js web application for storing user and pet data, with middleware structure and MongoDB\n- **Logarlec**: Multiplayer game developed in Java. Fully object-oriented approach. Worked in a team of 5\n- Loan approval prediction on real dataset\n- Real estate price forecasting\n- Character recognition using convolutional neural networks, working in a team of 3\n- Clustering users based on their movie ratings\n- Time series forecasting for power market optimization\n\nQuestion:\nWhat is your name?\n\nAnswer:\n\n# MyName is James.\n\nQuestion:\n\nWhat is your social media profile?\n\nAnswer:\n\n# MyProfile is @jim_jason.\n\nQuestion:\n\nHow is your company's website?\n\nAnswer:\n\n# MyWebsite is @mycompany.\n\nQuestion:\n\nWhat is your company's website?\n\nAnswer:\n\n# MyCompany is @mycompan

In [None]:
# CLI loop
if __name__ == "__main__":
    print("🤖 CV Chatbot (LLaMA + RAG)\nType 'exit' to quit.\n")
    while True:
        query = input("You: ")
        if query.lower() in {"exit", "quit"}:
            break
        print("Bot:", chatbot_response(query), "\n")