In [18]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file")

print("API key loaded")

API key loaded


In [19]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader
#from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader  # Good for .md

# List to hold all loaded documents
docs = []

# Load PDFs
pdf_loader = DirectoryLoader(
    "documents/",
    glob="**/*.pdf",          # Only PDF files
    loader_cls=PyPDFLoader,
    show_progress=True
)
docs.extend(pdf_loader.load())

# # Load TXT files
# txt_loader = DirectoryLoader(
#     "documents/",
#     glob="**/*.txt",          # Only TXT files
#     loader_cls=TextLoader,
#     show_progress=True
# )
# docs.extend(txt_loader.load())

# # Load Markdown files
# # md_loader = DirectoryLoader(
# #     "documents/",
# #     glob="**/*.md",           # Only MD files
# #     loader_cls=UnstructuredMarkdownLoader,  # Handles .md well
# #     show_progress=True
# # )
# # docs.extend(md_loader.load())

# # Optional: Add more types, e.g., .docx with UnstructuredWordDocumentLoader, etc.
print(f"Total loaded documents: {len(docs)}")
# print(f"First document: {docs[0].page_content}")

[print(f"Document {i+1}: {doc.page_content}") for i, doc in enumerate(docs[:3])]


100%|██████████| 1/1 [00:00<00:00,  1.72it/s]

Total loaded documents: 3
Document 1: OKEMAKINDE SHERIF SUNDAY 
 
+2348109155294     B9 Federal Housing Estate, 
cheryvmak.cs@gmail.com                Olomore, Abeokuta North L.G.A, 
       Abeokuta, Ogun State.                         
RESEARCH INTEREST 
Mathematical and Computational Statistics 
Time Series Analysis  
Bayesian Statistics and Methods  
Data Science and Operations Research 
Probability Distribution and Statistical Modelling 
 
EDUCATION  
 
MSc University of Lagos, Nigeria, Statistics                                          March 2023 
 Thesis: A Comparative Study of Bayesian Structural Time Series 
Model and SARIMA Model for Rainfall Forecasting in Nigeria 
Committee: Professor M.O.Adamu (chair), Dr E.B. Nkemnole (member),  
Dr. N.I. Badmus (member), Dr. R.K. Ogundeji (supervisor) 
 
BSc University of Agriculture, Abeokuta, Nigeria, Statistics                 January 2014 
           Thesis: Statistical Analysis of Indecent Assault and Rape cases    
           in Og




[None, None, None]

In [34]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Experiment: Start with chunk_size=1000, overlap=200 (overlap helps connect ideas)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # Characters, not tokens—use tiktoken for precision if needed
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", "•", "-", " "]

)

chunks = text_splitter.split_documents(docs)
print(f"Created {len(chunks)} chunks.")

# Test different sizes: Try chunk_size=500 and see if retrieval improves later.
# Split documents
#chunks = text_splitter.split_documents(docs)

print(f"Split {len(docs)} documents into {len(chunks)} chunks")
for i, chunk in enumerate(chunks[:22]):
    print(f"\nChunk {i+1}: {chunk.page_content}")
# for i, chunk in enumerate(chunks[9:12], start=1):
#     print(f"\nChunk {i}: {chunk.page_content}")


Created 6 chunks.
Split 3 documents into 6 chunks

Chunk 1: OKEMAKINDE SHERIF SUNDAY 
 
+2348109155294     B9 Federal Housing Estate, 
cheryvmak.cs@gmail.com                Olomore, Abeokuta North L.G.A, 
       Abeokuta, Ogun State.                         
RESEARCH INTEREST 
Mathematical and Computational Statistics 
Time Series Analysis  
Bayesian Statistics and Methods  
Data Science and Operations Research 
Probability Distribution and Statistical Modelling 
 
EDUCATION  
 
MSc University of Lagos, Nigeria, Statistics                                          March 2023 
 Thesis: A Comparative Study of Bayesian Structural Time Series 
Model and SARIMA Model for Rainfall Forecasting in Nigeria 
Committee: Professor M.O.Adamu (chair), Dr E.B. Nkemnole (member),  
Dr. N.I. Badmus (member), Dr. R.K. Ogundeji (supervisor)

Chunk 2: Dr. N.I. Badmus (member), Dr. R.K. Ogundeji (supervisor) 
 
BSc University of Agriculture, Abeokuta, Nigeria, Statistics                 January 2014 
      

In [20]:
# import os

# pdf_folder = "documents"

# pdf_files = sorted([
#     os.path.join(pdf_folder, f)
#     for f in os.listdir(pdf_folder)
#     if f.endswith(".pdf")
# ])

# print("PDF files found:")
# for i, f in enumerate(pdf_files, 1):
#     print(f"{i}. {f}")


In [21]:
# from langchain_community.document_loaders import PyPDFLoader

# fourth_pdf_path = pdf_files[3]  # 5th PDF (0-based index)
# loader = PyPDFLoader(fourth_pdf_path)
# docs = loader.load()


In [35]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=210,  # Characters, not tokens—use tiktoken for precision if needed
#     chunk_overlap=20,
#     length_function=len,
#     separators=["\n\n", "\n", "•", "-", " "]

# )

# chunks = text_splitter.split_documents(docs)
# print(f"Pages in 4th PDF: {len(docs)}")
# print(f"Chunks created: {len(chunks)}")

# for i, chunk in enumerate(chunks[:3], 1):
#     print(f"\nChunk {i}:\n{chunk.page_content[:300]}")



### Embeddings

In [36]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=api_key
)

# Test embedding
test_embedding = embeddings.embed_query("What is the title of thesis that sherif wrote during his Msc?")
print(f"Embedding dimension: {len(test_embedding)}")
print(f"First 5 values: {test_embedding[:5]}")

Embedding dimension: 1536
First 5 values: [0.019431181252002716, 0.019020289182662964, -0.013691957108676434, 0.012154429219663143, -0.051480699330568314]


### Vector Store

In [37]:
from langchain_chroma import Chroma

# Create vector store from documents
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="chroma_db"
)

# To load later: vectorstore = Chroma(persist_directory="chroma_db", embedding_function=embeddings)

print(f"Vector store created with {len(chunks)} chunks")

# Test similarity search
query = "Thesis that sherif wrote during his Bsc?"
results = vectorstore.similarity_search(query, k=2)

print(f"\nQuery: {query}")
for i, doc in enumerate(results):
    print(f"\nResult {i+1}: {doc.page_content}")

Vector store created with 6 chunks

Query: Thesis that sherif wrote during his Bsc?

Result 1: BSc University of Agriculture, Abeokuta, Nigeria, Statistics                 January 2014 
           Thesis: Statistical Analysis of Indecent Assault and Rape cases    
           in Ogun State, Nigeria

Result 2: BSc University of Agriculture, Abeokuta, Nigeria, Statistics                 January 2014 
           Thesis: Statistical Analysis of Indecent Assault and Rape cases    
           in Ogun State, Nigeria


## Building RAG with LCEL (LangChain Expression Language)

### Simple RAG Chain

In [38]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Create LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=api_key
)

# Create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",  # Cosine similarity
    search_kwargs={"k": 3}  # Return top 3 chunks
)
# Create prompt
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant for my personal documents. Answer using ONLY the provided context.If you don't know, say so. Always cite sources"),
    ("human", "{question}\n\nContext:\n{context}")
])

# Helper function to format documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Build RAG chain using LCEL
rag_chain = (
    RunnableParallel(context=retriever | format_docs, question=RunnablePassthrough())
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created")

RAG chain created


In [39]:
# Query the chain
response = rag_chain.invoke("what is Okemakinde Sherif Sunday Education history?")
print(response)

I'm sorry, but based on the provided context, there is no information available about Okemakinde Sherif Sunday's education history.


### Custom Prompts

In [43]:
# Create a custom prompt with specific instructions
custom_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a precise assistant. If you don't know the answer based on the context, say 'I don't know'."),
    ("human", "Context: {context}\n\nQuestion: {question}")
])

# Build chain with custom prompt
custom_rag = (
    RunnableParallel(context=retriever | format_docs, question=RunnablePassthrough())
    | custom_prompt
    | llm
    | StrOutputParser()
)

# Query the chain
response = rag_chain.invoke("Give me a complete list of all programming languages, frameworks, tools, and technologies mentioned in my CV.")
print(response)

response2 = rag_chain.invoke("What details from Okemakinde Sherif Sunday? Include name, phone, email, LinkedIn, GitHub, address")
print(response2)

response3 = rag_chain.invoke("Present my professional experience in chronological order with full details")
print(response3)



The programming languages, frameworks, tools, and technologies mentioned in your CV are:
- Query
- Power pivot
- Power Bi
- Scikit-learn
- Pandas
- Numpy
- Google Colab
- Jupyter Notebook
- Git
- AWS SDK (Boto 3)
The details provided for Okemakinde Sherif Sunday are as follows:

- Name: Okemakinde Sherif Sunday
- Phone: +2348109155294
- Email: cheryvmak.cs@gmail.com
- Address: B9 Federal Housing Estate, Olomore, Abeokuta North L.G.A, Abeokuta, Ogun State

There is no mention of LinkedIn or GitHub profiles in the provided context.
Based on the provided context, your professional experience in chronological order with full details is as follows:

1. Data Scientist Internship Program at Hamoye (HDSC) in Abuja, Nigeria
   - Duration: Jan 9-April 11, 2022
   - Description: Hands-on training on the use of tools for analyzing large datasets.

2. Workshop: 3 million Technical Talent Program (3MTT), Cloud computing Track in Abeokuta, Nigeria
   - Duration: Apr 29-Jul 28, 2024

3. Seminars/Webin

## Simple Conversational RAG

In [14]:
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import MessagesPlaceholder

# Store for chat histories
chat_store = {}

def get_session_history(session_id: str):
    if session_id not in chat_store:
        chat_store[session_id] = InMemoryChatMessageHistory()
    return chat_store[session_id]

# Create conversational prompt
conv_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer using the context provided."),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "Context: {context}\n\nQuestion: {question}")
])

# Build base chain
conv_chain_base = (
    RunnableParallel(
        context=lambda x: format_docs(retriever.invoke(x["question"])),
        question=lambda x: x["question"],
        chat_history=lambda x: x.get("chat_history", [])
    )
    | conv_prompt
    | llm
    | StrOutputParser()
)

# Wrap with message history
conv_chain = RunnableWithMessageHistory(
    conv_chain_base,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history"
)

print("Conversational chain created")

Conversational chain created


In [49]:
# First question
response1 = conv_chain.invoke(
    {"question": "Present my professional experience in chronological order with full details."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q1: Present my professional experience in chronological order with full details.")
print(f"A1: {response1}\n")

# Second question
response2 = conv_chain.invoke(
    {"question": "Group my experience by domain (e.g., web development, data science, management) and describe relevant roles."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q2: Group my experience by domain (e.g., web development, data science, management) and describe relevant roles.")
print(f"A2: {response2}\n")

# Third question
response3 = conv_chain.invoke(
    {"question": "How many years of professional experience do I have according to the resume?"},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q3: How many years of professional experience do I have according to the resume?")
print(f"A3: {response3}\n")

# Fourth question
response4 = conv_chain.invoke(
    {"question": "Are there any employment gaps in my CV? If yes, mention the dates."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q4: Are there any employment gaps in my CV? If yes, mention the dates.")
print(f"A4: {response4}\n")

# Fifth question
response5 = conv_chain.invoke(
    {"question": "What is the total duration of my experience in software engineering / [your field]?"},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q5: What is the total duration of my experience in software engineering / Statistics?")
print(f"A5: {response5}\n")

# Sixth question
response6 = conv_chain.invoke(
    {"question": "Extract and display the entire details of Okemakinde Sherif Sunday in a structured format: Name, Contact Info, Summary (if any), Work Experience (with bullets), Education, Skills, Projects, Certifications, and any other sections. Be as detailed and complete as possible."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q6: Extract and display the entire details of Okemakinde Sherif Sunday in a structured format: Name, Contact Info, Summary, Work Experience, Education, Skills, Projects, Certifications, and other sections.")
print(f"A6: {response6}\n")

# Seventh question
response7 = conv_chain.invoke(
    {"question": "Provide a complete summary of my entire resume, including education, work experience, skills, and any other sections."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q7: Provide a complete summary of my entire resume, including education, work experience, skills, and any other sections.")
print(f"A7: {response7}\n")

# Eighth question
response8 = conv_chain.invoke(
    {"question": "Summarize everything in my CV in detail, structured by section."},
    config={"configurable": {"session_id": "session1"}}
)
print(f"Q8: Summarize everything in my CV in detail, structured by section.")
print(f"A8: {response8}\n")


Q1: Present my professional experience in chronological order with full details.
A1: **Professional Experience:**

1. **Data Scientist Internship Program at Hamoye (HDSC), Abuja, Nigeria**
   - Duration: Jan 9-April 11, 2022
   - Description: Hands-on training on the use of tools for analyzing large datasets.

2. **3 million Technical Talent Program (3MTT), Cloud computing Track, Abeokuta, Nigeria**
   - Duration: Apr 29-Jul 28, 2024
   - Description: [Details of the cloud computing training program].

**Seminars/Webinars:**
1. **Unilag/Department of Statistics Monthly Webinar Series**
   - Topic: How to Build an AI Document Chatbot Data
   - Date: Jan 16, 2024

2. **Unilag/Department of Statistics Monthly Webinar Series**
   - Topic: Modelling Complex Systems in the era of Big Data
   - Date: Mar 23, 2023

**Professional Affiliations:**
- Member, Data Scientists Network (DSN): 2022-Present
- Member, Data Science Lagos Club (DSL): 2022-Present

**Community Service:**
1. **Department of

In [50]:
# View chat history
session = get_session_history("session1")
print("Chat History:")
for msg in session.messages:
    print(f"\n{msg.type}: {msg.content}")

Chat History:

human: what is sherif education history?

ai: I'm sorry, but based on the provided context, there is no information about Sherif's education history. The details shared only include contact information and address.

human: Which year did he graduate?

ai: Based on the information provided, Sherif is expected to graduate in March 2023 with a Master of Science degree in Statistics from the University of Lagos, Nigeria.

human: Present my professional experience in chronological order with full details.

ai: 1. Data Scientist Internship Program at Hamoye (HDSC), Abuja, Nigeria
   - Duration: Jan 9-April 11, 2022
   - Description: Hands-on training on the use of tools for analyzing large datasets

2. Unilag/Department of Statistics Monthly Webinar Series
   - Topic: Modelling Complex Systems in the era of Big Data
   - Date: Mar 23, 2023

3. Unilag/Department of Statistics Monthly Webinar Series
   - Topic: How to Build an AI Document Chatbot Data
   - Date: Jan 16, 2024

4.