In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

######### Content Processing Block ###############################

## Loading PDF file from local file directory
## read the content and store it in data object 
local_path = "./alphabet/Praveen_13Yrs_Datascience_AI.pdf"

if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
else:
    print("Upload a PDF file for processing.")

print(data[0].page_content[:20])

  from .autonotebook import tqdm as notebook_tqdm


Praveen Kumar V – Da


In [2]:
## Converting content into dense vector embeddings 
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma 

#Split and chunk the data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)


# Add the chunks to vector database, which takes the model for creating the embeddings.
# vector_db = Chroma.from_documents(
#                                     documents=chunks, 
#                                     embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
#                                     persist_directory="resume_db"
#                                 )

vector_db = Chroma(persist_directory='resume_db', embedding_function=OllamaEmbeddings(model="nomic-embed-text", show_progress=True))
#https://github.com/hwchase17/chroma-langchain/blob/master/persistent-qa.ipynb

  vector_db = Chroma(persist_directory='resume_db', embedding_function=OllamaEmbeddings(model="nomic-embed-text", show_progress=True))
  vector_db = Chroma(persist_directory='resume_db', embedding_function=OllamaEmbeddings(model="nomic-embed-text", show_progress=True))


In [None]:
######### Retrieval + Generation of Response ##############################
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

local_llm = "llama3.1" #latest 4.9GB
#local_llm = "llama3.2:1b" #1.3GB
llm = ChatOllama(model=local_llm)

QUERY_PROMPT = PromptTemplate(
    input_variables = ["question"],
    template="""You are an AI Language model assistant. Your task is to generate five different versions of the given user question to retrieve relavant documents from a vector databaase. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Provide these alternative questions separated by newlines. 
    Original question: {question} """
)

retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(),llm, prompt=QUERY_PROMPT)

# RAG Prompt
template = """Answer the question based ONLY on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [9]:
user_question = "Give me the list of companies Paraveen worked for"
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.10s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Based on the document metadata, Paraveen Kumar V has worked for the following companies:

1. Samsung R&D Institute
2. Gramener
3. DataJango
4. Mphasis
5. Bridgei2i Analytics Solutions


In [10]:
user_question = "What technical skills does Praveen Possess ?"
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.22s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.21s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Based on the document provided, here are some of the technical skills that Praveen Kumar V possesses:

1. Python programming languages:
   - Pandas
   - Scikit-learn
   - Seaborn
   - Numpy
   - Spacy
   - Gensim
   - statsmodels

2. Data science and AI technologies:
   - Machine learning (ML) models: Linear Regression, Logistic Regression, Decision Tree Models, Bagging & Boosting Ensemble models, Naïve Bayes, KNN, Neural Networks, SVM, K Means, Encoding, Data Cleaning, Feature Engineering
   - Natural Language Processing (NLP): NLTK, LangChain

3. Big data technologies:
   - Hadoop: MapReduce programs for benchmarking data transformations
   - Spark: Data ingestion pipeline development
   - Hive: Data warehousing and SQL

4. Cloud technologies:
   - Docker
   - IBM/HP/ECM storage migrations (HMC, PowerHA Cluster builds)
   - Apache Hadoop, IBM Infosphere
   - Linux-RedHat setup & migrations for server migrations

5. Operating Systems:
   - Unix/Linux OS upgrades and patches installati

By llama3.2:1b outputs

In [7]:
user_question = "Give me the list of companies Paraveen worked for"
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Based on the document metadata, Paraveen Kumar V has worked for the following companies:

1. IBM India
2. Mphasis
3. Bridgei2i Analytics Solutions
4. JNTU/IIIT Hyderabad (as Assistant Mentor for MSIT program)


In [None]:
user_question = "What technical skills does Praveen Possess ?"
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.17s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Based on the provided document, Praveen Kumar V possesses the following technical skills:

1. Data Science:
   - Proficient Analytical, Problem-solving skills
   - Product Management, vision implementation
   - Data Engineering and pipeline development
   - Data Visualization and communication
   - Hands-on execution of Analytics leadership roles across diverse areas in business transformation

2. Machine Learning (ML) and Artificial Intelligence (AI):
   - AI (LLM, RAG, Embeddings) skills
   - Statistical Analysis, EDA-Insights
   - Predictive, Cluster Modeling
   - Deep Learning Frameworks

3. Python programming:
   - Proficient in various Python libraries such as Pandas, Scikit-learn, Seaborn, Numpy, Spacy, Gensim, statsmodels, PyPDF, BeautifulSoup, LangChain, Llamaindex.

4. Cloud Computing and Big Data Technologies:
   - Hands-on use of technologies like BigQuery SQL, Superset, open source LLMs, RAG, Milvus, LangChian, LlamaIndex, Steamlit, Docker

5. Natural Language Processing (