In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

######### Content Processing Block ###############################
from pathlib import Path

# Load the PDF from the local directory and keep the parsed documents in `data`.
local_path = "./alphabet/Praveen_13Yrs_Datascience_AI.pdf"

# Fail fast with a clear message: the previous `if local_path:` check was always
# true for a non-empty literal, and its `else` branch left `data` undefined so
# the print below would have raised NameError instead of a useful error.
if not local_path or not Path(local_path).is_file():
    raise FileNotFoundError(
        f"Upload a PDF file for processing. No file found at: {local_path!r}"
    )

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

# Short preview only, to confirm the file was parsed.
print(data[0].page_content[:20])

  from .autonotebook import tqdm as notebook_tqdm


Praveen Kumar V – Da


In [3]:
# Show the loaded documents together with the type of the first entry.
# NOTE(review): this echoes the full document text (including the phone number
# and email visible in the output) — consider clearing the output before sharing.
(data, type(data[0]))

([Document(metadata={'source': './alphabet/Praveen_13Yrs_Datascience_AI.pdf'}, page_content='Praveen Kumar V – Data science, AI – 13 plus Yrs – IIIT Hyderabad\n\nPhone: 9663045588 email: praveenv8ai@gmail.com LinkedIn: https://www.linkedin.com/in/praveen-kumar-62a95212/\n\nExecutive Summary\n\nHighly skilled data science professional, leader, adept at translating business problems into actionable data-driven solutions. Proficient to apply respective skills to deliver innovative scalable and impactful results that align with organizational goals.\n\nExploring potential Analytics Leadership roles that leverage experience gained across – diverse areas in\n\nBusiness Transformation • Pharma Mfg, eCommerce, D2C\n\n• People Excellence & Team management\n\nInnovation, Strategic Thinking\n\nBusiness Leadership Communication • Stakeholder Success Enablement\n\nAnalytics Technical and Leadership Competencies\n\nHands on Execution • Proficient Analytical, Problem-solving skills • Product Manageme

In [9]:
## Converting content into dense vector embeddings 
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma 

# Split the loaded documents into chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
chunks = text_splitter.split_documents(data)

# Open the persisted Chroma store (created under 'resume_db' if it does not exist yet).
embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vector_db = Chroma(persist_directory='resume_db', embedding_function=embeddings)

# Populate the store only when it is empty. Previously `chunks` was computed but
# never added (the from_documents call was commented out), so a fresh checkout
# without a pre-built 'resume_db' had nothing to retrieve. Skipping the insert
# when entries already exist avoids duplicating chunks on every re-run.
if chunks and not vector_db.get(limit=1)["ids"]:
    vector_db.add_documents(chunks)
#https://github.com/hwchase17/chroma-langchain/blob/master/persistent-qa.ipynb

In [10]:
######### Retrieval + Generation of Response ##############################
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Local Ollama chat model, used both to rewrite the query and to answer it.
# local_llm = "llama3.1" #latest 4.9GB
local_llm = "llama3.2" #1.3GB
llm = ChatOllama(model=local_llm)

# Prompt asking the model for five rephrasings of the user's question.
# The wording is sent verbatim to the LLM, so the misspellings
# ("relavant", "databaase") were corrected.
QUERY_PROMPT = PromptTemplate(
    input_variables = ["question"],
    template="""You are an AI Language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Provide these alternative questions separated by newlines. 
    Original question: {question} """
)

# Wrap the vector store's retriever so each question is expanded via QUERY_PROMPT.
retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(),llm, prompt=QUERY_PROMPT)

# RAG Prompt: the answer must come only from the retrieved context.
template = """Answer the question based ONLY on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [11]:
# Question for the RAG chain. The name was misspelled as "Paraveen"; the
# document refers to "Praveen", and the question text feeds the retriever.
user_question = "Give me the list of companies Praveen worked for"

# Pipeline: retrieve context for the question, fill the RAG prompt,
# call the LLM, and reduce the reply to a plain string.
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)


In [12]:
# Run the chain on the current question and print the model's answer.
response = chain.invoke(input=user_question)
print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 35.53it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 20.67it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 46.31it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 42.30it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 44.11it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 24.24it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings

Based on the given context, here is a list of companies that Praveen Kumar V has worked for:

1. Samsung R&D Institute, Bangalore
2. Gramener, Hyderabad
3. DataJango, Hyderabad
4. Mphasis, Hyderabad
5. Bridgei2i Analytics Solutions, Bangalore
6. IBM India, Bangalore

Note that Praveen also mentions IIIT Hyderabad as his alma mater and the institute where he completed his PG (MSIT) and B.Sc. programs, but it's not explicitly mentioned as a company he worked for.


In [9]:
# Question for the RAG chain (grammar fixed: the stray "are" made the question
# ungrammatical before it was passed to the query rewriter and the LLM).
user_question = "What technical skills does Praveen Possess ?"

# Rebuild the chain so it uses the current retriever, prompt and llm objects.
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 75.31it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 60.76it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 31.22it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 64.87it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 44.40it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 69.41it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


According to the document, Praveen possesses the following technical skills:

1. Data Science / AI:
	* Python libraries: Pandas, Scikit-learn, Seaborn, Numpy, Spacy, Gensim, statsmodels, PyPDF, BeautifulSoup, LangChain
2. Machine Learning / AI models techniques:
	* Linear Regression
	* Logistic Regression
	* Decision Tree Models
	* Bagging & Boosting Ensemble models
	* Naïve Bayes
	* KNN
	* Neural Networks
	* SVM
	* K Means
3. Data Engineering and pipeline development:
	* BigQuery SQL
	* Superset
	* open source LLMs (RAG, Milvus)
4. Data Visualization and communication:
	* Data Visualization using tools like LangChain, LlamaIndex, Steamlit
5. Cloud skills:
	* Python programming with cloud capabilities
6. Programming languages:
	* Java
	* Pearl
7. Scripting languages:
	* Linux Shell Scripting
8. Other technologies:
	* Docker
	* Flask
	* Camelot
	* Statistical Models (R Studio)
	* NLP - Process FDA data, OCR models, CNN


Outputs from llama3.2:1b

In [10]:
# Question for the RAG chain. The name was misspelled as "Paraveen"; the
# document refers to "Praveen", and the question text feeds the retriever.
user_question = "Give me the list of companies Praveen worked for"

# Rebuild the chain so it uses the current retriever, prompt and llm objects.
chain = (
    {"context":retriever, "question":RunnablePassthrough()}
    | prompt 
    | llm 
    | StrOutputParser()
)

response = chain.invoke(user_question)

print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 74.10it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 34.54it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 68.70it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 67.47it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 45.64it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 46.13it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings

Based on the context provided, here is a list of companies that Praveen Kumar V worked for:

1. Samsung R&D Institute, Bangalore
2. Gramener, Hyderabad
3. DataJango, Hyderabad
4. Mphasis, Hyderabad
5. Bridgei2i Analytics Solutions, Bangalore
6. IBM India, Bangalore
7. Osmania University - Hyderabad (as a student)
8. IIIT-Hyderabad (as a student and later as an Assistant Mentor for MSIT program)


In [11]:
# Ask about technical skills using the current retriever, prompt and llm.
user_question = "What technical skills does Praveen Possess ?"

# Same pipeline as before, written on one line:
# retrieve context -> fill RAG prompt -> call LLM -> plain string.
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()

response = chain.invoke(input=user_question)
print(response)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 54.37it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 72.08it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 30.03it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 72.54it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 71.25it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 45.02it/s]
Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1
OllamaEmbeddings

Based on the context, Praveen possesses the following technical skills:

1. Programming languages:
	* Python
2. Data science / AI libraries:
	* Pandas
	* Scikit-learn
	* Seaborn
	* Numpy
	* Spacy
	* Gensim
	* statsmodels
	* PyPDF
	* BeautifulSoup
	* LangChain
3. Machine learning / AI models techniques:
	* Linear Regression
	* Logistic Regression
	* Decision Tree Models
	* Bagging & Boosting Ensemble models
	* Naïve Bayes
	* KNN
	* Neural Networks
	* SVM
	* K Means
	* Encoding
	* Data Cleaning
	* Feature Engineering
	* Regularization
	* Normalization
4. Deep learning frameworks:
	* LLMs (Large Language Models)
5. Cloud skills:
	* Python's scikit learn
6. Big data technologies:
	* BigQuery SQL
7. Database management systems:
	* Milvus
8. Data visualization tools:
	* Superset
9. Data engineering and pipeline development:
	* Docker
