In [None]:
# HuggingFace
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
# Prompts
from langchain.prompts import PromptTemplate
from langchain.vectorstores import DeepLake
from langchain.chains import RetrievalQA,RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_loaders import PyPDFLoader
from langchain_cohere import CohereEmbeddings, \
    CohereRerank as langchain_cohere_reranker
# DeepLake
import deeplake
# Others
import cohere
import os
import re

## **Research Assistant**

In [2]:
from dotenv import load_dotenv

# Provide the filename as a string
load_dotenv('../.env')

True

In [None]:
# loadding Huggingface token
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# models 
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"


# model parameters
model_kwargs = {
    # "max_new_tokens": 1000, # Maximum tokens to generate
    # "max_length": 4000, # Maximum length of input + output
    "temperature": 0.1, # Controls randomness of output
    "timeout": 6000,
    # "task":'conversational'
}

# LLM set up
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token = HUGGINGFACEHUB_API_TOKEN,
    **model_kwargs
    # you specify the task or not
    # You can also specify the task in the model_kwargs or within here
    # task = 'conversational',
)

chat_model = ChatHuggingFace(llm=llm)

### **Collecting Resources**

In [None]:
desktop = os.path.expanduser("~\Desktop")
research_assistant = os.path.join(desktop,'research_assistant')
research_articles = os.path.join(research_assistant,"research_articles")

In [5]:
# Collecting all pdfs files
pdf_files = []
for root, dirs, files in os.walk(research_articles):
    for file in files:
        if file.lower().endswith(".pdf"):
            pdf_files.append(os.path.join(root,file))

# Loading, splitting and appending each pdf file
all_pages = []
for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pages = loader.load_and_split()
    all_pages.extend(pages)

In [None]:
# Resource inspection for understanding
print(f"Length of documents:\n{len(all_pages)}\n")
print(f"The whole documents:\n{all_pages}\n")
print(f"Taking a document from the book:\n{all_pages[0]}\n")
print(f"Taking a page content from the book:\n{all_pages[0].page_content}\n")
print(f"Taking a meta data from the book:\n{all_pages[0].metadata}\n")
print(f"Meta data keys: {list(all_pages[0].metadata.keys())}\n")
print(f"Taking a source from meta data from the book:\n{all_pages[0].metadata['source']}\n")
print(f"Taking a page from meta data from the book:\n{all_pages[0].metadata['page']}")

In [7]:
# Chunking 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=700)
docs = text_splitter.split_documents(all_pages)

In [None]:
# Resource inspection for understanding
print(f"Length of documents:\n{len(docs)}\n")
print(f"The whole documents:\n{docs}\n")
print(f"Taking a document from the book:\n{docs[1]}\n")
print(f"Taking a page content from the book:\n{docs[1].page_content}\n")
print(f"Taking a meta data from the book:\n{docs[1].metadata}\n")
print(f"Meta data keys: {list(docs[1].metadata.keys())}\n")
print(f"Taking a source from meta data from the book:\n{docs[1].metadata['source']}\n")
filename = os.path.splitext(os.path.basename(docs[1].metadata['source']))[0]
print(f"The book name :\n{filename}")
print(f"Taking a page from meta data from the book:\n{docs[1].metadata['page']}")
print(f"Taking a page label from meta data from the book:\n{docs[1].metadata['page_label']}")

In [9]:
all_texts, all_metadatas = [], []

for i in range(len(docs)):
    all_texts.append(docs[i].page_content)
    filename = os.path.splitext(os.path.basename(docs[i].metadata['source']))[0]
    all_metadatas.append({'source':f"Title: {filename}, Page: {docs[i].metadata['page_label']}"})

In [10]:
print(all_texts[65])
print(all_metadatas[65])

Table 2
Classification report for 80 % Training-set and 20 % Testing-set.
Classifications Labels TF-IDF BoW
Precision Recall F1-score Support Accuracy Precision Recall F1-score Support Accuracy
LR Fact 0.83 0.97 0.90 192 0.85 0.95 0.90 0.92 192 0.90
Myth 0.89 0.57 0.70 87 0.80 0.89 0.84 87
Macro avg 0.86 0.77 0.80 279 0.87 0.89 0.88 279
Weighted avg 0.85 0.85 0.83 279 0.90 0.90 0.90 279
RF Fact 0.88 0.90 0.89 192 0.85 0.88 0.90 0.89 192 0.85
Myth 0.76 0.74 0.75 87 0.77 0.72 0.75 87
Macro avg 0.82 0.82 0.82 279 0.82 0.81 0.82 279
Weighted avg 0.84 0.85 0.85 279 0.84 0.85 0.84 279
SVM Fact 0.87 0.96 0.91 192 0.87 0.93 0.85 0.89 192 0.85
Myth 0.88 0.69 0.77 87 0.72 0.86 0.79 87
Macro avg 0.88 0.82 0.84 279 0.83 0.86 0.84 279
Weighted avg 0.88 0.87 0.87 279 0.87 0.85 0.86 279
NB Fact 0.73 0.99 0.84 192 0.74 0.85 0.97 0.91 192 0.86
Myth 0.94 0.18 0.31 87 0.90 0.62 0.73 87
Macro avg 0.84 0.59 0.57 279 0.87 0.79 0.82 279
Weighted avg 0.80 0.74 0.67 279 0.87 0.86 0.85 279
DT Fact 0.87 0.82 0.8

### **Creating the Vector Store**

In [None]:
# Cohere Embeddings
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
cohere_embeddings = CohereEmbeddings(
    model="embed-english-v3.0",
    # model="embed-english-v2.0",
    cohere_api_key=COHERE_API_KEY)


# model="sentence-transformers/all-MiniLM-L12-v2"
# embeddings = HuggingFaceEndpointEmbeddings(model=model)

# Create Deep Lake Vector Store
ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
my_activeloop_org_id = "danllm"
my_activeloop_dataset_name = "research_assisant"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

db = DeepLake(dataset_path=dataset_path, embedding=cohere_embeddings,overwrite=True)
db.add_texts(all_texts,all_metadatas)

# Read mode for vector store
# db = DeepLake(dataset_path=dataset_path, embedding=cohere_embeddings,read_only=True)



### **RetrievalQAWithSourcesChain**

In [12]:
search_kwargs = {"k":4}

retriever = db.as_retriever(search_kwargs=search_kwargs)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever = retriever
)

In [13]:
input_data = {
    "question": "what did Mehedi Tajrian discover in there research?"
}
d_response = qa.invoke(input_data)

In [None]:
print(d_response['question'])
print(d_response['answer'])
print("Source(s):")
for source in d_response['sources'].split(";"):
    for source in source.split(","):
        if source:
            print(source.strip())
        else:
            print('Not source')

what did Mehedi Tajrian discover in there research?
 Mehedi Tajrian discovered that structured data was analyzed using two feature extraction methods and six traditional classification models, with LR achieving the highest accuracy rate using BoW. The study also used a deep learning model, and cross-validation was performed using k-fold and loo methods. The data used in this study is openly available at https://researchoutput.csu.edu.au/en/datasets/myths-and-facts-data-on-child-development.

Source(s):
Title: Analysis of child development facts and myths using text mining techniques and classification models
Page: 15


### **The Prompt**

In [14]:
prompt_template_researcher = """
As an NLP researcher, Give an in depth explanation for your the text the given text below in 5 bulletins
Text: {text}
Your answer should be 
{text}\n\n\n

Full description:
"""
prompt_researcher_input_variables = ['text']
prompt_researcher = PromptTemplate(
    template=prompt_template_researcher,
    input_variables=prompt_researcher_input_variables
)

### **RetrievalQAWithSourcesChain With Prompting**

In [15]:
input_data = {'question':"Why did Mehedi Tajrian analyse child development and what was the best classifier?"}
search_kwargs = {"k":4}
retriever = db.as_retriever(search_kwargs=search_kwargs)
qa = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever = retriever
)
d_response = qa.invoke(input_data)
print(f"{d_response['question']}\n")
print(f"Final Answer:\n{d_response['answer']}")
# Extract the answer
researcher_input_data = {"text":d_response['answer'].strip()}
llm_researcher_chain = prompt_researcher | chat_model
# Summarise the answer
summariser_response = llm_researcher_chain.invoke(researcher_input_data)
print(f"Summarised answer:\n{summariser_response.content}")
print("Source(s):")
sources = re.split(r'[;]', d_response['sources'])
for source in sources:
    source = source.strip()
    if source:
        print(source)
    else:
        print("No sources")

Why did Mehedi Tajrian analyse child development and what was the best classifier?

Final Answer:
 Mehedi Tajrian analyzed child development using text mining techniques and the best classifier found was Logistic Regression (LR) with a Bag-of-Words (BoW) feature extraction method.

Summarised answer:
 1. Mehedi Tajrian conducted a study on child development, utilizing text mining techniques. Text mining, also known as text analytics, is a sub-field of data mining that focuses on extracting useful information and knowledge from unstructured text data, such as emails, social media posts, and research articles.

2. To classify the child development data, Tajrian employed Logistic Regression (LR) as the best-performing classifier. Logistic Regression is a statistical model used for binary classification problems, where the goal is to predict whether an observation belongs to one of two classes. It is a popular choice in machine learning due to its simplicity, interpretability, and good per

### **RetrievalQAWithSourcesChain With Reranker**

In [16]:
compressor = langchain_cohere_reranker(model='rerank-english-v3.0')
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,base_retriever=retriever
)

compressed_docs = compression_retriever.invoke("Why did Mehedi Tajrian analyse child development and what was the best classifier?")

In [17]:
compressed_docs

[Document(metadata={'source': 'Title: Analysis of child development facts and myths using text mining techniques and classification models, Page: 1', 'relevance_score': 0.997851}, page_content='Research article\nAnalysis of child development facts and myths using text mining\ntechniques and classification models\nMehedi Tajrian\n**\n, Azizur Rahman , Muhammad Ashad Kabir\n*\n, Md Rafiqul Islam\nSchool of Computing, Mathematics and Engineering, Charles Sturt University, NSW, Australia\nARTICLE INFO\nKeywords:\nMisinformation\nMyth\nText mining\nMachine learning\nDeep learning\nABSTRACT\nThe rapid dissemination of misinformation on the internet complicates the decision-making\nprocess for individuals seeking reliable information, particularly parents researching child\ndevelopment topics. This misinformation can lead to adverse consequences, such as inappropriate\ntreatment of children based on myths. While previous research has utilized text-mining tech-\nniques to predict child abuse c

In [None]:
print(compressed_docs[1].page_content[:500]) # shorten thee output for testing purposes

structured data was analysed using two feature extraction method and six traditional classification models and one deep learning
model, with LR achieving the highest accuracy rate using BoW. Cross-validation was performed using k-fold and loo methods,
providing confidence in our findings. Cost-effectiveness evaluation is done using a performance indicator. Among them, LR achieves
the highest accuracy rate using BoW with overall optimum testing time and robust testing time per statement. In addit


In [18]:
print(compressed_docs[1].metadata)

{'source': 'Title: Analysis of child development facts and myths using text mining techniques and classification models, Page: 15', 'relevance_score': 0.9458012}


In [24]:
retrieved_chunks = [doc.page_content for doc in compressed_docs]

# format prompt
final_answer = "\n\n".join(retrieved_chunks).strip()
print(f"Final Answer:\n{final_answer}")
try:
    sources = []
    for i in range(len(compressed_docs)):
        source = compressed_docs[i].metadata['source']
        if not any(source == s for s in sources):
            sources.append(source)
except IndexError as error:
    print("No sources")
    sources.append("No sources")

print("Source(s):")
for source in sources:
    print(f"- {source}")

Final Answer:
Research article
Analysis of child development facts and myths using text mining
techniques and classification models
Mehedi Tajrian
**
, Azizur Rahman , Muhammad Ashad Kabir
*
, Md Rafiqul Islam
School of Computing, Mathematics and Engineering, Charles Sturt University, NSW, Australia
ARTICLE INFO
Keywords:
Misinformation
Myth
Text mining
Machine learning
Deep learning
ABSTRACT
The rapid dissemination of misinformation on the internet complicates the decision-making
process for individuals seeking reliable information, particularly parents researching child
development topics. This misinformation can lead to adverse consequences, such as inappropriate
treatment of children based on myths. While previous research has utilized text-mining tech-
niques to predict child abuse cases, there has been a gap in the analysis of child development
myths and facts. This study addresses this gap by applying text mining techniques and classifi-
cation models to distinguish between myth

### **RetrievalQAWithSourcesChain With both Reranker and Prompting**

In [28]:
# Final Answer
# print(f"Final Answer:\n{final_answer}")
# Extract the answer
researcher_input_data = {"text":final_answer.strip()}
llm_researcher_chain = prompt_researcher | chat_model
# Resercher answer
researcher_response = llm_researcher_chain.invoke(researcher_input_data)
print(f"Researcher answer:\n{researcher_response.content}")
try:
    sources = []
    for i in range(len(compressed_docs)):
        source = compressed_docs[i].metadata['source']
        if not any(source == s for s in sources):
            sources.append(source)
except IndexError as error:
    print("No sources")
    sources.append("No sources")

print("Source(s):")
for source in sources:
    print(f"- {source}")

Researcher answer:
 1. The research article, titled "Analysis of child development facts and myths using text mining techniques and classification models," is a study conducted by Mehedi Tajrian, Azizur Rahman, Muhammad Ashad Kabir, and Md Rafiqul Islam from the School of Computing, Mathematics and Engineering at Charles Sturt University, NSW, Australia.

2. The study aims to address the gap in the analysis of child development myths and facts using modern data science techniques. It leverages newly gathered data from publicly available websites to distinguish between myths and facts about child development.

3. The research methodology involves several stages: pre-processing the data using text mining techniques for enhanced accuracy, analyzing the structured data using six Machine Learning (ML) classifiers, one Deep Learning (DL) model, and two feature extraction techniques, and performing cross-validation using both k-fold and leave-one-out methods to ensure the reliability of the r

### **Using the QA Pipeline**

In [31]:
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever
)

input_data = {
    "query": "Why did Mehedi Tajrian analyse child development and what was the best classifier?"
}
qa.invoke(input_data)

{'query': 'Why did Mehedi Tajrian analyse child development and what was the best classifier?',
 'result': ' Mehedi Tajrian analyzed child development to provide valuable insights and improve the accuracy of information parents use in decision-making. The best classifier found in the study was Logistic Regression (LR), which achieved the highest accuracy rate of 90% using the Bag-of-Words (BoW) feature extraction technique.'}

In [32]:
d_response = qa.invoke(input_data)
print(f"{d_response['query']}\n")
print(f"Final Answer:\n{d_response['result']}")

Why did Mehedi Tajrian analyse child development and what was the best classifier?

Final Answer:
 Mehedi Tajrian analyzed child development to provide valuable insights and improve the accuracy of information parents use in decision-making. The best classifier found in the study was Logistic Regression (LR), which achieved the highest accuracy rate of 90% using the Bag-of-Words (BoW) feature extraction technique.


### **Using QA Pipeline With Prompting**

In [35]:
d_response = qa.invoke(input_data)
print(f"{d_response['query']}\n")
print(f"Final Answer:\n{d_response['result']}\n")
# Extract the answer
researcher_input_data = {"text":d_response['result'].strip()}
llm_researcher_chain = prompt_researcher | chat_model
# Researcher  answer
researcher_response = llm_researcher_chain.invoke(researcher_input_data)
print(f"Researcher answer:\n{researcher_response.content}")

Why did Mehedi Tajrian analyse child development and what was the best classifier?

Final Answer:
 Mehedi Tajrian analyzed child development to provide valuable insights and improve the accuracy of information parents use in decision-making. The best classifier found in the study was Logistic Regression (LR), which achieved the highest accuracy rate of 90% using the Bag-of-Words (BoW) feature extraction technique.

Researcher answer:
 1. Mehedi Tajrian conducted a study on child development with the aim of providing valuable insights and enhancing the accuracy of information that parents use in decision-making. The focus of the study was to develop a model that could effectively analyze and classify various aspects of child development.

2. The study identified Logistic Regression (LR) as the best classifier, which demonstrated the highest accuracy rate of 90%. Logistic Regression is a statistical model used for binary classification problems, making it suitable for the study's purpose

### **Using QA Pipeline With Reranker**
- Same as RetrievalQAWithSourcesChain because they must use the base retriever

### **Using QA Pipeline With Both Reranker and Prompting**
- Same as RetrievalQAWithSourcesChain because they must use the base retriever

### **Using Similarity Search without prompting**

In [36]:
query = "Why did Mehedi Tajrian analyse child development and what was the best classifier?"

docs = db.similarity_search(query=query, k=3)
# retrieve relevant chunks
retrieved_chunks = [doc.page_content for doc in docs]
# format prompt
final_answer_resource = "\n\n".join(retrieved_chunks).strip()

print(f"{query}\n")
print(f"Final Answer:\n{final_answer_resource}\n")

Why did Mehedi Tajrian analyse child development and what was the best classifier?

Final Answer:
structured data was analysed using two feature extraction method and six traditional classification models and one deep learning
model, with LR achieving the highest accuracy rate using BoW. Cross-validation was performed using k-fold and loo methods,
providing confidence in our findings. Cost-effectiveness evaluation is done using a performance indicator. Among them, LR achieves
the highest accuracy rate using BoW with overall optimum testing time and robust testing time per statement. In addition to shedding
light on myths and facts regarding child development, this study provides a solid base for future research in the field.
Data availability statement
The data used in this study are openly available at https://researchoutput.csu.edu.au/en/datasets/myths-and-facts-data-on-child-
development.
CRediT authorship contribution statement
Mehedi Tajrian: Writing – original draft, Visualizatio

### **Similarity Search With Prompting**

In [None]:
# Extract the answer
researcher_input_data = {"text":final_answer_resource}
llm_researcher_chain = prompt_researcher | chat_model
researcher_response = llm_researcher_chain.invoke(researcher_input_data)
print(f"Researcher answer:\n{researcher_response.content}")
try:
    sources = []
    for i in range(len(docs)):
        source = docs[i].metadata['source']
        if not any(source == s for s in sources):
            sources.append(source)
except IndexError as error:
    print("No sources")
    sources.append("No sources")

print("Source(s):")
for source in sources:
    print(f"- {source}")

Researcher answer:
 1. The text discusses a study that analyzes structured data related to myths and facts about child development using various feature extraction methods, traditional classification models, and a deep learning model.

2. The study found that Logistic Regression (LR) achieved the highest accuracy rate of 90% when using the Bag-of-Words (BoW) feature extraction technique.

3. Cross-validation was performed using k-fold and leave-one-out methods to ensure the reliability of the results.

4. The study also evaluated the cost-effectiveness of the models, with LR being the most cost-effective due to its high accuracy and low testing time per statement.

5. The findings of this study provide a solid base for future research in the field and aim to help parents make informed decisions about their children's development by debunking myths and providing accurate information. The data used in the study is openly available online. The study was conducted by a team of researchers 

### **Conclusion**
- As seen the use of a prompt, helped refined the output of the retriever thereby giving a more refined output for the user when asking questions about the resources stored in the vector database