In [70]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import requests
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from kaggle_secrets import UserSecretsClient
from pypdf import PdfReader
import weaviate
from weaviate.embedded import EmbeddedOptions
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
from langchain.document_loaders import TextLoader
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/papers-dataset/Publications/LEE.pdf
/kaggle/input/papers-dataset/Publications/Dawson.pdf
/kaggle/input/papers-dataset/Publications/Qiu.pdf
/kaggle/input/papers-dataset/Publications/15_Nazneen.pdf
/kaggle/input/papers-dataset/Publications/Abbas_2020.pdf
/kaggle/input/papers-dataset/Publications/Tariq_2019.pdf
/kaggle/input/papers-dataset/Publications/22_Ouss_ASD.pdf
/kaggle/input/papers-dataset/Publications/Asd_Cry_patterns.pdf
/kaggle/input/papers-dataset/Publications/zhao2020.pdf
/kaggle/input/papers-dataset/Publications/Abbas_2018.pdf
/kaggle/input/papers-dataset/Publications/carpenter2020 (1).pdf
/kaggle/input/papers-dataset/Publications/Young_Behavior.pdf
/kaggle/input/papers-dataset/Publications/1_Ramırez-Duque_.pdf
/kaggle/input/papers-dataset/Publications/Tariq2018.pdf
/kaggle/input/papers-dataset/Publications/Patten_Audio.pdf


Installing required libraries

In [27]:
!pip install langchain openai weaviate-client tiktoken pypdf

OpenAI Key

In [10]:
# Read the secret stored under "OPENAI-API-KEY" via Kaggle's UserSecretsClient and
# expose it to this process as the OPENAI_API_KEY environment variable, so the key
# itself never appears in the notebook.
os.environ["OPENAI_API_KEY"] = UserSecretsClient().get_secret("OPENAI-API-KEY")

Text preprocessor that removes citations, figure and table labels, in-text citations, the reference list, e-mail addresses and stop words from the extracted text, and then applies stemming.

In [35]:
# Shared NLP resources used by preprocess_text; built once when this cell runs.
porter_stemmer = PorterStemmer()                # NLTK Porter stemmer
stop_words = set(stopwords.words('english'))    # NLTK English stop-word list, as a set
nlp = spacy.load('en_core_web_sm')              # spaCy pipeline used for tokenization

def preprocess_text(text):
    """Clean a block of extracted paper text and return it stop-word-free and stemmed.

    The text is first stripped of bracketed citations, figure/table labels,
    ``(Author, year)`` citations, everything following a ``References`` heading,
    e-mail addresses, a trailing author/organisation line, section numberings
    and ISBNs. It is then tokenized with the module-level spaCy pipeline
    ``nlp``, filtered against ``stop_words`` and stemmed with ``porter_stemmer``.

    Args:
        text: Raw text, e.g. the text of one PDF page. ``None`` or an empty
            string is accepted.

    Returns:
        The surviving stemmed tokens joined by single spaces; ``''`` when
        there is nothing to process.
    """
    # Nothing to clean (e.g. a page without extractable text).
    if not text:
        return ''

    # Remove citations, figures, tables, in-text citations
    text = re.sub(r'\[.*?\]', '', text)  # Bracketed citations such as [12]
    # The period is optional so "Figure 1" / "Table 1" are removed as well as
    # "Fig. 1"; the previous patterns only matched when a literal "." was present.
    text = re.sub(r'Fig(?:ure)?\.? \d+', '', text)  # Figure labels
    text = re.sub(r'Table\.? \d+', '', text)  # Table labels
    text = re.sub(r'\([A-Za-z]+, \d+\)', '', text)  # (Author, 2020) style citations

    # Remove reference citation list: with DOTALL, ".*" swallows everything
    # from the first "References" heading to the end of the text.
    text = re.sub(r'References\s*[\n\r]+.*', '', text, flags=re.DOTALL)

    # Remove email addresses. The TLD class is [A-Za-z]; the former
    # [A-Z|a-z] also accepted a literal "|" character.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove author/organization/publisher details. The pattern is anchored
    # with "$" and re.MULTILINE is not set, so only the tail of the text is affected.
    text = re.sub(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)?\b(?:,? (?:Inc\.|Corp\.|LLC))?[\s-]*$', '', text)

    # Remove section headings/numberings/ISBNs
    text = re.sub(r'\b(?:[A-Z]\d+\.)+\s?', '', text)
    text = re.sub(r'ISBN(?:-10)?:?\s?\d+[-\s]?\d+[-\s]?\d+[-\s]?\d+', '', text)

    # Tokenize with spaCy, drop stop words (case-insensitively) and stem the rest.
    doc = nlp(text)
    stemmed_tokens = [
        porter_stemmer.stem(token.text)
        for token in doc
        if token.text.lower() not in stop_words
    ]

    # Join tokens back into text
    return ' '.join(stemmed_tokens)

Variable to store combined text of all the pdfs.

In [28]:
# Start from an empty string; the PDF-reading cell below stores the combined,
# preprocessed text of all PDFs under this name.
parent_text = ''

In [37]:
# Path to the folder containing PDF files
folder_path = "/kaggle/input/papers-dataset/Publications"

# Preprocessed text of every page is collected here and joined once at the end.
# Building the result from scratch (instead of appending to an existing string)
# keeps this cell idempotent: re-running it does not duplicate the corpus.
preprocessed_pages = []

# Iterating through all files in the folder
for filename in os.listdir(folder_path):
    # Skipping anything that is not a PDF
    if not filename.endswith(".pdf"):
        continue
    file_path = os.path.join(folder_path, filename)
    # Opening the PDF file in binary mode
    with open(file_path, "rb") as f:
        # Creating a PDF reader object
        pdf_reader = PdfReader(f)
        # Visiting each page of the PDF, extracting its text and preprocessing it
        for page in pdf_reader.pages:
            pdf_to_text = page.extract_text()
            # Preprocess the page that was just extracted. The previous code passed
            # an undefined name `text`, which raises NameError on a fresh kernel
            # (or silently reuses a stale global).
            preprocessed_pdf_to_text = preprocess_text(pdf_to_text)
            preprocessed_pages.append(preprocessed_pdf_to_text)

# Combined text of all PDFs. Pages are separated by a space (tokens inside a page
# are space-joined too) so the last word of one page is not fused with the first
# word of the next.
parent_text = ' '.join(preprocessed_pages)

Getting a glimpse of text

In [39]:
parent_text[0:2000]

'sensor \n letter \n deep - learn - base detect infant \n autism spectrum disord use auto - encod \n featur represent \n jung hyuk lee1 , geon woo lee1 , guiyoung bong2 , hee jeong yoo2,3and hong kook kim1 , * \n 1school electr engin comput scienc , gwangju institut scienc technolog , \n gwangju 61005 , korea ;   ( j.h.l. ) ;   ( g.w.l. ) \n 2depart psychiatri , seoul nation univers bundang hospit , seongnam - si , \n gyeonggi - 13620 , korea ;   ( g.b. ) ;   ( h.j.y. ) \n 3depart psychiatri , colleg medicin , seoul nation univers , seoul 03980 , korea \n * correspond : \n receiv : 29 octob 2020 ; accept : 24 novemb 2020 ; publish : 26 novemb 2020 \n /gid00030 / gid00035 / gid00032 / gid00030 / gid00038 / gid00001 / gid00033 / gid00042 / gid00045 /gid00001 \n /gid00048 / gid00043 / gid00031 / gid00028 / gid00047 / gid00032 / gid00046 \n abstract : autism spectrum disord ( asd ) development disord life - span disabl . \n diagnost instrument develop qualiﬁ base accuraci \n discrimin chil

Saving the text to a file. Because of limited resources, only the first 1/40th of the combined text is kept.

In [91]:
# Persist only the leading 1/40th of the combined text to the working directory.
with open("/kaggle/working/documents.txt", 'w') as out_file:
    out_file.write(parent_text[:len(parent_text) // 40])

Chunking the text

In [85]:
# Load the text file written above back in as LangChain document(s).
loader = TextLoader('/kaggle/working/documents.txt')
documents = loader.load()
# Split into chunks of size 500 with an overlap of 50 between neighbouring chunks
# (presumably measured in characters, the splitter's default — confirm in the LangChain docs).
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

Generating vector embeddings and storing them to Weaviate vector database.

In [1]:
# Weaviate client configured with EmbeddedOptions (no external server URL is given).
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Embed every chunk with OpenAIEmbeddings and store the result in Weaviate.
vectorstore = Weaviate.from_documents(
    client=client,
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    by_text=False,
)

'\nclient = weaviate.Client(\n  embedded_options = EmbeddedOptions()\n)\n\nvectorstore = Weaviate.from_documents(\n    client = client,    \n    documents = chunks,\n    embedding = OpenAIEmbeddings(),\n    by_text = False\n)\n'

Retrieving documents

In [87]:
# Expose the vector store through LangChain's retriever interface (default settings);
# the RAG chain below uses it to fetch context for each query.
retriever = vectorstore.as_retriever()

Setting the prompt

In [88]:
# Prompt text sent to the chat model. {question} and {context} are the two
# template variables filled in for each query.
template = """You are an assistant for question-answering tasks. You are expected to generate
top 5 most similar relevant research findings on Autism, Therapy, and Intervention
based on a user query.
Question: {question} 
Context: {context} 
Answer:
"""
# Build a chat prompt (a single human message) from the template string.
prompt = ChatPromptTemplate.from_template(template)

# Show the parsed prompt, including the input variables it detected.
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='You are an assistant for question-answering tasks. You are expected to generate\ntop 5 most similar relevant research findings on Autism, Therapy, and Intervention\nbased on a user query.\nQuestion: {question} \nContext: {context} \nAnswer:\n'))]


In [103]:
# Evaluation questions; each one is sent through the RAG chain further below.
# NOTE(review): "caysed" in the second query looks like a typo for "caused" — the
# string is left untouched here because it is sent verbatim to the retriever/LLM.
queries = [
    "What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children?",
    "What is Autism Spectrum Disorder, how it is caysed?",
    "What is the cure of Autism Spectrum Disorder?",
    "What are Stereotypical and maladaptive behaviors in Autism Spectrum, how are these detected and managed?",
    "How relevant is eye contact and how it can be used to detect Autism?",
    "How can cross country trials help in development of Machine learning based Multimodal solutions?",
    "How early infants cry can help in the early detection of Autism?",
    "What are various methods to detect  Atypical Pattern of Facial expression in Children?",
    "What kind of facial expressions can be used to detect Autism Disorder in children?",
    "What are methods to detect Autism from home videos?",
    "What is Still-Face Paradigm in Early Screening for High-Risk Autism Spectrum Disorder?",
    "What is West Syndrome?",
    "What is the utility of Behavior and interaction imaging at 9 months of age predict autism/intellectual disability in high-risk infants with West syndrome?"
]

We use gpt-4-0125-preview for its larger context length.

In [106]:
# Chat model used to generate the answers; temperature is set to 0.
llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)

# Pipeline: the incoming query goes to the retriever (as "context") and is passed
# through unchanged (as "question"); both fill the prompt, which is sent to the
# LLM, and the reply is parsed into a plain string.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Run the chain once per query, keeping the answers in query order.
answers = [rag_chain.invoke(query) for query in queries]

'\nllm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)\n\nrag_chain = (\n    {"context": retriever,  "question": RunnablePassthrough()} \n    | prompt \n    | llm\n    | StrOutputParser() \n)\n\n# Invoking rag langchain for each query and storing answers\nanswers = []\nfor query in queries:\n    answer = rag_chain.invoke(query)\n    answers.append(answer)\n'

Printing answers

In [102]:
# Print every query (numbered from 1) followed by its generated answer.
for idx, answer in enumerate(answers):
    print("Query", idx + 1, "is:", queries[idx])
    print("Answer is as follows:\n")
    print(answer)
    print("=========================================\n")

Query 1 is: What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children
Answer is as follows:

Based on the provided document, here are the top 5 most similar relevant research findings on Autism, Therapy, and Intervention, focusing on multimodal and multi-modular AI approaches to streamline autism diagnosis in young children:

1. **Auto-Encoder Feature Representation for Autism Spectrum Disorder Detection**: The document discusses the development of an automated diagnostic method for Autism Spectrum Disorder (ASD) using deep learning-based models. A pre-trained feature extraction auto-encoder model with a joint optimization scheme is introduced to achieve robust performance on widely distributed and unrefined data. This approach shows promise in detecting ASD in infants compared to using raw datasets directly.

2. **Utilization of Vocal Characteristics in ASD Diagnosis**: The research highlights the significance of vocal characte