# Literature review assistant

The aim of this project is to create an AI-powered literature review assistant called AIRA. This research assistant will use GPT 3.5

# Libraries

In [44]:
!pip install PyPDF2 gdown openai tiktoken pypdf chromadb langchain

from PyPDF2 import PdfReader
import os
import glob
import pandas as pd
import re
import tiktoken



In [45]:
from PIL import Image
from io import BytesIO

# import pytesseract
# Specify the path where Tesseract-OCR was installed
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
# from pytesseract import Output
import re
import glob
import os
import PIL.Image
from PIL import Image
#from pdf2image import convert_from_path

In [46]:
# Chat GPT
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import chromadb
from openai import OpenAI

In [47]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



# Creating the database

We first extract the text from the PDF of our literature review. The final output of this section is a dataframe with two columns, the first one is the name od the PDF, the second one is the extracted text of the

In [48]:
def pdf_to_text(pdf_path):
    '''
    Objective:
        This functions transforms a pdf to a text where we can apply text information retrieval

    Input:
        pdf_path (str) : The path where the pdf is located, including the pdf name.

    Output:
        It returns the text of the pdf
    '''
    reader   = PdfReader( pdf_path )
    n_pages  = len( reader.pages )
    print( f'Number of pages: { n_pages }' )

    try:
        extracted_text = [ reader.pages[ i ].extract_text() for i in range( n_pages ) ]
        print( 'Text successfully extracted' )

    except:
        extracted_text = []
        print( 'Text not found' )


    combined_text = '\n'.join( extracted_text )


    return combined_text

In [49]:
def extract_text( combined_text , start_pattern , end_pattern):
    '''
    Objective:
        This function takes a text and extracts the patter indicated by the start_patter and end_patter inputs.

    Input:
        combined_text (str) : The text where we can extract information.

        start_pattern (str) : The starting pattern.

        end_pattern (str) : The ending pattern.
    '''
    start_match   = re.search( start_pattern, combined_text, re.IGNORECASE )
    end_match     = re.search( end_pattern, combined_text[ start_match.end(): ], re.IGNORECASE )

    end_index     = start_match.end() + end_match.start()
    article_text  = combined_text[ start_match.end(): end_index ].strip()

    article_text = article_text.split('\n')

    return article_text


In [50]:
folder_path = '/content/drive/MyDrive/Hackaton/Literature'
data_dict = {'Filename': [], 'PDF_text': []}

# for pattern in patterns:
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)

    try:
        file_pdf = pdf_to_text(file_path)

    except:
        print(f'{filename} is not a file')
        continue

    data_dict['Filename'].append(filename)
    data_dict['PDF_text'].append(file_pdf)

final_dataframe = pd.DataFrame(data_dict)

Number of pages: 20
Text successfully extracted
Number of pages: 29
Text successfully extracted
Number of pages: 23
Text successfully extracted


In [51]:
final_dataframe.iloc[0, 1]

'Vol.:(0123456789)Social Indicators Research (2020) 152:971–990\nhttps://doi.org/10.1007/s11205-020-02476-8\n1 3\nORIGINAL RESEARCH\nImpact of\xa0Income, Deprivation and\xa0Social Exclusion \non\xa0Subjective Poverty: A\xa0Structural Equation Model \nof\xa0Multidimensional Poverty in\xa0Hong Kong\nSiu\xa0Ming\xa0Chan1 \xa0· Hung\xa0Wong1 \nAccepted: 20 August 2020 / Published online: 27 August 2020 \n© Springer Nature B.V. 2020\nAbstract\nMultidimensional poverty in urban cities has become an increasing global concern. Income \npoverty, deprivation, social exclusion and subjective poverty have been commonly used \nas measurements for poverty. However, the path relationship among these various dimen-sions has been ignored. This study aims to fill this research gap by focusing on the impact \non subjective poverty. A random sample survey of 1979 adult participants in Hong Kong \nwas used for the analysis. Structural equation modelling was applied in studying the path \nrelationship among

In [52]:
final_dataframe

Unnamed: 0,Filename,PDF_text
0,"2020_Chan_Impact of income, deprivation and so...",Vol.:(0123456789)Social Indicators Research (2...
1,2020_Costa_Fuzzy poverty measurement_multi and...,"Costa, Michele\nWorking Paper\nFuzzy poverty m..."
2,2020_Klasen_Diverging identification of the po...,Diverging identiﬁcation of the poor: A non-ran...


#Chunks and tokenization

We then proceed to divide our information in big chunks because each PDF contains a huge number of token. This will allow GPT 3.5 to read the information without surpassing the limit of tokens

In [53]:
final_dataframe["PDF_text"]

0    Vol.:(0123456789)Social Indicators Research (2...
1    Costa, Michele\nWorking Paper\nFuzzy poverty m...
2    Diverging identiﬁcation of the poor: A non-ran...
Name: PDF_text, dtype: object

In [54]:
tokenizer = tiktoken.get_encoding("cl100k_base")

In [55]:
# We must separate the text into chunks
def tokenCounter(text):
    return len(tokenizer.encode(text))

In [56]:
def generate_chunks(row):
    # Reducir el tamaño del chunk para generar menos chunks
    textSplitter = RecursiveCharacterTextSplitter(
        chunk_size=6500,  # Reducir el tamaño del chunk aún más
        chunk_overlap=100,  # Reducir el solapamiento del chunk a la mitad
        length_function=tokenCounter,
        separators=["\n\n", ".", "\n", " "]
    )
    chunks = textSplitter.create_documents([row["PDF_text"]], metadatas=[{"id": f"pdf{row.name + 1}"}])
    return f"pdf{row.name + 1}", chunks

In [57]:
# Aplicamos la función generate_chunks a cada fila del dataframe y convertimos los resultados en un diccionario
chunks_dict = dict(final_dataframe.apply(generate_chunks, axis=1).values)

In [58]:
tokenCounter(str(chunks_dict.values()))

75741

In [59]:
OPENAI_API_KEY = "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"

In [60]:
openaiEmbedding = OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name="text-embedding-3-small"
)

In [61]:
#  import chromadb

#  chromaClient = chromadb.PersistentClient()
#  collection = chromaClient.create_collection(
#      name="aira_collection_FINAL_FFF",
#      embedding_function=openaiEmbedding,
#      #metadata={"hnsw:space": "cosine"}
#  )

In [62]:
#  collection.add(
#          documents=[document.page_content for document in chunks],
#          #metadatas=[document.metadata for document in chunks],
#          ids=[f"id{i+1}" for i in range(len(chunks))]
#  )

In [63]:
# tokenCounter(str(collection.get()))

# Functions and prompts

In [64]:
client = OpenAI(api_key = "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")

In [72]:
DELIMITER = "####"

def getCompletionFromMessages(
        query,
        messages,
        model="gpt-4",
        temperature=0,
        delimiter=DELIMITER
):
    query = f"{DELIMITER}{query}{DELIMITER}"
    messages = [{"role": "user", "content": query}]
    response = client.chat.completions.create(
        messages=messages,
        temperature=temperature,
        model=model
    )
    responseContent = response.choices[0].message.content
    messages = [{"content": responseContent, "role": "assistant"}]
    return messages


results = []

for i in range(1, 4):  # Iterar sobre pdf1, pdf2, pdf3, ...
    current_pdf = f"pdf{i}"

    system_prompt = f"""
    You are a literature review assistant. You will be given a dictionary of\
    papers stored in {str(chunks_dict[current_pdf][0])}. You will be asked to analyse the text of each paper  \
    and give a response to a series of questions. You will answer in a \
    professional manner using academic language. Your responses must be concise, \

    Question 1: Title (string)

    Question 2: Author (string)

    Question 3: Journal of publication (string)

    Question 4: Year of publication (numeric)

    Question 5: Keywords after the abstract (string)

    Question 6: Author's institutional affiliation

    Question 7:  DOI (string)

    Use the following JSON format for your response:
    Question 1 : Response 1
    Question 2 : Response 2
    Question N : Response N
    """

    messages = [{"role": "user", "content": system_prompt}]

    completion_result = getCompletionFromMessages(system_prompt, messages)
    results.append(completion_result)

In [73]:
getCompletionFromMessages(system_prompt, messages)


[{'content': '{\n"Question 1" : "Diverging identiﬁcation of the poor: A non-random process. Chile 1992–2017",\n"Question 2" : "Stephan Klasena, Carlos Villalobosb",\n"Question 3" : "World Development",\n"Question 4" : 2020,\n"Question 5" : "Multidimensional poverty, Monetary poverty, Identiﬁcation of the poor, Measures of association, Household ineligibility",\n"Question 6" : "aUniversity of Göttingen, Germany, bUniversidad de Talca, Chile",\n"Question 7" : "https://doi.org/10.1016/j.worlddev.2020.104944"\n}',
  'role': 'assistant'}]

In [74]:
results

[[{'content': '{\n"Question 1": "Impact of Income, Deprivation and Social Exclusion on Subjective Poverty: A Structural Equation Model of Multidimensional Poverty in Hong Kong",\n"Question 2": "Siu Ming Chan, Hung Wong",\n"Question 3": "Social Indicators Research",\n"Question 4": 2020,\n"Question 5": "Subjective poverty, Deprivation, Social exclusion, Hong Kong, Multidimensional poverty, Structural equation model",\n"Question 6": "Department of Social Work, The Chinese University of Hong Kong",\n"Question 7": "https://doi.org/10.1007/s11205-020-02476-8"\n}',
   'role': 'assistant'}],
 [{'content': '{\n"Question 1": "Fuzzy poverty measurement: Multidimensional and unidimensional indicators",\n"Question 2": "Michele Costa",\n"Question 3": "Quaderni - Working Paper DSE, No. 1156",\n"Question 4": 2020,\n"Question 5": "Multidimensional poverty index, Fuzzy indicators, Simulation study",\n"Question 6": "University of Bologna, Department of Economics",\n"Question 7": "https://doi.org/10.6092/

In [75]:
data = {}

for i, result in enumerate(results):
    pdf_name = f"pdf{i + 1}"
    content = result[0]["content"]

    # Dividir el contenido en líneas
    lines = content.split('\n')

    # Extraer las respuestas
    for line in lines:
        if ':' in line:
            question, response = map(str.strip, line.split(':', 1))
            # Renombrar las columnas según lo solicitado
            if question == "Question 1":
                question = "Title"
            elif question == "Question 2":
                question = "Author"
            elif question == "Question 3":
                question = "Journal of publication"
            elif question == "Question 4":
                question = "Year of publication"
            elif question == "Question 5":
                question = "Keywords after the abstract"
            elif question == "Question 6":
                question = "JEL codes"
            elif question == "Question 7":
                question = "DOI"

            data.setdefault(question, []).append(response)



In [76]:
data

{'"Question 1"': ['"Impact of Income, Deprivation and Social Exclusion on Subjective Poverty: A Structural Equation Model of Multidimensional Poverty in Hong Kong",',
  '"Fuzzy poverty measurement: Multidimensional and unidimensional indicators",',
  '"Diverging identiﬁcation of the poor: A non-random process. Chile 1992–2017",'],
 '"Question 2"': ['"Siu Ming Chan, Hung Wong",',
  '"Michele Costa",',
  '"Stephan Klasena, Carlos Villalobosb",'],
 '"Question 3"': ['"Social Indicators Research",',
  '"Quaderni - Working Paper DSE, No. 1156",',
  '"World Development",'],
 '"Question 4"': ['2020,', '2020,', '2020,'],
 '"Question 5"': ['"Subjective poverty, Deprivation, Social exclusion, Hong Kong, Multidimensional poverty, Structural equation model",',
  '"Multidimensional poverty index, Fuzzy indicators, Simulation study",',
  '"Multidimensional poverty, Monetary poverty, Identiﬁcation of the poor, Measures of association, Household ineligibility",'],
 '"Question 6"': ['"Department of Soci

In [77]:
# Crear el DataFrame
df = pd.DataFrame(data)

# Agregar una columna con los nombres de los PDFs
df['PDF'] = [f'pdf{i + 1}' for i in range(3)]

# Reordenar las columnas para tener 'PDF' al principio
df = df[['PDF'] + [col for col in df.columns if col != 'PDF']]

# Mapeo de nombres
column_mapping = {
    '"Question 1"': "Title",
    '"Question 2"': "Author",
    '"Question 3"': "Journal of publication",
    '"Question 4"': "Year of publication",
    '"Question 5"': "Keywords after the abstract",
    '"Question 6"': "Authors institutional affiliation",
    '"Question 7"': "DOI"
}

# Renombrar las columnas
df.rename(columns=column_mapping, inplace=True)
df

Unnamed: 0,PDF,Title,Author,Journal of publication,Year of publication,Keywords after the abstract,Authors institutional affiliation,DOI
0,pdf1,"""Impact of Income, Deprivation and Social Excl...","""Siu Ming Chan, Hung Wong"",","""Social Indicators Research"",",2020,"""Subjective poverty, Deprivation, Social exclu...","""Department of Social Work, The Chinese Univer...","""https://doi.org/10.1007/s11205-020-02476-8"""
1,pdf2,"""Fuzzy poverty measurement: Multidimensional a...","""Michele Costa"",","""Quaderni - Working Paper DSE, No. 1156"",",2020,"""Multidimensional poverty index, Fuzzy indicat...","""University of Bologna, Department of Economics"",","""https://doi.org/10.6092/unibo/amsacta/6562"""
2,pdf3,"""Diverging identiﬁcation of the poor: A non-ra...","""Stephan Klasena, Carlos Villalobosb"",","""World Development"",",2020,"""Multidimensional poverty, Monetary poverty, I...","""aUniversity of Göttingen, Germany, bUniversid...","""https://doi.org/10.1016/j.worlddev.2020.104944"""


In [None]:
# try:
#         user_message =  f"{delimiter}{chunks_dict}{delimiter}"

#         messages =  [
#         {'role':'system',
#          'content': system_message},
#         {'role':'user',
#          'content': f"{delimiter}{user_message}{delimiter}"},
#         ]
#         response = get_completion_from_messages(messages)
#         print(response)

#         return response
# except Exception as e:
#         print(f"Error in classify_poverty_description function: {e}")
#         return None  # Return None or any default value

In [None]:
# first_key, first_value = next(iter(chunks_dict.items()))

In [None]:
# for i in range(3):
#   chunks = textSplitter.create_documents(
#       [final_dataframe["PDF_text"][i]],
#       metadatas=[]
#   )
#   getCompletionFromMessages(system_prompt, messages)