Text extraction

## Dirichlet 

In [1]:
# Import necessary libraries
from gensim import corpora
from gensim.models import LdaModel
from pprint import pprint
import pandas as pd
from nltk.corpus import stopwords

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\annab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample documents
path_plot = 'data/plot_summaries.txt'
plot_summary_df = pd.read_csv(path_plot, delimiter='\t', header=None)
plot_summary_df.columns = ['movie_id', 'plot_summary']

In [4]:
plot_summary_df

Unnamed: 0,movie_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
summary = [plot_summary_df.iloc[0,1]]

In [7]:
summary

["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."]

In [8]:
# Tokenize the documents
tokenized_docs = [[word for word in doc.lower().split() if word not in stop_words] for doc in summary]


In [9]:
tokenized_docs

[['shlykov,',
  'hard-working',
  'taxi',
  'driver',
  'lyosha,',
  'saxophonist,',
  'develop',
  'bizarre',
  'love-hate',
  'relationship,',
  'despite',
  'prejudices,',
  'realize',
  'different',
  'all.']]

In [10]:
# Create a dictionary mapping each word to a unique id
dictionary = corpora.Dictionary(tokenized_docs)


In [11]:
# Convert tokenized documents into bag-of-words representation
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]


In [12]:
# Train the LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, alpha = 'auto')


In [13]:
# Print the topics
pprint(lda_model.print_topics())

[(0,
  '0.075*"lyosha," + 0.074*"driver" + 0.071*"shlykov," + 0.068*"prejudices," + '
  '0.068*"despite" + 0.067*"all." + 0.067*"taxi" + 0.067*"bizarre" + '
  '0.066*"love-hate" + 0.066*"develop"'),
 (1,
  '0.074*"realize" + 0.072*"different" + 0.068*"relationship," + '
  '0.068*"hard-working" + 0.068*"saxophonist," + 0.067*"develop" + '
  '0.067*"love-hate" + 0.067*"bizarre" + 0.066*"taxi" + 0.066*"all."')]


In [14]:
from nltk.stem import WordNetLemmatizer

In [15]:
# Sample documents
documents = [
    "Machine learning is an exciting field with endless possibilities.",
    "Natural language processing helps computers understand human language.",
    "Deep learning algorithms are used in various applications such as image recognition and speech synthesis.",
    "Data science involves extracting insights from data through statistical analysis and machine learning techniques."
]
# Tokenize the documents
tokenized_docs = [doc.lower().split() for doc in documents]
#print(tokenized_docs)
# Create a dictionary mapping each word to a unique id
dictionary = corpora.Dictionary(tokenized_docs)
# Convert tokenized documents into bag-of-words representation
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]
# Train the LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary)
# Print the topics
pprint(lda_model.print_topics())

[(0,
  '0.055*"learning" + 0.036*"and" + 0.034*"machine" + 0.031*"exciting" + '
  '0.030*"field" + 0.030*"endless" + 0.030*"with" + 0.030*"image" + 0.030*"is" '
  '+ 0.030*"speech"'),
 (1,
  '0.050*"data" + 0.036*"learning" + 0.035*"machine" + 0.034*"language" + '
  '0.033*"natural" + 0.033*"computers" + 0.033*"human" + 0.033*"and" + '
  '0.033*"helps" + 0.032*"language."')]


In [16]:
type(documents)

list

## LLM

In [17]:
from pypdf import PdfReader
from gensim.utils import simple_preprocess
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI

In [18]:
def preprocess(text, stop_words):
    """
    Tokenizes and preprocesses the input text, removing stopwords and short 
    tokens.

    Parameters:
        text (str): The input text to preprocess.
        stop_words (set): A set of stopwords to be removed from the text.
    Returns:
        list: A list of preprocessed tokens.
    """
    result = []
    for token in simple_preprocess(text, deacc=True):
        if token not in stop_words and len(token) > 3:
            result.append(token)
    return result

In [19]:
def get_topic_lists_from_pdf(summary, num_topics, words_per_topic):
    """
    Extracts topics and their associated words from a PDF document using the 
    Latent Dirichlet Allocation (LDA) algorithm.

    Parameters:
        file (str): The path to the PDF file for topic extraction.
        num_topics (int): The number of topics to discover.
        words_per_topic (int): The number of words to include per topic.

    Returns:
        list: A list of num_topics sublists, each containing relevant words 
        for a topic.
    """
    # Load the pdf file
    #loader = PdfReader(file)

    # Extract the text from each page into a list. Each page is considered a document
    documents= summary
    # for page in loader.pages:
    #     documents.append(page.extract_text())

    # Preprocess the documents
    nltk.download('stopwords')
    stop_words = set(stopwords.words(['english','spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    # Create a dictionary and a corpus
    dictionary = corpora.Dictionary(processed_documents)
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    # Build the LDA model
    lda_model = LdaModel(
        corpus, 
        num_topics=num_topics, 
        id2word=dictionary, 
        passes=15
        )

    # Retrieve the topics and their corresponding words
    topics = lda_model.print_topics(num_words=words_per_topic)

    # Store each list of words from each topic into a list
    topics_ls = []
    for topic in topics:
        words = topic[1].split("+")
        topic_words = [word.split("*")[1].replace('"', '').strip() for word in words]
        topics_ls.append(topic_words)

    return topics_ls

In [20]:
def topics_from_pdf(llm, file, num_topics, words_per_topic):
    """
    Generates descriptive prompts for LLM based on topic words extracted from a 
    PDF document.

    This function takes the output of `get_topic_lists_from_pdf` function, 
    which consists of a list of topic-related words for each topic, and 
    generates an output string in table of content format.

    Parameters:
        llm (LLM): An instance of the Large Language Model (LLM) for generating 
        responses.
        file (str): The path to the PDF file for extracting topic-related words.
        num_topics (int): The number of topics to consider.
        words_per_topic (int): The number of words per topic to include.

    Returns:
        str: A response generated by the language model based on the provided 
        topic words.
    """

    # Extract topics and convert to string
    list_of_topicwords = get_topic_lists_from_pdf(file, num_topics, 
                                                  words_per_topic)
    string_lda = ""
    for list in list_of_topicwords:
        string_lda += str(list) + "\n"

    # Create the template
    template_string = '''Describe the topic of each of the {num_topics} 
        double-quote delimited lists in a simple sentence and also write down 
        three possible different subthemes. The lists are the result of an 
        algorithm for topic discovery.
        Do not provide an introduction or a conclusion, only describe the 
        topics. Do not mention the word "topic" when describing the topics.
        Use the following template for the response.

        1: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        2: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        ...

        n: <<<(sentence describing the topic)>>>
        - <<<(Phrase describing the first subtheme)>>>
        - <<<(Phrase describing the second subtheme)>>>
        - <<<(Phrase describing the third subtheme)>>>

        Lists: """{string_lda}""" '''

    # LLM call
    prompt_template = ChatPromptTemplate.from_template(template_string)
    chain = LLMChain(llm=llm, prompt=prompt_template)
    response = chain.run({
        "string_lda" : string_lda,
        "num_topics" : num_topics
        })

    return response