In [29]:
import PyPDF2
import nltk
import spacy
#nltk.download('stopwords')
from nltk.corpus import stopwords, wordnet
import string
import re
import matplotlib.pyplot as plt
import replicate
import os

In [31]:
# define the class
class EarningsAnalysis:
    """Text-mining helpers for one earnings-call transcript stored as a PDF.

    On construction the PDF is read once and its text is kept in
    ``transcript_text``. The other methods tokenize and lemmatize text,
    count n-grams, plot word frequencies and request an LLM summary.

    Attributes:
        company_name: Lower-case company name; also used as a stopword.
        quarter: Quarter label of the form ``Q<1-4><YY>``, e.g. ``Q224``.
        stopwords_custom: Domain-specific stopwords (all lower-case).
        stopwords: NLTK English stopwords + punctuation + ``stopwords_custom``.
        pdf_path: Path of the transcript PDF.
        transcript_text: Text of the PDF with newlines replaced by spaces.
    """

    def __init__(self, pdf_path: str, company_name: str, quarter: str):
        """Validate the inputs, build the stopword list and load the transcript.

        Args:
            pdf_path: Path of the transcript PDF to read.
            company_name: Company name; must be lower-case.
            quarter: Quarter label, ``Q`` + digit 1-4 + two-digit year.

        Raises:
            ValueError: If ``company_name`` is not lower-case or ``quarter``
                does not have the expected format.
        """
        # Validate company name
        if not company_name.islower():
            raise ValueError("Company name should be in lowercase.")
        self.company_name = company_name

        # Validate quarter format. fullmatch is used because re.match with
        # "^...$" would also accept a value ending in a newline.
        if not re.fullmatch(r"Q[1-4]\d{2}", quarter):
            raise ValueError("Quarter should be in the format Q[1,2,3,4]YY.")
        self.quarter = quarter

        # Domain-specific stopwords. Tokens are lower-cased before they are
        # compared against the stopwords, so the quarter label has to be
        # lower-cased here or it could never match.
        self.stopwords_custom = [self.company_name, self.quarter.lower(), "officer", "analyst", "factset"]
        self.stopwords = stopwords.words('english') + list(string.punctuation) + self.stopwords_custom
        self.pdf_path = pdf_path  # save path to be used in text extraction
        self.transcript_text = self.extract_text_from_pdf()

    def extract_text_from_pdf(self):
        """Extract the text of every page of ``self.pdf_path`` as one string.

        Returns:
            The concatenated page texts with each newline replaced by a space.
        """
        with open(self.pdf_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Collect inside the ``with`` block so the file is still open
            # while pages are read. ``or ""`` is defensive: a falsy result
            # (e.g. None) for a page is treated as empty text.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "".join(page_texts).replace("\n", " ")

    def get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to the matching WordNet POS constant.

        Tags starting with J, V, N or R map to adjective, verb, noun and
        adverb respectively; anything else falls back to noun.
        """
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        # Default to noun if POS tag is not recognized
        return by_initial.get(treebank_tag[:1], wordnet.NOUN)

    def preprocess_words(self, text: str):
        """Tokenize ``text`` into cleaned words and lemmatize them.

        Steps: drop single letters surrounded by whitespace, word-tokenize,
        keep alphanumeric tokens (lower-cased), remove stopwords, POS-tag,
        then lemmatize each token using its POS.

        Returns:
            A tuple ``(stop_removal, stems, pos_tokens)``: the stopword-free
            tokens, their lemmas, and the ``(token, tag)`` pairs.
        """
        singles_removed = re.sub(r'\s+[a-zA-Z]\s', ' ', text)
        tokens = nltk.word_tokenize(singles_removed)
        clean_tokens = [token.lower() for token in tokens if token.isalnum()]
        stop_all = set(self.stopwords)  # set: constant-time membership tests
        stop_removal = [token for token in clean_tokens if token not in stop_all]
        pos_tokens = nltk.pos_tag(stop_removal)

        lemmatizer = nltk.WordNetLemmatizer()
        stems = [
            lemmatizer.lemmatize(word, pos=self.get_wordnet_pos(pos_type))
            for word, pos_type in pos_tokens
        ]
        return stop_removal, stems, pos_tokens

    def ngrams(self, tokens: list, num_ngrams: int):
        """Count n-grams of ``tokens`` and list those free of stopwords.

        Args:
            tokens: Sequence of word tokens.
            num_ngrams: Size ``n`` of the n-grams (1 for unigrams, 2 for bigrams, ...).

        Returns:
            A tuple ``(keywords, ngram_freq)``: the distinct n-grams that
            contain no stopword, and the ``nltk.FreqDist`` of all n-grams.
        """
        ngram_freq = nltk.FreqDist(nltk.ngrams(tokens, num_ngrams))
        stop_all = set(self.stopwords)

        # Check every position of the n-gram. Indexing positions 0 and 1
        # directly fails for unigrams and ignores later tokens when n > 2.
        keywords = [
            ngram for ngram in ngram_freq
            if not any(token in stop_all for token in ngram)
        ]
        return keywords, ngram_freq

    def word_frequency_dist(self, tokens: list):
        """Plot the 30 most common words in ``tokens`` (returns nothing)."""
        fdist = nltk.FreqDist(tokens)
        fdist.plot(30, cumulative=False, title="Top 30 Most Common Words")

    def preprocess_sentences(self, text: str):
        """Split ``text`` into sentences and clean the words of each one.

        Sentence-level tokens keep the context that later analysis of the
        topics discussed in the call can use.

        Returns:
            A list with one entry per sentence; each entry is the list of
            lower-cased alphanumeric tokens left after stopword removal.
        """
        stop_all = set(self.stopwords)
        preprocessed_sentences = []
        for sentence in nltk.sent_tokenize(text):  # tokenize into sentences
            tokens = nltk.word_tokenize(sentence)  # tokenize each sentence into words
            # remove non-alphanumeric tokens + lowercase
            clean_tokens = [token.lower() for token in tokens if token.isalnum()]
            # remove stopwords
            preprocessed_sentences.append([token for token in clean_tokens if token not in stop_all])
        return preprocessed_sentences

    def transcript_summary(self, text: str, max_chars=None):
        """Ask the hosted Llama 2 chat model on Replicate to summarize ``text``.

        The Replicate API token is read from the ``REPLICATE_API_TOKEN``
        environment variable; it must never be written into the notebook.

        Args:
            text: Transcript text appended to the prompt.
            max_chars: Optional cap on the number of characters of ``text``
                sent to the model. ``None`` (the default) sends everything.
                Useful when the full transcript is too large for the model.

        Returns:
            A tuple ``(full_response, output)``: the concatenated model
            response and the object returned by ``replicate.run``. If that
            object is a one-shot iterator it has already been consumed.

        Raises:
            RuntimeError: If ``REPLICATE_API_TOKEN`` is not set.
        """
        if not os.environ.get("REPLICATE_API_TOKEN"):
            raise RuntimeError(
                "REPLICATE_API_TOKEN is not set; export it in the environment before requesting a summary."
            )
        if max_chars is not None:
            text = text[:max_chars]

        # set prompts
        pre_prompt = "You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
        prompt_input = """Imagine you are an investor who is looking for relevant information in an earnings call transcript. 
                        Look at the following transcript and answer the following questions:
                        - What are the key financial highlights of the earnings call transcript provided?
                        - What are the key risks of the company?
                        - What are the key opportunities of the company?
                        - What are the key drivers of the business?
                        - What are the key takeaways from the earnings call transcript?
                        - How does management see the market environment in the coming quarters?
                        - What does the competitive landscape look like for the company?
                        Please answer structured and concise with bullet points. Only use the information provided in the transcript."""

        # generate and return LLM response
        output = replicate.run('a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5', # LLM model
                                input={"prompt": f"{pre_prompt} {prompt_input} {text} Assistant: ", # prompts
                                "temperature":0.1, "top_p":0.9, "max_length":100, "repetition_penalty":1})  # model parameters

        full_response = "".join(output)

        return full_response, output

    


In [32]:
# Build the analysis object; the constructor reads the PDF and stores its text.
earnings_analysis = EarningsAnalysis(
    pdf_path="Data/SNOW_Q224.pdf",
    company_name="snowflake",
    quarter="Q224",
)
snowflake = earnings_analysis.transcript_text

# Word-level preprocessing: stopword-free tokens, lemmas and POS tags.
removed_stopwords, word_stems, pos_tags = earnings_analysis.preprocess_words(text=snowflake)

# Bigram keywords and their frequency distribution.
ngram_keywords, ngram_freq = earnings_analysis.ngrams(tokens=removed_stopwords, num_ngrams=2)

# Sentence-level preprocessing: one cleaned token list per sentence.
sent_preprocessed = earnings_analysis.preprocess_sentences(text=snowflake)

In [None]:
# Plot the most common lemmatized words of the transcript.
earnings_analysis.word_frequency_dist(tokens=word_stems)

In [107]:
# Show the bigram frequency distribution computed from the stopword-filtered tokens.
ngram_freq

FreqDist({('scarpelli', 'chief'): 31, ('chief', 'financial'): 31, ('callstreet', 'llc'): 29, ('michael', 'scarpelli'): 29, ('2024', 'earnings'): 25, ('earnings', 'call'): 24, ('corrected', 'transcript'): 22, ('copyright', '2001'): 22, ('2001', 'callstreet'): 22, ('snow', 'q2'): 22, ...})

In [27]:
# Preview only the beginning of the transcript; displaying the whole string
# floods the cell output and bloats the saved notebook.
snowflake[:1000]

' Corrected Transcript       1-877-FACTSET   www.callstreet.com  Total Pages : 22  Copyright © 2001 -2023  FactSet  CallStreet, LLC     23-Aug-2023   Snowflake, Inc.  (SNOW )  Q2 2024 Earnings Call    Snowflake, Inc.  (SNOW )  Q2 2024 Earnings Call  Corrected Transcript   23-Aug-2023      1-877-FACTSET   www.callstreet.com   2  Copyright © 2001 -2023  FactSet  CallStreet, LLC       CORPORATE PARTICIPANTS     Jimmy L. Sexton   Finance Director & Head -Investor Relations, Snowflake, Inc.   Frank Slootman   Chairman & Chief Executive Officer, Snowflake, Inc.  Michael P. Scarpelli   Chief Financial Officer, Snowflake, Inc.  Christian Kleinerman   Senior Vice President -Product, Snowflake, Inc.    ................................ ................................ ................................ ................................ ................................ ................................ ................................ ................................ .....     OTHER PARTICIPANTS     K

In [33]:
# Request an LLM summary of the transcript.
# NOTE(review): the recorded run of this cell failed with "CUDA out of memory"
# when given the full transcript; presumably the prompt is too large for the
# hosted model — consider passing a shorter excerpt. TODO confirm a workable size.
earnings_analysis.transcript_summary(snowflake)

ModelError: CUDA out of memory. Tried to allocate 32.37 GiB (GPU 0; 79.15 GiB total capacity; 26.24 GiB already allocated; 19.46 GiB free; 58.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF