In [38]:
import openai
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Add a constant to the model (intercept)
from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

from dotenv import load_dotenv
import os

# Pull any variables defined in a local .env file into the process environment.
load_dotenv()

# API key for the OpenAI client; this is None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [39]:
# Transcript section to process; it selects the prompt file and names the outputs.
# Alternatives used with this notebook: 'presentation', 'QnA'.
PART = 'analysts'

# Pipe-delimited table with one row per call.
aggregated = pd.read_csv('transcripts/aggregated_credit_df.csv', sep='|')

In [40]:
client = openai.OpenAI(api_key=OPENAI_API_KEY)

output = []
start = 0

# Load the system prompt that belongs to the selected transcript section.
# The encoding is stated explicitly so the prompt is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(f'prompts/system_{PART}_section.txt', 'r', encoding='utf-8') as file:
    system_message = file.read()

# Show the prompt so the notebook records exactly what was sent to the model.
print(system_message)

You are an AI language model designed to assist bond investors in analyzing company earnings call transcripts. Your task is to process the provided earnings call transcript, focusing specifically on the Q&A section, and keep only the parts where analysts ask questions or react. Remove all irrelevant sections. Keep only the questions and reactions that are directly relevant.

Instructions:
Provide only the remaining text from the transcript after irrelevant sections have been removed.
Include only the questions and reactions of anaylsts.
Do not add any new text, summaries, explanations, headings, or commentary.
Do not rearrange any sentences or sections; maintain the original order of the remaining content.
Exclude all other parts of the transcript that are not directly relevant.
Make sure that all information necessary to understand the remaining parts is also included.


In [43]:
import tiktoken

# Both 'QnA' and 'analysts' read their input from the 'QnA' column; any other
# PART value is used directly as the column name.
part = 'QnA' if PART in ('QnA', 'analysts') else PART

# Build the tokenizer once, before the loop: it is the same for every row.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")


def count_tokens(transcript):
    """Return the number of tokens `encoding` produces for `transcript`.

    An empty or missing (None) text counts as zero tokens, so a response
    without content does not abort the run.
    """
    if not transcript:
        return 0
    return len(encoding.encode(transcript))


for idx, row in aggregated.iterrows():
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": row[part]},
        ],
    )
    # The API types `content` as optional, so this may be None.
    response = completion.choices[0].message.content

    print(f"Completed for row {idx}")
    print(f"Token count: {count_tokens(response)}")

    # Write the result into the frame right away, one row at a time.
    aggregated.at[idx, f'{PART}_summary'] = response

Completed for row 0
Token count: 722
Completed for row 1
Token count: 1541
Completed for row 2
Token count: 2579
Completed for row 3
Token count: 2749
Completed for row 4
Token count: 2050
Completed for row 5
Token count: 956
Completed for row 6
Token count: 1677
Completed for row 7
Token count: 2535
Completed for row 8
Token count: 2854
Completed for row 9
Token count: 2129
Completed for row 10
Token count: 577
Completed for row 11
Token count: 2337
Completed for row 12
Token count: 1554
Completed for row 13
Token count: 2594
Completed for row 14
Token count: 1724
Completed for row 15
Token count: 1764
Completed for row 16
Token count: 1686
Completed for row 17
Token count: 1488
Completed for row 18
Token count: 855
Completed for row 19
Token count: 2359
Completed for row 20
Token count: 2391
Completed for row 21
Token count: 2246
Completed for row 22
Token count: 1279
Completed for row 23
Token count: 2382
Completed for row 24
Token count: 1265
Completed for row 25
Token count: 399
C

In [44]:
# Preview the first five rows of the frame.
aggregated.head(n=5)

Unnamed: 0,call_ID,aggregated_RR,presentation,QnA,analysts_summary
0,0,29.023972,Presentation\nOperator\nGood morning Ladies an...,Question and Answer\nOperator\n[Operator Instr...,This is Jennifer Davis in for Todd. Congratula...
1,1,8.86942,"Presentation\nOperator\nLadies and gentlemen, ...",Question and Answer\nOperator\n(Operator Instr...,Edward Snyder \nThank you very much. Good qua...
2,2,33.210455,Presentation\nOperator\nGreetings ladies and g...,Question and Answer\nOperator\nThank you. Ladi...,"Michael Linenberg \nMerrill Lynch \nHi, yes,..."
3,3,11.875,"Presentation\nOperator\nThank you, all parties...",Question and Answer\nOperator\n[Operator Instr...,Diane Geissler \nMerrill Lynch \nI just want...
4,4,18.34,Presentation\nOperator\nThank you for joining ...,Question and Answer\nOperator\n(Operator Instr...,George Staphos \nBanc of America Securities ...


In [45]:
# Write the full frame to a pipe-delimited CSV named after the processed
# section, without the index column.
summary_path = f'transcripts/{PART}_summary.csv'
aggregated.to_csv(summary_path, sep='|', index=False)