In [1]:
import openai
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Add a constant to the model (intercept)
from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

from dotenv import load_dotenv
import os

# Read the variables declared in the local .env file into the environment.
load_dotenv()

# API key for the OpenAI client; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [4]:
# Both transcript files are pipe-delimited and share a 'filename' key.
transcript = pd.read_csv('transcripts/transcripts.csv', delimiter='|')
qna = pd.read_csv('transcripts/QnA.csv', delimiter='|')

# Join the Q&A text onto the presentation rows. The text columns collide on the
# name 'transcript', so pandas suffixes them with _x (left) and _y (right);
# they are renamed to 'presentation' and 'QnA' straight away.
df = (
    transcript
    .merge(qna[['transcript', 'filename']], on='filename')
    .rename(columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'})
)

mapping = pd.read_csv('data/mapping.csv')

# For every RR_CompanyName, collect all of its Transcript_Mapping values into a
# single comma-separated string and repeat that string on each row of the group.
mapping['AllNames'] = (
    mapping
    .groupby('RR_CompanyName')['Transcript_Mapping']
    .transform(lambda names: ', '.join(names))
)

# Load recovery rates, keeping only the columns that are carried over below.
rr = pd.read_csv('data/RR_Bonds.csv')
rr = rr[['RR', 'Ddate', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']]

preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Copy the recovery-rate columns onto the preprocessed frame. Values are aligned
# on the row index, not on a key column.
# NOTE(review): this assumes both CSVs list the same bonds in the same row
# order; rows whose index is missing from `rr` would silently become NaN — confirm.
preprocessed_df = preprocessed_df.assign(**{column: rr[column] for column in rr.columns})

# From here on `rr` and `preprocessed_df` are the same object.
rr = preprocessed_df

# Parse 'Ddate' as datetime; unparseable values become NaT.
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')

# Attach the transcript-side company names to each bond row
# (CompanyName on the bond side, RR_CompanyName in the mapping).
rr = rr.merge(mapping, left_on='CompanyName', right_on='RR_CompanyName')

# Pair each bond row with every transcript of the mapped company.
merged_df = rr.merge(df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

# Make both date columns datetime, then record how many days the transcript
# date ('Date') lies before 'Ddate'.
merged_df = merged_df.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Ddate=lambda frame: pd.to_datetime(frame['Ddate']),
    diff=lambda frame: (frame['Ddate'] - frame['Date']).dt.days,
)

# Keep only transcripts dated strictly before 'Ddate', and of those the most
# recent one for each CUSIP.
merged_df = (
    merged_df[merged_df['Ddate'] > merged_df['Date']]
    .sort_values(by='Date')
    .groupby(['CUSIP'])
    .tail(1)
)

print(merged_df['CompanyName'].value_counts())

Ally Financial Inc.               10317
CIT Group Inc.                    10185
Lehman Brothers Holdings, Inc.     2853
Charter Communications, Inc.       2144
Sempra Energy                      1147
                                  ...  
Frontier Group Holdings, Inc.         1
Dayton Superior Corporation           1
Franklin Bank Corp.                   1
Kellwood Company, LLC                 1
Turning Point Brands, Inc.            1
Name: CompanyName, Length: 210, dtype: int64
Lehman Brothers Holdings, Inc.      317
CIT Group Inc.                      291
Charter Communications, Inc.         28
Ford Motor Company                   19
iStar Inc.                           17
                                   ... 
Centrus Energy Corp.                  1
Education Management Corporation      1
Venoco, Inc.                          1
Exelon Corporation                    1
Kellwood Company, LLC                 1
Name: CompanyName, Length: 159, dtype: int64


In [97]:
# Give every distinct presentation text an integer id, numbered in order of
# first appearance, after renumbering the rows from 0.
merged_df = merged_df.reset_index(drop=True)
merged_df['transcript_number'] = pd.factorize(merged_df['presentation'])[0]

# One row per distinct (transcript_number, presentation, QnA) combination,
# ordered by transcript number.
number_transcript = (
    merged_df[['transcript_number', 'presentation', 'QnA']]
    .drop_duplicates()
    .sort_values('transcript_number')
)

number_transcript

Unnamed: 0,transcript_number,presentation,QnA
0,0,Presentation\nOperator\nGood morning Ladies an...,Question and Answer\nOperator\n[Operator Instr...
1,1,"Presentation\nOperator\nLadies and gentlemen, ...",Question and Answer\nOperator\n(Operator Instr...
2,2,Presentation\nOperator\nGreetings ladies and g...,Question and Answer\nOperator\nThank you. Ladi...
3,3,"Presentation\nOperator\nThank you, all parties...",Question and Answer\nOperator\n[Operator Instr...
4,4,Presentation\nOperator\nThank you for joining ...,Question and Answer\nOperator\n(Operator Instr...
...,...,...,...
1068,191,"Presentation\nOperator\nGood day, ladies and g...",Question and Answer\nOperator\n[Operator Instr...
1069,192,"Presentation\nColin Stephen Goldschmidt\nCEO, ...",Question and Answer\nAndrew Goodsall\nUBS Inve...
1070,193,"Presentation\nOperator\nGood morning, ladies a...",Question and Answer\nOperator\n[Audio Gap]\nfr...
1072,194,"Presentation\nOperator\nGood morning, and welc...",Question and Answer\nOperator\n[Operator Instr...


In [47]:
# OpenAI client authenticated with the key read from the environment.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Accumulator for the model responses, and the position in the transcript list
# from which the request loop starts.
output, start = [], 0

# The system prompt is stored in a text file; read it in as a single string.
with open('prompts/system_section.txt', 'r') as prompt_file:
    system_message = prompt_file.read()

print(system_message)

Please read the following transcript of a Presentation session from an earnings conference call carefully. Your task is to:

1. **Identify and extract all sections** where the company discusses topics related to **credit, debt, default risk, leverage, financial obligations, or any matters that are relevant to bond market investors**.

2. For each extracted section, provide:
   - The **exact text** from the transcript, including any necessary context for full understanding.
   - A **professional assessment** of the information, highlighting its significance for the bond market, and discussing any potential impacts on the company's creditworthiness, debt levels, default risk, or bond pricing.

Please ensure that your analysis is accurate, concise, and focused on providing valuable insights for bond market participants.


In [89]:
# Send every presentation from position `start` onwards to the chat model and
# collect the replies in `output`, in transcript order. With start = 0 all
# transcripts are processed; `start` is expected to be a non-negative count of
# transcripts to skip.
# Replies are appended one by one, so if a request raises, everything gathered
# up to that point is still in `output`.
for presentation_text in number_transcript['presentation'].iloc[start:]:
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": presentation_text},
        ],
    )
    output.append(completion.choices[0].message.content)
    # Progress indicator: number of replies collected so far.
    print(len(output))


In [90]:
# Position from which the request loop should continue: the replies collected
# in `output` on top of a hard-coded 183.
# NOTE(review): 183 is presumably the number of replies already saved by an
# earlier run — confirm, and consider deriving it from the saved results
# instead of hard-coding it.
start = 183 + len(output)
start

196

In [91]:
# Start from the replies held in `output_df` by an earlier batch, if there is
# one. On a fresh kernel `output_df` has not been created yet, so fall back to
# an empty list instead of failing with a NameError.
try:
    output_list = output_df['output_presentation'].tolist()
except NameError:
    output_list = []

# Add the replies collected by the request loop in this session.
output_list.extend(output)

len(output_list)

196

In [93]:
# Wrap the collected replies in a one-column frame; the default 0..n-1 index
# gives each reply its position in the list.
output_df = pd.DataFrame({'output_presentation': output_list})
output_df

Unnamed: 0,output_presentation
0,Here are the extracted sections related to cre...
1,### Extracted Sections Relevant to Bond Market...
2,**Extracted Sections Relating to Credit and De...
3,"**Extracted Sections Related to Credit, Debt, ..."
4,### Extracted Sections Relevant to Bond Market...
...,...
191,"### Extracted Sections Related to Credit, Debt..."
192,"### Extracted Sections on Credit, Debt, and Fi..."
193,### Extracted Sections and Assessments\n\n####...
194,"### Extracted Sections Related to Credit, Debt..."


In [94]:
# Use each reply's row position as its transcript number, i.e. the join key.
# NOTE(review): this relies on reply i having been generated for transcript
# number i, which presumably requires one row per transcript_number in
# `number_transcript` — confirm.
output_df['transcript_number'] = output_df.index

# Left join: every transcript row is kept; transcripts without a reply get NaN
# in 'output_presentation'.
transcripts = number_transcript.merge(output_df, how='left', on='transcript_number')

# Independent snapshot of the merged frame.
checkpoint = transcripts.copy()

In [95]:
# Display the merged frame (transcripts plus model replies) for a visual check.
checkpoint

Unnamed: 0,transcript_number,presentation,QnA,output_presentation
0,0,Presentation\nOperator\nGood morning Ladies an...,Question and Answer\nOperator\n[Operator Instr...,Here are the extracted sections related to cre...
1,1,"Presentation\nOperator\nLadies and gentlemen, ...",Question and Answer\nOperator\n(Operator Instr...,### Extracted Sections Relevant to Bond Market...
2,2,Presentation\nOperator\nGreetings ladies and g...,Question and Answer\nOperator\nThank you. Ladi...,**Extracted Sections Relating to Credit and De...
3,3,"Presentation\nOperator\nThank you, all parties...",Question and Answer\nOperator\n[Operator Instr...,"**Extracted Sections Related to Credit, Debt, ..."
4,4,Presentation\nOperator\nThank you for joining ...,Question and Answer\nOperator\n(Operator Instr...,### Extracted Sections Relevant to Bond Market...
...,...,...,...,...
191,191,"Presentation\nOperator\nGood day, ladies and g...",Question and Answer\nOperator\n[Operator Instr...,"### Extracted Sections Related to Credit, Debt..."
192,192,"Presentation\nColin Stephen Goldschmidt\nCEO, ...",Question and Answer\nAndrew Goodsall\nUBS Inve...,"### Extracted Sections on Credit, Debt, and Fi..."
193,193,"Presentation\nOperator\nGood morning, ladies a...",Question and Answer\nOperator\n[Audio Gap]\nfr...,### Extracted Sections and Assessments\n\n####...
194,194,"Presentation\nOperator\nGood morning, and welc...",Question and Answer\nOperator\n[Operator Instr...,"### Extracted Sections Related to Credit, Debt..."


In [98]:
# Write the checkpoint frame to CSV, leaving out the row index.
checkpoint.to_csv('transcripts/LLM_summary.csv', index=False)
# Uncomment to restore the checkpoint from that file instead:
#checkpoint = pd.read_csv('transcripts/LLM_summary.csv')