In [1]:
import openai
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Model-fit metrics and regression diagnostics
from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

from dotenv import load_dotenv
import os

# Pull variables from the local .env file into the process environment.
load_dotenv()

# API key for the OpenAI client; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [4]:
# --- Transcripts -----------------------------------------------------------
# Two pipe-delimited files joined on 'filename'. Only the 'transcript' text
# of the Q&A file is carried over; after the join the suffixed columns
# 'transcript_x' / 'transcript_y' are renamed to 'presentation' / 'QnA'.
transcript = pd.read_csv('transcripts/transcripts.csv', delimiter='|')
qna = pd.read_csv('transcripts/QnA.csv', delimiter='|')

qna_text = qna[['transcript', 'filename']]
df = transcript.merge(qna_text, on='filename').rename(
    columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'}
)

# --- Company-name mapping --------------------------------------------------
# 'AllNames' holds, on every row, the comma-joined list of all
# 'Transcript_Mapping' values that share the row's 'RR_CompanyName'.
mapping = pd.read_csv('data/mapping.csv')
names_by_company = mapping.groupby('RR_CompanyName')['Transcript_Mapping']
mapping['AllNames'] = names_by_company.transform(lambda names: ', '.join(names))

# --- Recovery rates --------------------------------------------------------
rr_columns = ['Ddate', 'RR', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']
rr = pd.read_csv('data/RR_Bonds.csv')[rr_columns]

preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Copy the selected recovery-rate columns onto the preprocessed frame.
# Assignment aligns on the row index.
# NOTE(review): this presumes both CSVs list the same bonds in the same row
# order — confirm against how preprocessed_bond_data.csv was produced.
for column in ('RR', 'Ddate', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1'):
    preprocessed_df[column] = rr[column]

# From here on `rr` is the enriched preprocessed frame (same object).
rr = preprocessed_df

# Unparseable 'Ddate' values become NaT instead of raising.
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')

# --- Join recovery rates -> mapping -> transcripts ---------------------------
rr = pd.merge(rr, mapping, left_on='CompanyName', right_on='RR_CompanyName')
merged_df = pd.merge(rr, df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

# Make sure both date columns are datetimes before comparing them.
for date_column in ('Date', 'Ddate'):
    merged_df[date_column] = pd.to_datetime(merged_df[date_column])

# Whole days between 'Date' and 'Ddate'.
merged_df['diff'] = (merged_df['Ddate'] - merged_df['Date']).dt.days

# Keep rows whose 'Date' is strictly before 'Ddate', then, for each CUSIP,
# keep only the row with the latest such 'Date'.
date_before_ddate = merged_df['Date'] < merged_df['Ddate']
merged_df = merged_df[date_before_ddate]
merged_df = merged_df.sort_values(by='Date').groupby(['CUSIP']).tail(1)

print(merged_df['CompanyName'].value_counts())

Ally Financial Inc.               10317
CIT Group Inc.                    10185
Lehman Brothers Holdings, Inc.     2853
Charter Communications, Inc.       2144
Sempra Energy                      1147
                                  ...  
Frontier Group Holdings, Inc.         1
Dayton Superior Corporation           1
Franklin Bank Corp.                   1
Kellwood Company, LLC                 1
Turning Point Brands, Inc.            1
Name: CompanyName, Length: 210, dtype: int64
Lehman Brothers Holdings, Inc.      317
CIT Group Inc.                      291
Charter Communications, Inc.         28
Ford Motor Company                   19
iStar Inc.                           17
                                   ... 
Centrus Energy Corp.                  1
Education Management Corporation      1
Venoco, Inc.                          1
Exelon Corporation                    1
Kellwood Company, LLC                 1
Name: CompanyName, Length: 159, dtype: int64


In [6]:
# add a number to each transcript based on the 196 unique transcripts
# Give every distinct 'presentation' text an integer id, numbered in order of
# first appearance, after renumbering the rows 0..n-1.
merged_df = merged_df.reset_index(drop=True)
transcript_codes, _ = merged_df['presentation'].factorize()
merged_df['transcript_number'] = transcript_codes

# Lookup table of the distinct (id, presentation, QnA) combinations, ordered by id.
transcript_columns = ['transcript_number', 'presentation', 'QnA']
number_transcript = merged_df[transcript_columns].drop_duplicates()
number_transcript = number_transcript.sort_values('transcript_number')

number_transcript.head()

Unnamed: 0,transcript_number,presentation,QnA
0,0,Presentation\nOperator\nGood morning Ladies an...,Question and Answer\nOperator\n[Operator Instr...
1,1,"Presentation\nOperator\nLadies and gentlemen, ...",Question and Answer\nOperator\n(Operator Instr...
2,2,Presentation\nOperator\nGreetings ladies and g...,Question and Answer\nOperator\nThank you. Ladi...
3,3,"Presentation\nOperator\nThank you, all parties...",Question and Answer\nOperator\n[Operator Instr...
4,4,Presentation\nOperator\nThank you for joining ...,Question and Answer\nOperator\n(Operator Instr...


In [25]:
# Client used for the chat-completion requests below.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# `output` collects the model replies; `start` is the position in the
# transcript list at which processing begins.
output, start = [], 0

# The system prompt lives in a text file so it can be edited without touching code.
with open('prompts/system_section.txt', 'r') as prompt_file:
    system_message = prompt_file.read()

print(system_message)

Please read the following transcript of a Presentation session from an earnings conference call carefully. Your task is to:

1. **Identify and extract all sections** where the company discusses topics related to **credit, debt, default risk, leverage, financial obligations, or any matters that are relevant to bond market investors**.

2. For each extracted section, provide:
   - The **exact text** from the transcript, including any necessary context for full understanding.
   - A **professional assessment** of the information, highlighting its significance for the bond market, and discussing any potential impacts on the company's creditworthiness, debt levels, default risk, or bond pricing.

Please ensure that your analysis is accurate, concise, and focused on providing valuable insights for bond market participants.


In [26]:
# Send every presentation transcript to the chat model and collect the replies
# in `output`, one reply per transcript, in the row order of `number_transcript`.
#
# The loop is resumable: `output[k]` is taken to be the reply for the k-th
# transcript (this is also how the later merge on 'transcript_number' reads
# it), so processing continues at the first position that has no reply yet,
# or at `start` if that is further along. Re-running this cell after a
# failure therefore neither repeats paid requests nor appends duplicates.
resume_at = max(start, len(output))

for position, presentation_text in enumerate(number_transcript['presentation']):
    if position < resume_at:
        continue  # skipped via `start` or already answered in an earlier run

    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": presentation_text}
            ]
        )
    except openai.RateLimitError:
        # Replies collected so far stay in `output`. Report where the run
        # stopped, then let the error propagate so the cell visibly fails.
        print(f"Rate limit reached at transcript position {position}; "
              f"{len(output)} replies collected. Re-run this cell later to resume.")
        raise

    output.append(completion.choices[0].message.content)
    print(len(output))


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-CAr5Z1oqHVx7Dt0IR15SR1uo on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [24]:
# Dump the raw list of model replies collected so far.
# NOTE(review): this prints every reply in full; consider inspecting a single
# element (e.g. output[0]) to keep the notebook output readable.
print(output)

['### Extracted Sections Related to Credit, Debt, Default Risk, Leverage, and Financial Obligations\n\n---\n\n#### Section 1: Sale of Smart Shirts Operation and Debt Reduction Plans\n**Exact Text:**\n"In summary, our business and brands performed in line with our expectations this quarter. We are confident, focused and passionate about the execution of our long term strategic and financial plans outlined on slide six, which include annual organic sales growth of 4 to 5%, operating margins of 4.6% today increasing to 9% in 2012, excluding potential future acquisitions and earnings per share growth of at least 25% after a significant increase in 2008 excluding potential future acquisitions. We announce the sale of our Smart Shirts operation, which has limited strategic synergies with our other businesses. It has also allowed us to remove a capital intensive and lower margin private label business from our company, while netting Kellwood with approximately $161 million in gross proceeds, 

In [27]:
# transform output to df
output_df = pd.DataFrame(output, columns=['output_presentation'])
output_df.head()

Unnamed: 0,output_presentation
0,Here are the extracted sections related to cre...
1,### Extracted Sections Relevant to Bond Market...
2,**Extracted Sections Relating to Credit and De...
3,"**Extracted Sections Related to Credit, Debt, ..."
4,### Extracted Sections Relevant to Bond Market...


In [28]:
# A reply's row position doubles as its transcript id: reply k is matched to
# the transcript whose 'transcript_number' is k.
output_df = output_df.assign(transcript_number=output_df.index)

# Left join keeps every transcript; those without a reply get NaN in
# 'output_presentation'.
transcripts = number_transcript.merge(
    output_df,
    how='left',
    on='transcript_number',
)

# Independent copy used as the snapshot that is inspected and saved afterwards.
checkpoint = transcripts.copy()

In [31]:
# Preview the first rows of the merged transcripts/replies snapshot.
checkpoint.head()

Unnamed: 0,transcript_number,presentation,QnA,output_presentation
0,0,Presentation\nOperator\nGood morning Ladies an...,Question and Answer\nOperator\n[Operator Instr...,Here are the extracted sections related to cre...
1,1,"Presentation\nOperator\nLadies and gentlemen, ...",Question and Answer\nOperator\n(Operator Instr...,### Extracted Sections Relevant to Bond Market...
2,2,Presentation\nOperator\nGreetings ladies and g...,Question and Answer\nOperator\nThank you. Ladi...,**Extracted Sections Relating to Credit and De...
3,3,"Presentation\nOperator\nThank you, all parties...",Question and Answer\nOperator\n[Operator Instr...,"**Extracted Sections Related to Credit, Debt, ..."
4,4,Presentation\nOperator\nThank you for joining ...,Question and Answer\nOperator\n(Operator Instr...,### Extracted Sections Relevant to Bond Market...


In [30]:
# save checkpoint as csv
checkpoint.to_csv('transcripts/LLM_summary.csv', index=False)
#checkpoint = pd.read_csv('transcripts/LLM_summary.csv')