In [None]:
import openai
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Add a constant to the model (intercept)
from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment
load_dotenv()

# API key handed to the OpenAI client; None when the variable is not set
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# True selects the "post_" input/output CSV files, False the default ones
POST = False

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
# Mount Google Drive so the project files are reachable from the Colab runtime
#from google.colab import drive
#drive.mount('/content/drive')

# Switch to the project folder so the relative 'prompts/' and 'transcripts/' paths used below resolve
#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [None]:
# Load the text that is sent as the system message with every chat completion request
with open('prompts/system_qna_scores.txt', 'r') as prompt_file:
    system_message = prompt_file.read()

# Echo the prompt so the notebook output records what was sent
print(system_message)

In [None]:
# Pick the aggregated Q&A file that matches the POST switch and load it (pipe-delimited)
aggregated_csv = (
    'transcripts/post_aggregated_credit_df.csv'
    if POST
    else 'transcripts/aggregated_credit_df.csv'
)
aggregated = pd.read_csv(aggregated_csv, delimiter='|')

aggregated.head()

In [None]:
# Create the OpenAI API client used for the chat completion requests below
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Score the Q&A text of every row with the LLM and store the seven scores on that row.
# The response is expected to contain at least seven numbers separated by ';',
# in the order given by SCORE_COLUMNS.
SCORE_COLUMNS = [
    'analyst_concerns', 'responsiveness', 'confidence', 'evasiveness',
    'depth', 'analyst_satisfaction', 'language_accessibility',
]
MAX_ATTEMPTS = 3          # API attempts per row before the row is given up
RETRY_WAIT_SECONDS = 60   # pause after a failed API call (matches the "1 minute" message)

# Make sure the score columns exist even if every request fails, so later cells
# can select them; values already present (e.g. from an earlier run) are kept.
for column in SCORE_COLUMNS:
    if column not in aggregated.columns:
        aggregated[column] = np.nan

failed_rows = []  # index labels of rows that ended up without scores

for idx, row in aggregated.iterrows():
    response = None

    # Retry the request a bounded number of times instead of silently skipping the row
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": row['QnA']}
                ]
            )
            response = completion.choices[0].message.content
            break
        # openai>=1.0 exposes its base exception as openai.OpenAIError
        # (the old openai.error module no longer exists)
        except openai.OpenAIError as e:
            print(f"An exception occurred: {e}")
            if attempt < MAX_ATTEMPTS:
                print("Waiting for 1 minute before retrying...")
                time.sleep(RETRY_WAIT_SECONDS)

    if response is None:
        print(f"No usable response for row {idx}; leaving its scores empty")
        failed_rows.append(idx)
        continue

    # Parse the whole response once, and only write to the DataFrame when all
    # seven values are valid, so a malformed answer never leaves partial scores
    try:
        fields = [field.strip() for field in response.split(';')]
        if len(fields) < len(SCORE_COLUMNS):
            raise ValueError(
                f"expected {len(SCORE_COLUMNS)} values, got {len(fields)}"
            )
        scores = [float(field) for field in fields[:len(SCORE_COLUMNS)]]
    except ValueError as e:
        print(f"Could not parse response for row {idx}: {e}")
        print(response)
        failed_rows.append(idx)
        continue

    for column, score in zip(SCORE_COLUMNS, scores):
        aggregated.at[idx, column] = score

    print(f"Completed for row {idx}")
    print(response)

if failed_rows:
    print(f"Rows without scores: {failed_rows}")


In [None]:
# Preview the first rows to inspect the score columns written by the scoring loop
aggregated.head()

In [None]:
# Column names of the LLM-derived Q&A scores
llm_labels = [
    'analyst_concerns',
    'responsiveness',
    'confidence',
    'evasiveness',
    'depth',
    'analyst_satisfaction',
    'language_accessibility',
]

In [None]:
from sklearn.preprocessing import MinMaxScaler


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for `pd.to_numeric(..., errors='ignore')`, which is
    deprecated in recent pandas versions.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Load the credit data file that matches the POST switch (pipe-delimited)
transcripts_csv = (
    'transcripts/post_credit_df.csv'
    if POST
    else 'transcripts/credit_df.csv'
)
transcripts = pd.read_csv(transcripts_csv, delimiter='|')

# Attach the LLM scores to the credit data via 'call_ID'; the left join keeps
# every row of `transcripts`, rows without a matching call get NaN scores
transcripts = pd.merge(transcripts,
                       aggregated[['call_ID'] + llm_labels],
                       on='call_ID',
                       how='left')

# Treat spreadsheet '#DIV/0!' markers and missing values as 0.
# NOTE(review): this also turns missing LLM scores into 0 before scaling - confirm that is intended
checkpoint = transcripts.replace('#DIV/0!', np.nan).fillna(0)

# Convert every column that parses as numeric; other columns (such as a date column) stay as they are
checkpoint = checkpoint.apply(_to_numeric_if_possible)

# Min-max scale the LLM score columns to the [0, 1] range
min_max_scaler = MinMaxScaler()
checkpoint[llm_labels] = min_max_scaler.fit_transform(checkpoint[llm_labels])

In [None]:
# Write the merged and scaled data set to the CSV file that matches the POST switch
output_csv = (
    'transcripts/post_LLM_QnA_analysis.csv'
    if POST
    else 'transcripts/LLM_QnA_analysis.csv'
)
checkpoint.to_csv(output_csv, index=False)
# Disabled reload of a previously saved file:
#checkpoint = pd.read_csv('transcripts/LLM_outputs_QnA.csv')

In [None]:
# First group of supporting (non-LLM) regressors, referenced by column name
supporting_features_1 = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services',
    'Consumer Discretionary',
    'Senior secured',
    'Time to maturity',
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Second group of supporting regressors
supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

In [None]:
'''BOND LEVEL'''
# Regression frame: target 'RR' followed by the supporting features and the LLM labels
regression_columns = ['RR'] + supporting_features_1 + supporting_features_2 + llm_labels
final_df = checkpoint[regression_columns]

# In-sample regression: the same rows serve as both training and evaluation data
y_train = final_df['RR']
y_test = final_df['RR']
X_train = final_df.drop(columns=['RR'])
X_test = final_df.drop(columns=['RR'])

In [None]:
# OLS regression on the supporting features plus the LLM features

# statsmodels OLS does not include an intercept on its own, so add a constant column
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Estimate the model and show the summary table (coefficients, p-values, fit statistics)
model = sm.OLS(y_train, X_train).fit()
print(model.summary())

# Predict on the evaluation frame and report the error metrics
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {np.sqrt(mse)}")
print(f"R-squared: {r2}")

# Residuals (observed minus predicted), plotted against the observed recovery rate
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, residuals)
ax.axhline(y=0, color='r', linestyle='--')  # zero-error reference line
ax.set_xlabel('Recovery Rate')
ax.set_ylabel('Residuals')

plt.show()

In [None]:
# Variance inflation factor for every column of X_train, largest first
design_matrix = X_train.values
vif = pd.DataFrame({
    "Features": X_train.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X_train.shape[1])
    ],
}).sort_values(by='VIF', ascending=False)

# Show the table
print(vif)

In [None]:
# Bar chart of the ten largest (by absolute value) regression coefficients,
# coloured by whether the coefficient is significant at the 5% level.

# Parameters whose p-value is below 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index
# Kept so that any code reading this attribute keeps working
model.significance = significant_features

# Remove the intercept by label rather than by position: sm.add_constant skips
# adding 'const' when the data already holds a constant column, in which case
# params[1:] would silently drop a real feature instead.
coefficients = model.params.drop(labels='const', errors='ignore')

# Rank by absolute size, keep the ten largest, then order by signed value for plotting
top_10_importances = coefficients.abs().sort_values(ascending=False).head(10)
importances = coefficients[top_10_importances.index].sort_values()

# NOTE(review): these are raw coefficients; only the LLM labels are min-max scaled
# in this notebook, so magnitudes across differently scaled features may not be
# directly comparable - confirm before interpreting as "importance".

# Significant features in green (RGB 0/150/130), the others in grey
colors = ['#009682' if feature in significant_features else 'grey' for feature in importances.index]

plt.figure(figsize=(10, 5))
plt.barh(importances.index, importances.values, color=colors)
# Custom legend, because the bars share a single artist and carry no labels
handles = [plt.Rectangle((0, 0), 1, 1, color='#009682'), plt.Rectangle((0, 0), 1, 1, color='grey')]
labels = ['Significant', 'Not Significant']
plt.legend(handles, labels)
plt.xlabel('Impact')
plt.ylabel('Features')
plt.title('Features Impact on Recovery Rate')
plt.show()