In [382]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import statsmodels.api as sm
from dotenv import load_dotenv
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read variables from a local .env file into the process environment
load_dotenv()

# API key handed to the OpenAI client; None when the variable is not set
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
#from google.colab import drive
#drive.mount('/content/drive')

#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [None]:
# Transcript sections that are scored separately
PARTS = ['presentation', 'qna']

# OpenAI client authenticated with the key read from the environment
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Pipe-delimited transcript table; the scoring loop reads its
# 'presentation' and 'QnA' columns
aggregated = pd.read_csv('transcripts/aggregated_credit_df.csv', sep='|')

In [None]:
# Score every transcript section with the LLM: one API call per row and part.
# The raw reply is written to a '<part>_response' column as soon as it arrives,
# and the whole table is saved to CSV once a part has been processed.

MAX_ATTEMPTS = 3          # API attempts per row before the row is given up
RETRY_WAIT_SECONDS = 60   # pause between attempts after an API error

for part in PARTS:
    # The system prompt for this part is stored as a plain text file
    with open(f'cot/system_{part}_scores.txt', 'r') as file:
        system_message = file.read()

    # Column of `aggregated` that holds the text for this part
    text_column = 'QnA' if part == 'qna' else 'presentation'

    for idx, row in aggregated.iterrows():
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                completion = client.chat.completions.create(
                    model="gpt-4o-mini",
                    temperature=0,
                    messages=[
                        {"role": "system", "content": system_message},
                        {"role": "user", "content": row[text_column]},
                    ]
                )
            # openai>=1.0 (the client API used here) exposes the base error as
            # openai.OpenAIError; the old `openai.error` module no longer exists.
            except openai.OpenAIError as e:
                print(f"An exception occurred: {e}")
                if attempt == MAX_ATTEMPTS:
                    print(f"Giving up on row {idx} after {MAX_ATTEMPTS} attempts")
                else:
                    print("Waiting for 1 minute before retrying...")
                    time.sleep(RETRY_WAIT_SECONDS)
                continue

            # Instantly add the output as a new column entry for the corresponding row
            response = completion.choices[0].message.content
            print(response)
            aggregated.at[idx, f'{part}_response'] = response
            break

    # Checkpoint the responses collected so far for this part
    aggregated.to_csv(f'cot/reasoning_{part}_response.csv', index=False, sep='|')

In [277]:
# checkpoint: rebuild `aggregated` from the two response files written above
aggregated_qna = pd.read_csv('cot/reasoning_qna_response.csv', delimiter='|')

aggregated = (
    pd.read_csv('cot/reasoning_presentation_response.csv', delimiter='|')
    # bring in the Q&A responses; calls without one keep NaN
    .merge(aggregated_qna[['call_ID', 'qna_response']], on='call_ID', how='left')
    # keep the first row per call
    .drop_duplicates(subset='call_ID')
    # renumber rows 0..n-1 (this happens BEFORE sorting, so the index keeps
    # the pre-sort positions)
    .reset_index(drop=True)
    .sort_values(by='call_ID')
)

In [290]:
import re

# Process the string and convert to a dictionary
def transform_to_dict(response_string):
    """Parse an LLM reply into ``{call_id: [score, score, ...]}``.

    The reply is expected to contain entries shaped like
    ``<digits>: [[v1], [v2], ...]``. Each inner bracket group becomes one
    float after everything except digits, ``.`` and ``-`` is stripped.

    Parameters
    ----------
    response_string : str
        Raw reply text. Anything that is not a string (e.g. None/NaN for a
        missing reply) yields an empty dict.

    Returns
    -------
    dict[int, list[float]]
        Scores per call id, in the order they appear. A group that cannot be
        read as a number is kept as NaN so later positions do not shift.
    """
    response_dict = {}
    if not isinstance(response_string, str):
        return response_dict

    matches = re.findall(r'(\d+): \[\[(.*?)\]\]', response_string, re.DOTALL)
    for raw_id, raw_values in matches:
        values = []
        # Tolerate any whitespace (or none) around the comma between groups;
        # a literal "], [" split would fuse "[1],[2]" into the number 12.
        for token in re.split(r'\]\s*,\s*\[', raw_values):
            cleaned = re.sub(r'[^\d.-]', '', token)
            try:
                values.append(float(cleaned))
            except ValueError:
                # Empty or malformed group: keep the slot, mark it missing
                values.append(float('nan'))
        response_dict[int(raw_id)] = values
    return response_dict

# Function to process responses and split into DataFrame
def process_responses(responses, part):
    """Turn raw comparison replies into one score row per call.

    Parameters
    ----------
    responses : iterable of str
        Reply texts; each is parsed with ``transform_to_dict``.
    part : str
        ``'qna'`` selects the seven Q&A columns; any other value selects the
        presentation layout (13 values, where the first and third are signed
        and are split into a non-negative pair of columns).

    Returns
    -------
    pandas.DataFrame
        One row per parsed call with a ``call_ID`` column plus the score
        columns. Calls with too few values are reported and left out.
    """
    qna_columns = [
        'analyst_concerns',
        'responsiveness',
        'confidence',
        'evasiveness',
        'depth',
        'analyst_satisfaction',
        'language_accessibility_qna',
    ]
    # Columns filled from values[3] .. values[12] of a presentation reply
    presentation_tail_columns = [
        'vagueness',
        'language_accessibility_presentation',
        'liquidity_position',
        'debt_leverage_stress',
        'operational_trends',
        'industry_positioning',
        'asset_quality',
        'recovery_strategies',
        'legal_issues',
        'macroeconomic',
    ]
    required_values = 7 if part == 'qna' else 13

    data = []
    for packet in responses:
        for key, values in transform_to_dict(packet).items():
            if len(values) < required_values:
                print(f"Skipping call_ID {key} due to insufficient data")
                continue

            row_data = {'call_ID': key}

            if part == 'qna':
                for column, value in zip(qna_columns, values):
                    row_data[column] = float(value)
            else:
                # Signed sentiment -> separate positive / negative magnitudes
                sentiment = float(values[0])
                if sentiment >= 0:
                    row_data['positive_sentiment'] = sentiment
                    row_data['negative_sentiment'] = 0
                else:
                    row_data['positive_sentiment'] = 0
                    row_data['negative_sentiment'] = -1 * sentiment

                row_data['uncertainty'] = float(values[1])

                # Signed outlook -> separate optimistic / pessimistic magnitudes
                outlook = float(values[2])
                if outlook >= 0:
                    row_data['optimistic'] = outlook
                    row_data['pessimistic'] = 0
                else:
                    row_data['optimistic'] = 0
                    row_data['pessimistic'] = -1 * outlook

                for column, value in zip(presentation_tail_columns, values[3:13]):
                    row_data[column] = float(value)

            data.append(row_data)

    return pd.DataFrame(data)

In [None]:
# Second LLM pass: send the first-pass responses back in batches so the model
# can compare calls, then parse the replies into presentation_df / qna_df.

BATCH_SIZE = 20  # number of calls sent to the model in one request


def _request_comparison(system_prompt, batch_content, max_attempts=3, wait_seconds=60):
    """Send one batch to the model and return the reply text.

    Retries after an API error, sleeping ``wait_seconds`` between attempts.
    Returns None when every attempt failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": batch_content},
                ]
            )
            return completion.choices[0].message.content
        # openai>=1.0 (the client API used here) exposes the base error as
        # openai.OpenAIError; the old `openai.error` module no longer exists.
        except openai.OpenAIError as e:
            print(f"An exception occurred: {e}")
            if attempt < max_attempts:
                print("Waiting for 1 minute before retrying...")
                time.sleep(wait_seconds)
    return None


for part in PARTS:
    # The comparison prompt for this part is stored as a plain text file
    with open(f'cot/{part}_compare_scores.txt', 'r') as file:
        compare_message = file.read()

    # Kept from the original cell; the column is not populated here
    aggregated['adjusted_scores'] = None

    # One prompt entry per call that has a first-pass response.
    # NOTE(review): the label is the DataFrame index, but process_responses
    # stores the parsed number as 'call_ID' — confirm the index really equals
    # call_ID, otherwise use row['call_ID'] here.
    entries = [
        f'Call No. {idx}: {text}'
        for idx, text in aggregated[part + "_response"].items()
        if not (pd.isna(text) or text == '')
    ]

    all_responses = []

    # Fixed-size slices: a failed batch cannot spill into the next one, and the
    # final (possibly shorter) batch goes through the same code path.
    for start in range(0, len(entries), BATCH_SIZE):
        content = ''.join(entries[start:start + BATCH_SIZE])
        response = _request_comparison(compare_message, content)
        if response is None:
            print(f"Skipping batch starting at entry {start}: no response received")
            continue

        print(response)
        all_responses.append(response)
        print('done')

    # replace all ; with ,
    all_responses = [x.replace(';', ',') for x in all_responses]

    # remove all ' characters
    all_responses = [x.replace("'", '') for x in all_responses]

    # Process the responses
    if part == 'presentation':
        presentation_df = process_responses(all_responses, part)
    elif part == 'qna':
        qna_df = process_responses(all_responses, part)

In [363]:
# Combine presentation and Q&A scores per call; calls that only have
# presentation scores keep NaN in the Q&A columns
merged_df = presentation_df.merge(qna_df, on='call_ID', how='left')

# Pipe-delimited bond-level table
bond_level = pd.read_csv('transcripts/credit_df.csv', sep='|')

# Score columns produced from the presentation section
presentation_labels = [
    'negative_sentiment',
    'positive_sentiment',
    'uncertainty',
    'optimistic',
    'pessimistic',
    'vagueness',
    'language_accessibility_presentation',
    'liquidity_position',
    'debt_leverage_stress',
    'operational_trends',
    'industry_positioning',
    'asset_quality',
    'recovery_strategies',
    'legal_issues',
    'macroeconomic',
]

# Score columns produced from the Q&A section
qna_labels = [
    'analyst_concerns',
    'responsiveness',
    'confidence',
    'evasiveness',
    'depth',
    'analyst_satisfaction',
    'language_accessibility_qna',
]

In [364]:
# Give merge-suffixed language_accessibility columns explicit names.
# rename() leaves the frame unchanged when these source names are absent.
merged_df = merged_df.rename(
    columns={
        'language_accessibility_x': 'language_accessibility_presentation',
        'language_accessibility_y': 'language_accessibility_qna',
    }
)

In [365]:
# Emotion categories used as features.
# Deliberately left out of the selection: amusement, disgust, gratitude,
# grief, love.
emotions = [
    'admiration', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'embarrassment', 'excitement', 'fear', 'joy', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise',
]

# Same categories with an '_analysts' suffix, in the same order
analyst_emotions = [f'{emotion}_analysts' for emotion in emotions]

In [366]:
# Load the GoEmotions summaries (pipe-delimited), keep only call_ID plus the
# selected emotion columns, and drop rows that are exact repeats.
emotions_presentation = (
    pd.read_csv('goemotions/presentation_summary_final.csv', delimiter='|')
    [["call_ID"] + emotions]
    .drop_duplicates(keep='first')
)

emotions_qna = (
    pd.read_csv('goemotions/qna_summary_final.csv', delimiter='|')
    [["call_ID"] + emotions]
    .drop_duplicates(keep='first')
)

emotions_analysts = (
    pd.read_csv('goemotions/analysts_summary_final.csv', delimiter='|')
    [["call_ID"] + emotions]
    .drop_duplicates(keep='first')
)

In [367]:
# Attach the per-call LLM scores to every bond row via call_ID
# (left join: bonds without scores keep NaN in the score columns)
bond_level = bond_level.merge(
    merged_df[presentation_labels + qna_labels + ['call_ID']],
    on=['call_ID'],
    how='left',
)

# Work on a separate copy from here on
llm_output = bond_level.copy()

In [None]:
# Add emotion features to the bond-level table.
# Management emotions = presentation + Q&A scores, summed and min-max scaled;
# analyst emotions are joined afterwards as '<emotion>_analysts' (unscaled).
# Requires MinMaxScaler (sklearn.preprocessing) to be imported before this cell.

# Explicit suffixes instead of pandas' default _x/_y make the columns
# self-describing and remove the need for a rename pass afterwards.
llm_output = llm_output.merge(emotions_presentation, on='call_ID', how='left')
llm_output = llm_output.merge(
    emotions_qna, on='call_ID', how='left', suffixes=('_presentation', '_qna')
)

# Sum the two management-side scores into one column per emotion
for emotion in emotions:
    llm_output[emotion] = llm_output[f'{emotion}_presentation'] + llm_output[f'{emotion}_qna']

llm_output = llm_output.drop(
    columns=[f'{emotion}{suffix}' for emotion in emotions for suffix in ('_presentation', '_qna')]
)

# MinMaxScaler scales each column independently, so one fit over all emotion
# columns gives the same result as fitting a new scaler per column.
llm_output[emotions] = MinMaxScaler().fit_transform(llm_output[emotions])

# Management columns keep their plain names; analyst columns get '_analysts'
llm_output = llm_output.merge(
    emotions_analysts, on='call_ID', how='left', suffixes=('', '_analysts')
)

llm_output.head()

In [369]:
# Treat spreadsheet error markers ('#DIV/0!') like missing values, then set
# every missing value to 0
llm_output = llm_output.replace('#DIV/0!', np.nan).fillna(0)


def _to_numeric_if_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if any value cannot be parsed.

    Explicit replacement for ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Make every column numeric where possible; others (e.g. date strings) stay as they are
checkpoint = llm_output.apply(_to_numeric_if_possible)

In [370]:
# Selected supporting (non-text) features, first group
supporting_features_1 = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services',
    'Consumer Discretionary',
    'Senior secured',
    'Time to maturity',
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Selected supporting (non-text) features, second group
supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

In [371]:
# Ensure the columns are in datetime format
checkpoint['Date'] = pd.to_datetime(checkpoint['Date'])
checkpoint['Ddate'] = pd.to_datetime(checkpoint['Ddate'])

# Time from 'Date' to 'Ddate'
# NOTE(review): presumably call date and default date — confirm against the data dictionary
checkpoint['t_delta'] = checkpoint['Ddate'] - checkpoint['Date']

# Keep rows whose t_delta is at most 180 days (negative gaps are kept too).
# .copy() makes llm_output an independent frame, so the column assignments in
# later cells do not operate on a slice of `checkpoint`.
llm_output = checkpoint[checkpoint['t_delta'] <= pd.Timedelta('180 days')].copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Financial controls and LLM score columns are min-max scaled column by column
columns_to_scale = supporting_features_1 + supporting_features_2 + qna_labels + presentation_labels

scaler = MinMaxScaler()
llm_output[columns_to_scale] = scaler.fit_transform(llm_output[columns_to_scale])

In [381]:
# Persist the prepared feature table as a pipe-delimited CSV (index column not written)
llm_output.to_csv('transcripts/cot_llm_output_final.csv', index=False, sep='|')

In [383]:
# Reload the saved feature table from disk (pipe-delimited)
llm_output = pd.read_csv('transcripts/cot_llm_output_final.csv', delimiter='|')

In [386]:
def create_colored_chart(model, type):
    """Plot the largest statistically significant coefficients, colored by feature group.

    Keeps coefficients with p < 0.05 (intercept excluded), selects the ten with
    the largest absolute value and draws them as horizontal bars with their
    standard errors.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must expose ``pvalues``, ``params`` and ``bse`` as label-indexed Series.
    type : str
        Name of the feature set; it selects which legend is drawn. Names that
        are not listed below get no legend. (The parameter name shadows the
        builtin but is kept so existing callers keep working.)

    Returns
    -------
    None
        The value returned by ``plt.show()``.

    Notes
    -----
    Reads the notebook-level lists ``supporting_features_1``,
    ``supporting_features_2``, ``presentation_labels`` and ``qna_labels`` to
    decide each bar's color.
    """
    # Filter significant features and drop the intercept by name. A positional
    # [1:] would remove a real feature whenever the intercept is not significant.
    significant_features = model.pvalues[model.pvalues < 0.05].index
    importances = model.params[significant_features].drop('const', errors='ignore')

    # Ten largest by absolute value, then ordered by signed value for plotting
    top_features = importances.abs().sort_values(ascending=False).head(10).index
    importances = importances[top_features].sort_values()

    # Emotion categories by polarity; each also applies to its '_analysts' variant
    emotion_groups = {
        'positive': [
            'relief', 'desire', 'excitement', 'pride', 'gratitude', 'optimism',
            'approval', 'caring', 'curiosity', 'surprise', 'admiration',
            'realization', 'joy',
        ],
        'negative': [
            'remorse', 'nervousness', 'confusion', 'annoyance', 'sadness',
            'disappointment', 'embarrassment', 'disapproval',
        ],
        'strong_negative': ['fear', 'anger'],
    }
    emotion_types = {}
    for polarity, names in emotion_groups.items():
        for name in names:
            emotion_types[name] = polarity
            emotion_types[f'{name}_analysts'] = polarity

    # Map colors to each emotion type
    color_mapping = {
        'positive': 'green',
        'strong_negative': 'red',
        'negative': 'gray',
    }

    # One color per plotted feature
    colors = []
    for feature in importances.index:
        if feature in supporting_features_1 or feature in supporting_features_2:
            colors.append('blue')
        elif feature in presentation_labels or feature in qna_labels:
            colors.append('orange')
        elif feature in emotion_types:
            colors.append(color_mapping[emotion_types[feature]])
        else:
            colors.append('purple')  # Default for unknown features

    # Horizontal bar plot of the selected coefficients
    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # add std errors
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    # Add labels and title
    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Significant Features and their Impact')

    # Legend building blocks as (color, label) pairs
    financial_legend = [('blue', 'Financial Data')]
    llm_legend = [('orange', 'LLM Labels')]
    emotion_legend = [
        ('green', 'Positive Emotion'),
        ('gray', 'Negative Emotion'),
        ('red', 'Strong Negative Emotion'),
    ]
    new_metrics_legend = [('purple', 'New Metrics')]

    legends_by_type = {
        'Management Emotions': emotion_legend,
        'Analyst Emotions': emotion_legend,
        'All Emotions': emotion_legend,
        'All Earnings call': llm_legend + emotion_legend,
        'Financial Data & Presentation Labels': financial_legend + llm_legend,
        'Financial Data & Q&A Labels': financial_legend + llm_legend,
        'Financial Data 1&2 & LLM Labels': financial_legend + llm_legend,
        'Financial Data & All Emotions': financial_legend + emotion_legend,
        'Financial Data & Management Emotions': financial_legend + emotion_legend,
        'Financial Data & Analyst Emotions': financial_legend + emotion_legend,
        'Financial Data & All Earnings call': financial_legend + llm_legend + emotion_legend,
        # Same legend under the '1&2' spelling, matching 'Financial Data 1&2 & LLM Labels'
        'Financial Data 1&2 & All Earnings call': financial_legend + llm_legend + emotion_legend,
        'New Metrics': financial_legend + new_metrics_legend,
    }

    legend_entries = legends_by_type.get(type)
    if legend_entries:
        handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color, _ in legend_entries]
        labels = [label for _, label in legend_entries]
        plt.legend(handles, labels)

    # Show the plot
    return plt.show()


def create_significant_chart(model):
    """Plot the ten largest coefficients and highlight the significant ones.

    All coefficients except the intercept are ranked by absolute value; the
    top ten are drawn as horizontal bars with their standard errors. Bars with
    p < 0.05 are colored '#009682', the others grey.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must expose ``pvalues``, ``params`` and ``bse`` as label-indexed
        Series. As a side effect the index of significant features is stored
        on it as ``model.significance``.

    Returns
    -------
    None
        The value returned by ``plt.show()``.
    """
    significant_color = '#009682'
    other_color = 'grey'

    # select significant features (also kept on the model, as before)
    model.significance = model.pvalues[model.pvalues < 0.05].index

    # Drop the intercept by name rather than by position
    importances = model.params.drop('const', errors='ignore')

    # Ten largest by absolute value, then ordered by signed value for plotting
    top_features = importances.abs().sort_values(ascending=False).head(10).index
    importances = importances[top_features].sort_values()

    colors = [
        significant_color if feature in model.significance else other_color
        for feature in importances.index
    ]

    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # add std errors
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    # Explicit legend handles: labelling the bar container would show the
    # color of the first bar, which can be grey, under the label 'Significant'.
    handles = [
        plt.Rectangle((0, 0), 1, 1, color=significant_color),
        plt.Rectangle((0, 0), 1, 1, color=other_color),
    ]
    plt.legend(handles, ['Significant', 'Not significant'])
    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Features Impact on Recovery Rate')
    return plt.show()

In [392]:
# Named feature combinations; one regression is fitted per entry.
# The keys are also passed to create_colored_chart to choose its legend.
feature_sets = {
    # --- earnings-call features only ---
    'Presentation Labels': presentation_labels,
    'Q&A Labels': qna_labels,
    'LLM Labels': presentation_labels + qna_labels,
    'All Earnings call': (
        presentation_labels + qna_labels + emotions + analyst_emotions
    ),
    # --- earnings-call features together with financial data ---
    'Financial Data 1&2 & LLM Labels': (
        supporting_features_1 + supporting_features_2
        + presentation_labels + qna_labels
    ),
    'Financial Data 1&2 & All Earnings call': (
        supporting_features_1 + supporting_features_2
        + presentation_labels + qna_labels + emotions + analyst_emotions
    ),
}

In [393]:
'''BOND LEVEL'''
# Target column RR followed by every candidate feature group
final_df = llm_output[
    [
        'RR',
        *supporting_features_1,
        *supporting_features_2,
        *qna_labels,
        *presentation_labels,
        *emotions,
        *analyst_emotions,
    ]
]

In [None]:
'''Dealer Recovery Rate Prediction'''

# Fit one OLS model per feature set, report in-sample metrics and draw both charts
for key, features in feature_sets.items():

    print(f"Model for {key}")

    final_df = llm_output[['RR'] + features]

    # Target and design matrix (with an intercept column added)
    y = final_df['RR']
    X = sm.add_constant(final_df.drop(columns=['RR']))

    model = sm.OLS(y, X).fit()

    # Coefficients, p-values and significance levels
    print(model.summary())

    # Only the model with every feature group is exported
    if key == 'Financial Data 1&2 & All Earnings call':
        model.save('models/aggregated_all_features_model.pkl')

    # Predictions on the same rows the model was fitted on
    y_pred = model.predict(X)

    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

    create_significant_chart(model)
    create_colored_chart(model, key)