In [3]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Add a constant to the model (intercept)
from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
#from google.colab import drive
#drive.mount('/content/drive')

#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [5]:
# Switch between the two transcript exports (CoT presumably means chain-of-thought
# prompting -- TODO confirm).
CoT = False

# Both exports are pipe-delimited CSV files.
llm_outputs = pd.read_csv(
    'transcripts/cot_llm_output_final.csv' if CoT else 'transcripts/LLM_outputs_final.csv',
    delimiter='|',
)

In [6]:
# Financial control variables, first set. The order is kept because it determines the
# column order of the regression design matrix built further down.
supporting_features_1 = [
    # economic indicators (also used by build_heatmap_macro)
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    # presumably sector dummies as well -- TODO confirm
    'Communication Services',
    'Consumer Discretionary',
    # bond / issuer characteristics
    'Senior secured',
    'Time to maturity',
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    # industry indicator columns (also used by build_heatmap_industries)
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Financial control variables, second set.
supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

# Label columns describing the presentation part of a call.
presentation_labels = [
    'negative_sentiment',
    'positive_sentiment',
    'uncertainty',
    'optimistic',
    'pessimistic',
    'vagueness',
    'language_accessibility_presentation',
    'liquidity_position',
    'debt_leverage_stress',
    'operational_trends',
    'industry_positioning',
    'asset_quality',
    'recovery_strategies',
    'legal_issues',
    'macroeconomic',
]

# Label columns describing the Q&A part of a call.
qna_labels = [
    'analyst_concerns',
    'responsiveness',
    'confidence',
    'evasiveness',
    'depth',
    'analyst_satisfaction',
    'language_accessibility_qna',
]

# The Q&A labels split by speaker side.
qna_mgmt = [
    'responsiveness',
    'confidence',
    'depth',
    'evasiveness',
    'language_accessibility_qna',
]
qna_analysts = [
    'analyst_concerns',
    'analyst_satisfaction',
]

# Emotion label columns used in the analysis. The labels amusement, disgust, gratitude,
# grief and love are deliberately left out of this list.
emotions = [
    'admiration', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'embarrassment',
    'excitement', 'fear', 'joy', 'nervousness', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise',
]

# Same labels with the '_analysts' suffix appended to each name.
analyst_emotions = [f'{emotion}_analysts' for emotion in emotions]

# Column names of the primary-dealer feature set (not referenced elsewhere in the
# visible part of the notebook).
primary_dealer_features = [
    'HHI_number',
    'TimeToMaturity',
    'TimeSinceOffering',
    'Offering_amount',
    'Rating',
    'JunkDummy',
    'UnratedDummy',
    'Enhanced',
    'Redeemable',
    'SinkingFund',
    'BOND_COUPON',
    'IQ_CDS_availability',
    'COVENANTS',
    'AvgTransVol',
    'TRADES_VOL',
    'amihud_ILLIQ_trades',
    'price_dispersion_volumeweighted',
    'EquityValue',
    'DefaultBarrier2',
    'NumberEmployees',
    'IndDis1',
    'IndDis2',
    'Slope',
    'X90DayDR',
    'GDP',
    'TradeSize_Retail',
    'TradeSize_SmallInstitutional',
    'Seniority_SeniorSubordinate',
    'Seniority_SeniorUnsecured',
    'Seniority_SubordinateJunior',
    'Default_DefRating',
    'Default_Distressed_exchange',
    'Default_Liquidation_C7',
    'Default_RiskRating',
]

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale all financial control columns and write them back into llm_outputs.
# The fitted scaler is kept in `scaler`.
financial_columns = supporting_features_1 + supporting_features_2
scaler = MinMaxScaler()
llm_outputs[financial_columns] = scaler.fit_transform(llm_outputs[financial_columns])

In [8]:
def create_significant_chart(model):
    """
    Plot the ten coefficients with the largest absolute value as a horizontal bar chart.

    Bars of features whose p-value is below 0.05 are drawn in green (#009682), all
    other bars in grey. Black markers show the coefficient standard errors.

    Parameters:
        model: Fitted regression results exposing `params`, `pvalues` and `bse` as
            label-indexed pandas Series (e.g. the result of `sm.OLS(...).fit()`).

    Returns:
        The return value of `plt.show()`.

    Side effects:
        Stores the index of significant features on `model.significance`.
    """
    significant_color = '#009682'
    other_color = 'grey'

    # select significant features (kept on the model, as before)
    model.significance = model.pvalues[model.pvalues < 0.05].index

    # Remove the intercept by label. Slicing off the first entry would discard a real
    # feature whenever the design matrix has no leading 'const' column.
    coefficients = model.params.drop('const', errors='ignore')

    # Keep the ten largest coefficients by absolute value, then order them by signed
    # value so the bars run from most negative to most positive.
    top_labels = coefficients.abs().sort_values(ascending=False).head(10).index
    importances = coefficients[top_labels].sort_values()

    colors = [
        significant_color if feature in model.significance else other_color
        for feature in importances.index
    ]

    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # add std errors
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    # Explicit legend handles: a bare label list is attached to the first plotted
    # artist, so 'Significant' could end up next to a grey swatch.
    legend_handles = [
        plt.Rectangle((0, 0), 1, 1, color=significant_color),
        plt.Rectangle((0, 0), 1, 1, color=other_color),
    ]
    plt.legend(legend_handles, ['Significant', 'Not significant'])
    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Features Impact on Recovery Rate')
    return plt.show()

def create_colored_chart(model, type):
    """
    Plot the (up to) 15 largest significant coefficients, colored by feature family.

    Colors: blue for features listed in `supporting_features_1` / `supporting_features_2`,
    orange for `presentation_labels` / `qna_labels`, green / gray / red for positive /
    negative / strongly negative emotions, and purple for any other feature.

    Parameters:
        model: Fitted regression results exposing `params`, `pvalues` and `bse` as
            label-indexed pandas Series.
        type (str): Name of the feature combination; selects which legend is drawn.
            Unrecognised names produce a chart without a legend.

    Returns:
        The return value of `plt.show()`.
    """
    # Keep only features significant at the 5% level.
    significant_features = model.pvalues[model.pvalues < 0.05].index
    importances = model.params[significant_features]

    # Remove the intercept by label. After the significance filter the first entry is
    # only the intercept if the intercept itself is significant, so dropping by
    # position would silently discard a real feature otherwise.
    importances = importances.drop('const', errors='ignore')

    # Keep the 15 largest coefficients by absolute value, ordered by signed value.
    top_labels = importances.abs().sort_values(ascending=False).head(15).index
    importances = importances[top_labels].sort_values()

    # Polarity of each emotion label; the '_analysts' variants share the polarity of
    # the base label.
    base_emotion_types = {
        'relief': 'positive',
        'remorse': 'negative',
        'nervousness': 'negative',
        'desire': 'positive',
        'fear': 'strong_negative',
        'excitement': 'positive',
        'confusion': 'negative',
        'pride': 'positive',
        'annoyance': 'negative',
        'gratitude': 'positive',
        'anger': 'strong_negative',
        'optimism': 'positive',
        'sadness': 'negative',
        'approval': 'positive',
        'caring': 'positive',
        'disappointment': 'negative',
        'curiosity': 'positive',
        'surprise': 'positive',
        'admiration': 'positive',
        'embarrassment': 'negative',
        'realization': 'positive',
        'disapproval': 'negative',
        'joy': 'positive',
    }
    emotion_types = dict(base_emotion_types)
    emotion_types.update(
        {f'{name}_analysts': polarity for name, polarity in base_emotion_types.items()}
    )

    color_mapping = {
        'positive': 'green',
        'strong_negative': 'red',
        'negative': 'gray',
    }

    # One color per plotted feature, decided by the family the feature belongs to.
    colors = []
    for feature in importances.index:
        if feature in supporting_features_1 or feature in supporting_features_2:
            colors.append('blue')
        elif feature in presentation_labels or feature in qna_labels:
            colors.append('orange')
        elif feature in emotion_types:
            colors.append(color_mapping[emotion_types[feature]])
        else:
            colors.append('purple')  # Default for unknown features

    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # add std errors
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Significant Features and their Impact')

    # Legend entries as (color, label) pairs, looked up by chart type.
    financial = [('blue', 'Financial Data')]
    llm_labels = [('orange', 'LLM Labels')]
    emotion_entries = [
        ('green', 'Positive Emotion'),
        ('gray', 'Negative Emotion'),
        ('red', 'Strong Negative Emotion'),
    ]
    legend_by_type = {
        'Management Emotions': emotion_entries,
        'Analyst Emotions': emotion_entries,
        'All Emotions': emotion_entries,
        'All Earnings call': llm_labels + emotion_entries,
        'Financial Data & Presentation Labels': financial + llm_labels,
        'Financial Data & Q&A Labels': financial + llm_labels,
        'Financial Data 1&2 & LLM Labels': financial + llm_labels,
        'Financial Data & All Emotions': financial + emotion_entries,
        'Financial Data & Management Emotions': financial + emotion_entries,
        'Financial Data & Analyst Emotions': financial + emotion_entries,
        'Financial Data & All Earnings call': financial + llm_labels + emotion_entries,
        'New Metrics': financial + [('purple', 'New Metrics')],
    }

    legend_entries = legend_by_type.get(type)
    if legend_entries:
        handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color, _ in legend_entries]
        labels = [label for _, label in legend_entries]
        plt.legend(handles, labels)

    # Show the plot
    return plt.show()

In [9]:
def build_score_overview(df, window=30):
    """
    Plot MTS, CAMI and ASS over time with a rolling mean and the raw points.

    Parameters:
        df (pd.DataFrame): Must contain the columns 'Date', 'MTS', 'CAMI', 'ASS' and
            'call_ID'. Rows are de-duplicated on ('Date', 'call_ID').
        window (int): Number of consecutive rows averaged by the rolling mean. The
            window counts rows, not calendar days.

    Returns:
        The return value of `plt.show()`.

    Side effects:
        Applies the seaborn "whitegrid" style globally.
    """
    score_styles = [
        ('MTS', '-', 'steelblue'),
        ('CAMI', '--', 'darkorange'),
        ('ASS', ':', 'seagreen'),
    ]

    # Work on an explicit copy so the column assignment below does not write into a
    # slice of the caller's frame (SettingWithCopyWarning).
    graph_df = (
        df[['Date', 'MTS', 'CAMI', 'ASS', 'call_ID']]
        .drop_duplicates(subset=['Date', 'call_ID'])
        .copy()
    )

    # Set Seaborn style for a cleaner look
    sns.set(style="whitegrid")

    graph_df['Date'] = pd.to_datetime(graph_df['Date'])
    # A rolling mean and a line plot over time only make sense on chronologically
    # ordered rows.
    graph_df = graph_df.sort_values('Date', kind='stable')

    plt.figure(figsize=(14, 8))

    # Smoothed trends
    for score, linestyle, color in score_styles:
        plt.plot(
            graph_df['Date'],
            graph_df[score].rolling(window=window).mean(),
            label=f'{score} ({window}-day avg)',
            linestyle=linestyle,
            color=color,
            linewidth=2,
        )

    # Original data as lighter points
    for score, _, color in score_styles:
        plt.scatter(graph_df['Date'], graph_df[score], color=color, alpha=0.4, s=10)

    # Formatting the x-axis for date readability
    axis = plt.gca()
    axis.xaxis.set_major_locator(mdates.MonthLocator(interval=3))  # major tick every 3 months
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Composite Score', fontsize=12)
    # The title previously said "7-day" although the window was 30.
    plt.title(f'Trends of MTS, CAMI, and ASS Over Time (with {window}-day Rolling Average)', fontsize=14)
    plt.legend()

    # Rotate first so tight_layout can reserve room for the rotated tick labels.
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt.show()

def build_heatmap_macro(df):
    """
    Draw a heatmap of the correlations between the composite scores and the
    economic indicator columns.

    Parameters:
        df (pd.DataFrame): Must contain 'Date', 'call_ID', the score columns
            'MTS', 'CAMI', 'ASS' and the four economic indicator columns.

    Returns:
        The return value of `plt.show()`.
    """
    score_columns = ['MTS', 'CAMI', 'ASS']
    economic_factors = [
        'CBOE DJIA Volatility Index',
        'NASDAQ 100 Index return',
        'Manufacturers inventories to sales ratio',
        '30 year conventional mortgage rate',
    ]

    # Fully identical rows are collapsed before correlating.
    unique_rows = df[['Date'] + score_columns + ['call_ID'] + economic_factors].drop_duplicates()

    # Full correlation matrix, then only the score-vs-indicator rectangle of it.
    full_matrix = unique_rows[score_columns + economic_factors].corr()
    score_vs_macro = full_matrix.loc[score_columns, economic_factors]

    plt.figure(figsize=(8, 5))
    sns.heatmap(score_vs_macro, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Correlation of MTS, CAMI, ASS with Economic Indicators")
    return plt.show()

def build_heatmap_industries(df):
    """
    Draw a heatmap of the mean MTS, CAMI and ASS score per industry.

    Parameters:
        df (pd.DataFrame): Must contain 'call_ID', the three score columns and one
            indicator column per industry; a row counts towards an industry when
            its indicator equals 1.

    Returns:
        The return value of `plt.show()`.
    """
    score_columns = ['MTS', 'CAMI', 'ASS']
    industries = [
        'Industrials',
        'Consumer Staples',
        'Financials',
        'Energy',
        'Health Care',
        'Utilities',
        'Information Technology',
        'Real Estate',
    ]

    # One row per call.
    per_call = df[score_columns + ['call_ID'] + industries].drop_duplicates(subset=['call_ID'])

    # Columns are industries, rows are the three scores.
    industry_scores_df = pd.DataFrame({
        industry: per_call.loc[per_call[industry] == 1, score_columns].mean()
        for industry in industries
    })

    plt.figure(figsize=(10, 6))
    sns.heatmap(industry_scores_df, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Average MTS, CAMI, ASS Scores by Industry")
    return plt.show()

In [10]:
def build_correlation_map(df, features):
    """
    Draw a single-column heatmap with the correlation of each feature with 'RR'.

    Parameters:
        df (pd.DataFrame): Must contain every column in `features` plus 'RR'.
        features (list): Column names shown on the y-axis.

    Returns:
        The return value of `plt.show()`.
    """
    # Correlate the features together with RR, then keep only the RR column for
    # the feature rows.
    rr_correlations = df[features + ['RR']].corr().loc[features, ['RR']]

    plt.figure(figsize=(8, 5))
    sns.heatmap(rr_correlations, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Correlation: Features and RR")
    plt.ylabel("Composite Metrics")
    return plt.show()

In [143]:
def create_loadings(df, feature_groups, best, dealer=False):
    """
    Computes PCA weights and derives composite scores for the provided feature groups.

    Each group is reduced to one column: the dot product of its feature columns with
    the weights of the first principal component fitted on those columns.

    Parameters:
        df (pd.DataFrame): The input dataframe containing feature values. It is
            modified in place (one new column per group, plus the composite columns
            when requested) and also returned.
        feature_groups (dict): A dictionary where keys are group names, and values are lists of feature names.
        best (bool): If true, additionally compute the composite scores 'MTS', 'CAMI'
            and 'ASS' from the group columns 'Openess', 'Missing_Transparency',
            'Finance', 'General_Tone', 'Mgmt_Emotions', 'Operations',
            'Analyst_Critical', 'Analyst_Positive' and 'Analyst_Negative'.
        dealer (bool): If true, flip the sign of the 'Analyst_Confirmation' column.

    Returns:
        pd.DataFrame: The dataframe with computed scores and composite metrics added.

    Raises:
        KeyError: If `best` or `dealer` needs a column that is neither a group in
            `feature_groups` nor already present in `df`. Raised before `df` is touched.
    """

    def pca_weights(df, features, n_components=1):
        """
        Computes the PCA weights for a subset of features.
        If the sum of weights is negative, it inverts the weights.

        Parameters:
            df (pd.DataFrame): The input dataframe containing feature values.
            features (list): A list of feature names to include in PCA.
            n_components (int): Number of principal components to compute.

        Returns:
            np.ndarray: The weights of the first principal component.
        """
        pca = PCA(n_components=n_components)
        pca.fit(df[features])
        weights = pca.components_[0]

        # The sign of a principal component is arbitrary; orient it so the weights
        # sum to a non-negative value.
        if weights.sum() < 0:
            weights = -weights
        return weights

    # Check the requested post-processing up front so a missing group is reported
    # with a clear message instead of failing after df was already partly modified.
    required = []
    if dealer:
        required.append('Analyst_Confirmation')
    if best:
        required += [
            'Openess', 'Missing_Transparency',
            'Finance', 'General_Tone', 'Mgmt_Emotions', 'Operations',
            'Analyst_Critical', 'Analyst_Positive', 'Analyst_Negative',
        ]
    missing = [name for name in required if name not in feature_groups and name not in df.columns]
    if missing:
        raise KeyError(f"create_loadings: missing group columns {missing}")

    # Store computed scores in the dataframe
    for group_name, features in feature_groups.items():
        weights = pca_weights(df, features)
        df[group_name] = np.dot(df[features], weights)

    if dealer:
        # reverse sign of Analyst_Confirmation due to wrong sign in the data
        df['Analyst_Confirmation'] = -df['Analyst_Confirmation']

    if best:
        # Create composite scores
        df['MTS'] = df['Openess'] - df['Missing_Transparency']
        df['CAMI'] = df['Finance'] - df['General_Tone'] + df['Mgmt_Emotions'] - df['Operations']
        df['ASS'] = df['Analyst_Critical'] + df['Analyst_Positive'] - df['Analyst_Negative']

    return df

In [12]:
# Feature-group configurations: each maps a group name to the label columns that
# create_loadings combines into that group's score.
best_features = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Openess=['language_accessibility_presentation', 'language_accessibility_qna', 'depth', 'debt_leverage_stress', 'legal_issues'],
    Finance=['liquidity_position', 'asset_quality'],
    Operations=['operational_trends', 'industry_positioning'],
    General_Tone=['optimistic', 'positive_sentiment', 'uncertainty', 'macroeconomic'],
    Mgmt_Emotions=['excitement', 'admiration', 'confusion', 'relief'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
    Analyst_Negative=['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
    Analyst_Positive=['analyst_satisfaction', 'optimism_analysts'],
)

most_insightful = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'positive_sentiment'],
    Operations=['operational_trends', 'industry_positioning', 'legal_issues'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
)

most_insightful_emotion = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'positive_sentiment'],
    Operations=['operational_trends', 'industry_positioning', 'legal_issues'],
    Mgmt_Emotions=['excitement', 'admiration', 'relief', 'confusion', 'optimistic'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
    Analyst_Negative=['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
)

# name -> [feature groups, whether the MTS/CAMI/ASS composites are built as well]
analysis = dict(
    best=[best_features, False],
    best_composite=[best_features, True],
    most_insightful=[most_insightful, False],
    most_insightful_emotion=[most_insightful_emotion, False],
)


In [None]:
# Correlation analysis
df = llm_outputs.copy()

print('Best feature sets:')
feature_groups = best_features
composite_df = create_loadings(df, feature_groups, True)
build_correlation_map(composite_df, list(feature_groups.keys()))
build_correlation_map(composite_df, ['MTS', 'CAMI', 'ASS'])

# Individual feature analysis: one RR-correlation map per family of raw label columns.
individual_feature_sets = [
    ('Presentation labels:', presentation_labels),
    ('Q&A labels:', qna_labels),
    ('Management emotions:', emotions),
    ('Analyst emotions:', analyst_emotions),
]
for heading, feature_names in individual_feature_sets:
    print(heading)
    # create_loadings is re-run on the same frame before every map, as before.
    composite_df = create_loadings(df, feature_groups, False)
    build_correlation_map(composite_df, feature_names)

In [14]:
# Feature-group configurations compared in the PCA + OLS loop. Each maps a group
# name to the label columns that make up the group.
pres = dict(
    Missing_Transparency=['vagueness', 'evasiveness'],
    Openess=['language_accessibility_presentation', 'debt_leverage_stress', 'legal_issues'],
)

qna_ones = dict(
    Missing_Transparency=['responsiveness'],
    Openess=['language_accessibility_qna', 'depth'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
)

compare = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Openess=['language_accessibility_presentation', 'language_accessibility_qna', 'depth', 'debt_leverage_stress', 'legal_issues'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
)

emotion_comp = dict(
    Mgmt_Emotions=['excitement', 'admiration', 'confusion', 'relief'],
    Analyst_Negative=['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
    Analyst_Positive=['analyst_satisfaction', 'optimism_analysts'],
)

transparency_emotions = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Openess=['language_accessibility_presentation', 'language_accessibility_qna', 'depth', 'debt_leverage_stress', 'legal_issues'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
    Mgmt_Emotions=['excitement', 'admiration', 'confusion', 'relief'],
    Analyst_Negative=['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
    Analyst_Positive=['analyst_satisfaction', 'optimism_analysts'],
)

# NOTE: this name shadows the builtin all() for the rest of the notebook.
all = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Openess=['language_accessibility_presentation', 'language_accessibility_qna', 'depth', 'debt_leverage_stress', 'legal_issues'],
    Finance=['liquidity_position', 'asset_quality'],
    Operations=['operational_trends', 'industry_positioning'],
    General_Tone=['optimistic', 'positive_sentiment', 'uncertainty', 'macroeconomic'],
    Mgmt_Emotions=['excitement', 'admiration', 'confusion', 'relief'],
    Analyst_Critical=['analyst_concerns', 'disapproval_analysts', 'remorse_analysts'],
    Analyst_Negative=['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
    Analyst_Positive=['analyst_satisfaction', 'optimism_analysts'],
)

# name -> [feature groups, whether the MTS/CAMI/ASS composites are built as well].
# Rebinds the `analysis` dict defined in the earlier cell.
analysis = {
    'Presentation Transparency': [pres, False],
    'Q&A Transparency': [qna_ones, False],
    'Compare': [compare, False],
    'Emotion Comparison': [emotion_comp, False],
    'Transparency & Emotions': [transparency_emotions, False],
    'All': [all, False],
    'Computed': [all, True],
}

In [148]:
# Second family of feature-group configurations. The 'Openess' group is disabled in
# all of them, and 'Uncertainty' is only part of trans_oper_ana.
trans = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
)

trans_oper = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Operations=['operational_trends', 'industry_positioning', 'positive_sentiment'],
    Optimistic=['optimistic'],
)

trans_oper_ana = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Operations=['operational_trends', 'industry_positioning', 'positive_sentiment'],
    Optimistic=['optimistic'],
    Uncertainty=['uncertainty'],
    Analyst_Support=['analyst_concerns', 'analyst_satisfaction'],
)

emotion_both = dict(
    Mgmt_Positive=['relief', 'excitement', 'pride', 'optimism', 'approval', 'admiration', 'joy'],
    Analyst_Negative=['confusion_analysts', 'sadness_analysts', 'disappointment_analysts', 'embarrassment_analysts', 'disapproval_analysts', 'fear_analysts', 'anger_analysts'],
)

# NOTE: this name shadows the builtin all() and replaces the dict of the same name
# from the previous configuration cell.
all = dict(
    Missing_Transparency=['vagueness', 'evasiveness', 'responsiveness'],
    Operations=['operational_trends', 'industry_positioning', 'positive_sentiment'],
    Optimistic=['optimistic'],
    Mgmt_Positive=['relief', 'excitement', 'pride', 'optimism', 'approval', 'admiration', 'joy'],
    Analyst_Negative=['confusion_analysts', 'sadness_analysts', 'disappointment_analysts', 'embarrassment_analysts', 'disapproval_analysts', 'fear_analysts', 'anger_analysts'],
    Analyst_Support=['analyst_concerns', 'analyst_satisfaction'],
)

# name -> [feature groups, whether the MTS/CAMI/ASS composites are built as well].
# Rebinds `analysis` once more.
analysis = {
    'Transparency': [trans, False],
    'Transparency & Operations': [trans_oper, False],
    'Transparency, Operations & Analysts': [trans_oper_ana, False],
    'Emotions': [emotion_both, False],
    'All': [all, False],
}

In [None]:
# Composite metrics via PCA: for every configuration in `analysis`, build the group
# scores and fit an in-sample OLS regression of RR on the financial controls plus
# those scores.
for key, (feature_groups, composite) in analysis.items():
    print('')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('')
    print(key)

    df = llm_outputs.copy()
    composite_df = create_loadings(df, feature_groups, composite)

    # Either the three composite scores or one column per feature group.
    score_columns = ['MTS', 'CAMI', 'ASS'] if composite else list(feature_groups.keys())
    final_df = composite_df[['RR'] + supporting_features_1 + supporting_features_2 + score_columns]
    final_df = final_df.reset_index(drop=True)

    # In-sample regression: the same rows serve as training and evaluation data.
    y_train = final_df['RR']
    y_test = final_df['RR']
    X_train = sm.add_constant(final_df.drop(columns=['RR']))
    X_test = sm.add_constant(final_df.drop(columns=['RR']))

    model = sm.OLS(y_train, X_train).fit()

    # The summary includes p-values and significance levels.
    print(model.summary())

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

    # create_colored_chart(model, 'New Metrics')

In [None]:
# Deeper analysis - interpretability
#
# `analysis` is rebound by the later configuration cells, so on a top-to-bottom run
# it no longer has the 'best' / 'best_composite' entries and indexing it raised
# KeyError. Both entries pointed at best_features and differed only in the composite
# flag, so the configuration is derived directly here.
only_one = True
feature_groups = best_features
# only_one=True  -> individual PCA groups only (former 'best' entry)
# only_one=False -> additionally build MTS / CAMI / ASS (former 'best_composite' entry)
# NOTE(review): the trend-line plots below read composite_df['MTS'/'CAMI'/'ASS'], which
# create_loadings only creates when composite is True -- confirm the intended setting.
composite = not only_one

df = llm_outputs.copy()
composite_df = create_loadings(df, feature_groups, composite)

In [51]:
# Function to plot with a trend line
def plot_with_trendline(x, y, xlabel, ylabel, title, color):
    """
    Scatter `y` against `x` and overlay a least-squares straight line.

    Parameters:
        x, y: Equal-length numeric sequences (e.g. pandas Series).
        xlabel (str): Label of the x-axis.
        ylabel (str): Label of the y-axis.
        title (str): Figure title.
        color: Matplotlib color of the scatter points; the trend line is always red.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(x, y, label='Data Points', alpha=0.7, color=color)

    # Degree-1 polynomial fit, evaluated at the observed x values.
    line_coefficients = np.polyfit(x, y, 1)
    plt.plot(x, np.polyval(line_coefficients, x), label='Trend Line', color='red')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)
    plt.show()



In [None]:
# One trend-line plot per composite score, always with RR on the x-axis:
# (score column, axis label of the score, point color)
trendline_plots = [
    ('MTS', 'Management Transparency Score (MTS)', 'blue'),
    ('CAMI', 'Crisis Awareness and Management Index (CAMI)', 'orange'),
    ('ASS', 'Analysts Urgency Score (ASS)', 'green'),
]

for score, score_label, point_color in trendline_plots:
    plot_with_trendline(
        composite_df['RR'],
        composite_df[score],
        'Recovery Rates (RR)',
        score_label,
        f'Recovery Rates (RR) vs {score_label}',
        point_color,
    )

In [None]:
# Calculate absolute changes in recovery rate for each metric


def _rr_changes_for_metric(data, metric, tail_share=0.2):
    """
    Compare the mean RR of rows with extreme values of `metric` to the overall mean RR.

    Parameters:
        data (pd.DataFrame): Must contain the columns `metric` and 'RR'.
        metric (str): Column by which the rows are ranked.
        tail_share (float): Share of rows forming the top and the bottom group.

    Returns:
        dict: The metric name and four differences (subset mean RR minus overall mean
        RR): top share, bottom share, rows at least one standard deviation above the
        metric mean, and rows at least one standard deviation below it.
    """
    data_sorted = data.sort_values(by=metric)
    average_rr = data_sorted['RR'].mean()

    # head()/tail() give an empty frame for a size of 0, whereas iloc[-0:] would
    # select every row and report the overall mean as the "top" group.
    tail_size = int(len(data_sorted) * tail_share)
    top_rows = data_sorted.tail(tail_size)
    bottom_rows = data_sorted.head(tail_size)

    metric_mean = data_sorted[metric].mean()
    metric_std = data_sorted[metric].std()

    # Rows beyond one standard deviation. If there are none, fall back to the single
    # row with the most extreme metric value (not to column-wise maxima/minima, which
    # would mix values from different rows).
    ranked = data_sorted.dropna(subset=[metric])
    upper_rows = data_sorted[data_sorted[metric] >= metric_mean + metric_std]
    if upper_rows.empty:
        upper_rows = ranked.tail(1)
    lower_rows = data_sorted[data_sorted[metric] <= metric_mean - metric_std]
    if lower_rows.empty:
        lower_rows = ranked.head(1)

    return {
        "Metric": metric,
        "Top 20% Change in RR (%)": top_rows['RR'].mean() - average_rr,
        "Bottom 20% Change in RR (%)": bottom_rows['RR'].mean() - average_rr,
        "1 Std Above Mean Change in RR (%)": upper_rows['RR'].mean() - average_rr,
        "1 Std Below Mean Change in RR (%)": lower_rows['RR'].mean() - average_rr,
    }


analysis = {
    'All': [all, False],
}

results = []
for key in analysis:
    feature_groups = analysis[key][0]
    composite = analysis[key][1]
    df = llm_outputs.copy()
    composite_df = create_loadings(df, feature_groups, composite)

    # Either the three composite scores or one metric per feature group.
    metrics = ['MTS', 'CAMI', 'ASS'] if composite else list(feature_groups)

    for m in metrics:
        results.append(_rr_changes_for_metric(composite_df, m))

# Convert results to DataFrame and display
change_df = pd.DataFrame(results)
change_df


In [None]:
import pandas as pd
import shap
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Separate the target column "RR" from the predictor columns.
y = final_df["RR"]
X = final_df.drop(columns="RR")

# NOTE: no hold-out split is performed here. The train/test names are aliases
# of the same full dataset, so the model fit and the SHAP analysis below are
# entirely in-sample.
X_train = X_test = X
y_train = y_test = y

# Gradient-boosted regressor with squared-error loss and a fixed seed.
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(
    X_train, y_train
)

# Explain the fitted model, using X_train both as the background data and as
# the rows to be explained.
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_train)

# Summary plot of the SHAP values against the feature values.
shap.summary_plot(shap_values, X_train)


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Global importance per feature: mean absolute SHAP value over all rows.
abs_shap = np.abs(shap_values.values)
feature_importance = abs_shap.mean(axis=0)

# Cluster the features into three groups by importance. KMeans needs a 2-D
# input, so the importance vector is reshaped into a single column.
importance_column = feature_importance.reshape(-1, 1)
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(importance_column)

# One row per feature with the cluster id it was assigned to.
clustered_features = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Cluster': clusters,
    }
)
print(clustered_features)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline fitted on X_train / y_train.
lin_model = LinearRegression().fit(X_train, y_train)

# SHAP values for the linear model, with X_train as the background data and
# as the rows being explained.
lin_explainer = shap.Explainer(lin_model, X_train)
lin_shap_values = lin_explainer(X_train)

# Summary plot of the linear model's SHAP values.
shap.summary_plot(lin_shap_values, X_train)


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt
from shap import KernelExplainer

# Initialize the SHAP Kernel Explainer. It only needs a prediction function and
# a background sample of the data.
def predict_fn(x):
    """Return the fitted model's predictions for the rows in `x`."""
    return model.predict(x)


# Draw the background rows reproducibly (fixed seed) and never ask for more
# rows than X_train has; an unseeded `.sample(100)` changes on every run and
# raises when fewer than 100 rows are available.
background = X_train.sample(n=min(100, len(X_train)), random_state=42)
explainer = KernelExplainer(predict_fn, background)

# NOTE: this rebinds `explainer` and `shap_values`, which an earlier cell set
# from `shap.Explainer`. Cells that read `shap_values.values` presumably need
# the earlier object -- verify the execution order before re-running them.
shap_values = explainer.shap_values(X_train, nsamples=100)


In [None]:
def _plot_dependence_subset(feature, shap_matrix, X_subset, interaction_feature, heading):
    """Draw one SHAP dependence plot for a row subset on its own 10x6 figure."""
    # An empty half (e.g. a constant interaction feature) has nothing to plot.
    if len(X_subset) == 0:
        print(f"Skipping '{heading}': no rows fall in this half of {interaction_feature}.")
        return

    # Pass the axes explicitly so SHAP draws on this figure instead of opening
    # its own and leaving a blank one behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(
        feature,
        shap_matrix,
        X_subset,
        interaction_index=interaction_feature,
        alpha=0.7,
        ax=ax,
        show=False,
    )
    ax.set_title(heading, fontsize=14, weight='bold')
    ax.set_xlabel(feature, fontsize=12)
    ax.set_ylabel(f"SHAP Value for {feature}", fontsize=12)
    ax.grid(alpha=0.4)
    fig.tight_layout()
    plt.show()


def enhanced_dependence_plot_top_bottom_split(feature, shap_values, X, interaction_feature, title):
    """
    Create SHAP dependence plots separated into top 50% and bottom 50% for the interaction feature.

    Rows whose interaction-feature value is strictly above the median go to the
    "Top 50%" plot; rows at or below the median go to the "Bottom 50%" plot.
    Rows where the interaction feature is NaN match neither condition and are
    left out of both plots. A half that ends up empty is skipped with a
    printed notice instead of being passed to SHAP.

    Parameters:
        feature (str): The feature to plot.
        shap_values (array-like or object with a ``.values`` attribute):
            SHAP values with one row per row of ``X``. A raw array is used as
            is; for an object exposing ``.values`` (e.g. ``shap.Explanation``)
            that array is used.
        X (pd.DataFrame): The dataset.
        interaction_feature (str): The feature to show interaction effects with.
        title (str): The title of the plot.

    Returns:
        None. The figures are shown as a side effect.
    """
    # Normalise to a plain array so boolean-mask row selection behaves the same
    # for every supported input type.
    shap_matrix = np.asarray(getattr(shap_values, "values", shap_values))

    # Build each mask once and reuse it for both the data and the SHAP rows.
    interaction_column = X[interaction_feature]
    median_value = interaction_column.median()
    top_mask = (interaction_column > median_value).to_numpy()
    bottom_mask = (interaction_column <= median_value).to_numpy()

    halves = (
        ("Top 50%", top_mask),
        ("Bottom 50%", bottom_mask),
    )
    for half_label, row_mask in halves:
        _plot_dependence_subset(
            feature,
            shap_matrix[row_mask],
            X[row_mask],
            interaction_feature,
            f"{title} ({half_label} of {interaction_feature})",
        )


# Example usage: each entry is (feature to plot, interaction feature used for
# the top/bottom split, plot title). The entries are plotted in order.
# Per this notebook's own comments, MTS is the Management Transparency Score and
# CAMI the Crisis Awareness and Management Index; ASS is an analyst-related
# score -- TODO confirm its exact name.
interaction_specs = [
    ("Time to maturity", "MTS", "Interaction: Time to Maturity × MTS"),
    ("Time to maturity", "CAMI", "Interaction: Time to Maturity × CAMI"),
    ("Default barrier", "ASS", "Interaction: Default Barrier × ASS"),
    ("Volume", "CAMI", "Interaction: Volume × CAMI"),
    ("Senior secured", "ASS", "Interaction: Senior Secured × ASS"),
    ("Equity value", "MTS", "Interaction: Equity value × MTS"),
]

for plotted_feature, split_feature, plot_title in interaction_specs:
    enhanced_dependence_plot_top_bottom_split(
        plotted_feature,
        shap_values,
        X_train,
        interaction_feature=split_feature,
        title=plot_title,
    )

In [None]:
# Heatmap layout: composite metrics on the y-axis, financial metrics on the x-axis.
composite_metrics = ['MTS', 'CAMI', 'ASS']
financial_metrics = ['Time to maturity', 'Senior secured', 'Volume', 'Equity value']

# Correlate all selected columns with each other, then keep only the
# composite-by-financial rectangle of the resulting matrix.
selected_columns = composite_metrics + financial_metrics
correlation_matrix_adjusted = (
    composite_df[selected_columns]
    .corr()
    .loc[composite_metrics, financial_metrics]
)

# Draw the annotated heatmap on an explicit 8x5 axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    correlation_matrix_adjusted,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation: Composite Metrics (Y) vs Financial Metrics (X)")
ax.set_xlabel("Financial Metrics")
ax.set_ylabel("Composite Metrics")
plt.show()



In [None]:
# One row per (Date, call_ID) with the three composite scores. The assign chain
# builds a fresh frame, so no columns are written onto a filtered slice of
# `composite_df` (which can raise SettingWithCopyWarning).
graph_df = (
    composite_df[['Date', 'MTS', 'CAMI', 'ASS', 'call_ID', 'Ddate']]
    .drop_duplicates(subset=['Date', 'call_ID'])
    .assign(
        Ddate=lambda frame: pd.to_datetime(frame['Ddate']),
        Date=lambda frame: pd.to_datetime(frame['Date']),
    )
    # Whole days from `Date` to `Ddate` (presumably call date to default date --
    # TODO confirm the meaning of both columns).
    .assign(DateDiff=lambda frame: (frame['Ddate'] - frame['Date']).dt.days)
)

# Divide into groups. The bins are right-closed: (-1, 7], (7, 30], (30, 90] and
# (90, inf). The last edge is open-ended so that gaps longer than 1000 days
# stay in the '>90' group instead of silently becoming NaN.
# Negative gaps (Date after Ddate) still fall outside every bin and are
# therefore excluded from the averages below.
date_bin_edges = [-1, 7, 30, 90, float("inf")]
graph_df['DateGroup'] = pd.cut(
    graph_df['DateDiff'],
    bins=date_bin_edges,
    labels=['<7', '<30', '<90', '>90'],
)

# Average scores for each group. observed=False keeps all four groups as rows
# even when one is empty, and makes the categorical group-by behaviour explicit
# instead of relying on the deprecated implicit default.
group_scores = graph_df.groupby('DateGroup', observed=False)[['MTS', 'CAMI', 'ASS']].mean()

# Plot the average scores for each group.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(group_scores, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Average MTS, CAMI, ASS Scores by Date Group")
plt.show()