In [134]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Add a constant to the model (intercept)
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
import matplotlib.patches as mpatches
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.dates as mdates

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
# Mount Google Drive so the project files are reachable from the Colab runtime.
#from google.colab import drive
#drive.mount('/content/drive')

# Change into the project folder so the relative paths used below
# (e.g. 'dealer/dealer_data.csv') resolve.
#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [64]:
# Label columns taken from the presentation analysis.
presentation_labels = [
    'negative_sentiment',
    'positive_sentiment',
    'uncertainty',
    'optimistic',
    'pessimistic',
    'vagueness',
    'language_accessibility_presentation',
    'liquidity_position',
    'debt_leverage_stress',
    'operational_trends',
    'industry_positioning',
    'asset_quality',
    'recovery_strategies',
    'legal_issues',
    'macroeconomic',
]

# Label columns taken from the Q&A analysis.
qna_labels = [
    'analyst_concerns',
    'responsiveness',
    'confidence',
    'evasiveness',
    'depth',
    'analyst_satisfaction',
    'language_accessibility_qna',
]

# Split of the Q&A labels into a management subset and an analyst subset.
qna_mgmt = [
    'responsiveness',
    'confidence',
    'depth',
    'evasiveness',
    'language_accessibility_qna',
]
qna_analysts = [
    'analyst_concerns',
    'analyst_satisfaction',
]

# GoEmotions labels used as feature columns.
# Labels that are commented out are currently excluded from the analysis.
emotions = [
    'admiration',
    #'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    #'disgust',
    'embarrassment',
    'excitement',
    'fear',
    #'gratitude',
    #'grief',
    'joy',
    #'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
]

# Analyst-side counterpart of every emotion column, e.g. 'fear' -> 'fear_analysts'.
# A comprehension keeps the loop variable out of the notebook's global namespace.
analyst_emotions = [f'{emotion}_analysts' for emotion in emotions]

# First group of supporting (non-text) feature columns.
supporting_features_1 = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services',
    'Consumer Discretionary',
    'Senior secured',
    'Time to maturity',
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Second group of supporting feature columns.
supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

# Column names of the dealer data set; the create_colored_chart helper colours
# these as financial data alongside the two groups above.
primary_dealer_features = [
    'HHI_number',
    'TimeToMaturity',
    'TimeSinceOffering',
    'Offering_amount',
    'Rating',
    'JunkDummy',
    'UnratedDummy',
    'Enhanced',
    'Redeemable',
    'SinkingFund',
    'BOND_COUPON',
    'IQ_CDS_availability',
    'COVENANTS',
    'AvgTransVol',
    'TRADES_VOL',
    'amihud_ILLIQ_trades',
    'price_dispersion_volumeweighted',
    'EquityValue',
    'DefaultBarrier2',
    'NumberEmployees',
    'IndDis1',
    'IndDis2',
    'Slope',
    'X90DayDR',
    'GDP',
    'TradeSize_Retail',
    'TradeSize_SmallInstitutional',
    'Seniority_SeniorSubordinate',
    'Seniority_SeniorUnsecured',
    'Seniority_SubordinateJunior',
    'Default_DefRating',
    'Default_Distressed_exchange',
    'Default_Liquidation_C7',
    'Default_RiskRating',
]

In [None]:
# Raw dealer data, read from a path relative to the working directory.
dealer_data = pd.read_csv('dealer/dealer_data.csv')

In [None]:
# Columns required from the raw dealer data: identifiers, the trade chain, and
# the covariates used for the 'PrimaryDealer' analysis.
columns_needed = [
    # General columns
    'Date', 'DealCSP', "RR_Price",
    # Trade chain
    'TRADE_CHAIN2', 'TRADE_CHAIN3', 'TRADE_CHAIN4', 'TRADE_CHAIN5', 'TRADE_CHAIN6', 'TRADE_CHAIN7', 'TRADE_CHAIN8', 'TRADE_CHAIN9', 'TRADE_CHAIN10',
    # Probability prediction
    "PrimaryDealer", "HHI_number", "TradeSize", "TimeToMaturity", "TimeSinceOffering",
    "Seniority", "Offering_amount", "Rating", "JunkDummy", "UnratedDummy", "Enhanced",
    "Redeemable", "SinkingFund", "BOND_COUPON", "IQ_CDS_availability", "COVENANTS",
    "Year", "AvgTransVol", "TRADES_VOL", "amihud_ILLIQ_trades", "price_dispersion_volumeweighted",
    "EquityValue", "DefaultBarrier2", "NumberEmployees", "IndDis1", "IndDis2",
    "Slope", "X90DayDR", "GDP", "Default"
]

# Restrict to the relevant columns and rename in one step:
#   DealCSP  -> CUSIP      (merge key used later)
#   RR_Price -> dealer_RR
# rename() without inplace returns a new DataFrame, so the assignment below does
# not write into a slice of dealer_data (which raised SettingWithCopyWarning).
dealer_df_limited = dealer_data[columns_needed].rename(
    columns={'DealCSP': 'CUSIP', 'RR_Price': 'dealer_RR'}
)

# Parse the trade timestamp and keep only the calendar date (time of day is ignored).
dealer_df_limited['Date'] = pd.to_datetime(dealer_df_limited['Date']).dt.date

In [67]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
categorical_cols = ["TradeSize", "Seniority", "Default"]
dealer_df_limited = pd.get_dummies(
    dealer_df_limited,
    columns=categorical_cols,
    drop_first=True,
)

# Coerce the continuous covariates to numeric; values that cannot be parsed become NaN.
numeric_cols = [
    "HHI_number", "TimeToMaturity", "TimeSinceOffering", "Offering_amount",
    "BOND_COUPON", "IQ_CDS_availability", "COVENANTS", "AvgTransVol", "TRADES_VOL",
    "amihud_ILLIQ_trades", "price_dispersion_volumeweighted", "EquityValue",
    "DefaultBarrier2", "NumberEmployees", "IndDis1", "IndDis2", "Slope",
    "X90DayDR", "GDP",
]
numeric_block = dealer_df_limited[numeric_cols]
dealer_df_limited[numeric_cols] = numeric_block.apply(pd.to_numeric, errors='coerce')

In [68]:
# LLM label outputs for the presentation and Q&A parts (comma-separated files).
presentation_analysis = pd.read_csv('transcripts/LLM_outputs_presentation.csv')
qna_analysis = pd.read_csv('transcripts/LLM_QnA_analysis.csv')

# GoEmotions summaries (pipe-delimited files): presentation, Q&A and analysts.
# They are merged into llm_output in the next cell.
emotions_presentation = pd.read_csv('goemotions/presentation_summary_final.csv', delimiter='|')
emotions_qna = pd.read_csv('goemotions/qna_summary_final.csv', delimiter='|')
emotions_analysts = pd.read_csv('goemotions/analysts_summary_final.csv', delimiter='|')

In [None]:
# False: presentation and Q&A emotion scores are summed into one min-max scaled
#        column per emotion.
# True:  they are kept apart as '<emotion>_presentation' and '<emotion>_qna'.
SEPARATE = False

# Give the two 'language_accessibility' columns distinct names before merging.
presentation_analysis = presentation_analysis.rename(
    columns={'language_accessibility': 'language_accessibility_presentation'}
)
qna_analysis = qna_analysis.rename(
    columns={'language_accessibility': 'language_accessibility_qna'}
)

# Reduce every auxiliary table to call_ID plus its feature columns and remove
# fully duplicated rows. Chaining (instead of inplace=True on a slice) avoids
# pandas' SettingWithCopyWarning.
qna_analysis = qna_analysis[["call_ID"] + qna_labels].drop_duplicates(keep='first')
emotions_presentation = emotions_presentation[["call_ID"] + emotions].drop_duplicates(keep='first')
emotions_qna = emotions_qna[["call_ID"] + emotions].drop_duplicates(keep='first')
emotions_analysts = emotions_analysts[["call_ID"] + emotions].drop_duplicates(keep='first')

# Suffix the analyst columns explicitly before merging. Relying on the automatic
# '_x'/'_y' merge suffixes only worked for SEPARATE = False; with SEPARATE = True
# it labelled the Q&A columns as '<emotion>_analysts' and produced duplicate
# '<emotion>' columns.
emotions_analysts_suffixed = emotions_analysts.rename(
    columns={emotion: f'{emotion}_analysts' for emotion in emotions}
)

# Attach Q&A labels and the two management emotion tables to the presentation labels.
llm_output = pd.merge(presentation_analysis, qna_analysis, on='call_ID', how='left')
llm_output = pd.merge(llm_output, emotions_presentation, on='call_ID', how='left')
# After this merge '<emotion>_x' is the presentation score and '<emotion>_y' the Q&A score.
llm_output = pd.merge(llm_output, emotions_qna, on='call_ID', how='left')

if SEPARATE:
    separate_names = {}
    for emotion in emotions:
        separate_names[f'{emotion}_x'] = f'{emotion}_presentation'
        separate_names[f'{emotion}_y'] = f'{emotion}_qna'
    llm_output = llm_output.rename(columns=separate_names)
else:
    # Sum presentation and Q&A scores into one management emotion column each ...
    for emotion in emotions:
        llm_output[emotion] = llm_output[f'{emotion}_x'] + llm_output[f'{emotion}_y']
    llm_output = llm_output.drop(
        columns=[f'{emotion}{suffix}' for emotion in emotions for suffix in ('_x', '_y')]
    )
    # ... and rescale every summed column to [0, 1]. MinMaxScaler works column by
    # column, so one fit over all columns equals one fit per column.
    scaler = MinMaxScaler()
    llm_output[emotions] = scaler.fit_transform(llm_output[emotions])

# Analyst emotions arrive already suffixed, so no column names collide here.
llm_output = pd.merge(llm_output, emotions_analysts_suffixed, on='call_ID', how='left')

llm_output.head()

In [70]:
# Parse the call date ('Date') and the default date ('Ddate'), then store the
# time between the two as a Timedelta column.
llm_output = llm_output.assign(
    Date=pd.to_datetime(llm_output['Date']),
    Ddate=pd.to_datetime(llm_output['Ddate']),
)
llm_output['t_delta'] = llm_output['Ddate'] - llm_output['Date']

# Keep calls held at most 180 days before the default date.
# Rows with a missing t_delta fail the comparison and are dropped as well.
max_gap = pd.Timedelta('180 days')
within_gap = llm_output['t_delta'] <= max_gap
llm_output = llm_output[within_gap]

In [110]:
# Inner join on CUSIP: keeps only trades whose bond also appears in llm_output.
# Both frames carry a 'Date' column, so pandas suffixes them to 'Date_x'
# (dealer/trade side) and 'Date_y' (llm_output/call side).
# NOTE(review): if llm_output holds several rows per CUSIP, every trade is
# repeated once per matching row — confirm this is intended.
dealer_df = pd.merge(dealer_df_limited, llm_output, on='CUSIP', how='inner')

In [111]:
# Replace the merge suffixes with descriptive names:
# Date_x is the trade date, Date_y the earnings-call date.
date_column_names = {'Date_x': 'Date_Trade', 'Date_y': 'Date_Call'}
dealer_df = dealer_df.rename(columns=date_column_names)

In [112]:
# Make sure all three date columns are datetime64.
# pd.to_datetime leaves columns that are already datetime unchanged, so it is
# applied unconditionally. The previous np.issubdtype guard raises a TypeError
# for pandas extension dtypes (e.g. timezone-aware datetimes).
# Values that cannot be parsed become NaT (errors='coerce').
for date_col in ('Date_Trade', 'Date_Call', 'Ddate'):
    dealer_df[date_col] = pd.to_datetime(dealer_df[date_col], errors='coerce')

# Order trades chronologically within each bond.
dealer_df = dealer_df.sort_values(['CUSIP', 'Date_Trade']).copy()

In [None]:
# Drop trades executed more than 30 days after the default date (Ddate).
# Rows with a missing date fail the comparison and are dropped as well.
print(len(dealer_df))
latest_allowed_trade = dealer_df['Ddate'] + pd.Timedelta('30 days')
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
dealer_df = dealer_df[dealer_df['Date_Trade'] <= latest_allowed_trade].copy()
print(len(dealer_df))

In [None]:
def flag_event_window(df, call_date_col='Date_Call', trade_date_col='Date_Trade', window=7):
    """
    Classify every row by where its trade date falls relative to its call date.

    Parameters:
        df (pd.DataFrame): Frame holding both datetime columns.
        call_date_col (str): Name of the earnings-call date column.
        trade_date_col (str): Name of the trade date column.
        window (int): Half-width of the event window in days.

    Returns:
        np.ndarray: Float array with one entry per row:
            -1.0 if the trade is 1..window days before the call,
             0.0 if it is on the call date,
             1.0 if it is 1..window days after the call,
             NaN otherwise (outside the window or a missing date).
    """
    day_offset = (df[trade_date_col] - df[call_date_col]).dt.days

    # Start with "outside the window" everywhere, then fill in the three cases.
    # The masks are mutually exclusive, so the assignment order does not matter.
    flags = np.full(len(df), np.nan)
    before_call = ((day_offset >= -window) & (day_offset < 0)).to_numpy()
    on_call_day = (day_offset == 0).to_numpy()
    after_call = ((day_offset <= window) & (day_offset > 0)).to_numpy()

    flags[before_call] = -1
    flags[on_call_day] = 0
    flags[after_call] = 1
    return flags

# Flag every trade for several window widths and report how many trades fall
# before, after and on the call date.
for t in [7, 14, 30]:
    flag_col = f'event_flag_{t}d'
    dealer_df[flag_col] = flag_event_window(dealer_df, 'Date_Call', 'Date_Trade', window=t)

    window_flags = dealer_df[flag_col]
    pre_event = dealer_df[window_flags == -1]
    post_event = dealer_df[window_flags == 1]
    on_event = dealer_df[window_flags == 0]

    print(f"\n\n{t}-day window:")
    print(f"Pre-event: {len(pre_event)}")
    print(f"Post-event: {len(post_event)}")
    print(f"On-event: {len(on_event)}")

In [None]:
# Signed distance in days of every trade to the earnings call and to the default date.
dealer_df['delta_days'] = (dealer_df['Date_Trade'] - dealer_df['Date_Call']).dt.days
dealer_df['delta_default'] = (dealer_df['Date_Trade'] - dealer_df['Ddate']).dt.days

# Number of trades for each day offset relative to the earnings call.
# This is a plain count, not an average; the variable name is kept for
# compatibility with other cells.
avg_trades = dealer_df.groupby('delta_days').size()
avg_trades.name = 'avg_trades'

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(avg_trades, label='Trades per Day', color='black')

# Mark the earnings-call date (offset 0).
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')

# The x-axis shows day offsets, not calendar dates, and the series is a count —
# axis label and title now say so.
ax.set_xlabel('Days relative to earnings call')
ax.set_xlim(-30, 30)
ax.set_ylabel('Number of Trades')
# NOTE(review): fixed upper limit; counts above 2000 would be clipped.
ax.set_ylim(0, 2000)
ax.set_title('Number of Trades per Day Relative to Earnings Call')
ax.legend()
plt.show()

In [None]:
# Share of trades executed by a primary dealer within 30 days after the call.
days_after_call = dealer_df['delta_days']
after_30 = dealer_df[(days_after_call > 0) & (days_after_call <= 30)]

# The mean of the boolean mask equals count/len but yields NaN instead of a
# ZeroDivisionError when the subset is empty.
print('Percentage Primary 30 days after call')
print((after_30['PrimaryDealer'] == 1).mean())

# Same share over all trades (typo in the printed label fixed).
print('Percentage Primary general')
print((dealer_df['PrimaryDealer'] == 1).mean())

In [117]:
# Make sure the call date is datetime (no-op if it already is).
dealer_df['Date_Call'] = pd.to_datetime(dealer_df['Date_Call'])

# Keep only trades on or after the earnings call (Date_Trade >= Date_Call).
# .copy() yields an independent frame so that later column assignments
# (e.g. 'transaction_chain_length') do not raise SettingWithCopyWarning.
dealer_df = dealer_df[dealer_df['Date_Trade'] >= dealer_df['Date_Call']].copy()

In [None]:
# Number of observations in the rolling mean. The legend label is derived from
# it, so label and computation cannot drift apart (the label used to say
# "3-day" while the window was 5).
ROLLING_WINDOW = 5

# Average dealer recovery rate for each day offset after the call.
daily_avg_rr = dealer_df.groupby(['delta_days'])['dealer_RR'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_avg_rr['delta_days'], daily_avg_rr['dealer_RR'], label='Daily Average RR', color='black')

# Rolling mean over consecutive rows of daily_avg_rr.
# NOTE(review): this equals a calendar-day window only if every day offset has
# at least one trade — confirm.
rolling_mean = daily_avg_rr['dealer_RR'].rolling(window=ROLLING_WINDOW).mean()
ax.plot(daily_avg_rr['delta_days'], rolling_mean, label=f'{ROLLING_WINDOW}-day Rolling Mean', color='grey')

# Mark the earnings-call date (offset 0).
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')
# The x-axis shows day offsets, not calendar dates.
ax.set_xlabel('Days since earnings call')
# Show offsets from just left of 0 up to 90.
ax.set_xlim(-0.2, 90)
# Vertical marker at offset 52, labelled as the mean time to default.
ax.axvline(52, color='black', linestyle='--', label='Mean Time to Default')
ax.set_ylabel('Average RR')
ax.set_title('Daily Average RR of all Bonds')
ax.legend()
plt.show()

In [119]:
# Columns TRADE_CHAIN2 ... TRADE_CHAIN10 hold the follow-up trades of a chain.
trade_chain_cols = [f'TRADE_CHAIN{step}' for step in range(2, 11)]

# Chain length = initial trade (the +1) plus the number of populated chain columns.
populated_links = dealer_df[trade_chain_cols].notna().sum(axis=1)
dealer_df['transaction_chain_length'] = populated_links + 1

In [120]:
def create_colored_chart(model, type=None):
    """
    Plot the largest statistically significant coefficients of a fitted model,
    coloured by feature family.

    Parameters:
        model: Fitted statsmodels results object; `pvalues`, `params` and `bse`
            are read as label-indexed pandas Series.
        type (str, optional): Name of the feature set, used to pick the legend
            entries. A 'Financial Data 1&2' prefix is treated like
            'Financial Data'. For unknown names or None the legend is built
            from the colours present in the chart. The name shadows the
            builtin but is kept for backward compatibility.

    Returns:
        The result of plt.show(), or None if no feature is significant.
    """
    # Coefficients with p < 0.05. The intercept is removed by name: slicing with
    # [1:] dropped the first *significant* entry, which is a real feature
    # whenever the intercept itself is not significant.
    significant_features = model.pvalues[model.pvalues < 0.05].index.drop('const', errors='ignore')
    importances = model.params[significant_features]

    if importances.empty:
        print('No significant features to plot.')
        return None

    # Keep the 15 largest coefficients by absolute value, sorted for plotting.
    top_features = importances.abs().sort_values(ascending=False).head(15).index
    importances = importances[top_features].sort_values()

    # Polarity of each emotion label; the '<emotion>_analysts' columns share it.
    polarity_colors = {
        'green': ['relief', 'desire', 'excitement', 'pride', 'gratitude', 'optimism',
                  'approval', 'caring', 'curiosity', 'surprise', 'admiration',
                  'realization', 'joy'],
        'gray': ['remorse', 'nervousness', 'confusion', 'annoyance', 'sadness',
                 'disappointment', 'embarrassment', 'disapproval'],
        'red': ['fear', 'anger'],
    }
    emotion_colors = {}
    for color, names in polarity_colors.items():
        for name in names:
            emotion_colors[name] = color
            emotion_colors[f'{name}_analysts'] = color

    financial_names = set(supporting_features_1) | set(supporting_features_2) | set(primary_dealer_features)
    llm_names = set(presentation_labels) | set(qna_labels)

    # One colour per bar: financial data, LLM labels, emotion polarity, else unknown.
    colors = []
    for feature in importances.index:
        if feature in financial_names:
            colors.append('blue')
        elif feature in llm_names:
            colors.append('orange')
        elif feature in emotion_colors:
            colors.append(emotion_colors[feature])
        else:
            colors.append('purple')

    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # Standard errors of the plotted coefficients as horizontal error bars.
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Significant Features and their Impact')

    # Legend entries per feature-set name (same content as the former if/elif chain).
    emotion_legend = ['green', 'gray', 'red']
    legend_colors_by_type = {
        'Management Emotions': emotion_legend,
        'Analyst Emotions': emotion_legend,
        'All Emotions': emotion_legend,
        'All Earnings call': ['orange'] + emotion_legend,
        'Financial Data & Presentation Labels': ['blue', 'orange'],
        'Financial Data & Q&A Labels': ['blue', 'orange'],
        'Financial Data & LLM Labels': ['blue', 'orange'],
        'Financial Data & All Emotions': ['blue'] + emotion_legend,
        'Financial Data & Management Emotions': ['blue'] + emotion_legend,
        'Financial Data & Analyst Emotions': ['blue'] + emotion_legend,
        'Financial Data & All Earnings call': ['blue', 'orange'] + emotion_legend,
        'New Metrics': ['blue', 'purple'],
    }
    legend_labels = {
        'blue': 'Financial Data',
        'orange': 'LLM Labels',
        'green': 'Positive Emotion',
        'gray': 'Negative Emotion',
        'red': 'Strong Negative Emotion',
        'purple': 'New Metrics',
    }

    # The feature-set names in use are prefixed 'Financial Data 1&2'; normalise
    # them so they match the table instead of silently producing no legend.
    chart_type = type.replace('Financial Data 1&2', 'Financial Data') if isinstance(type, str) else None
    if chart_type in legend_colors_by_type:
        legend_colors = legend_colors_by_type[chart_type]
    else:
        # Unknown or missing type: describe exactly the colours that were drawn.
        legend_labels = {**legend_labels, 'purple': 'Other'}
        legend_colors = [color for color in legend_labels if color in colors]

    handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in legend_colors]
    plt.legend(handles, [legend_labels[color] for color in legend_colors])

    return plt.show()


def create_significant_chart(model):
    """
    Plot the 10 largest coefficients (by absolute value) of a fitted model and
    highlight the statistically significant ones.

    Parameters:
        model: Fitted statsmodels results object; `pvalues`, `params` and `bse`
            are read as label-indexed pandas Series.

    Returns:
        The result of plt.show().

    Side effects:
        Stores the index of features with p < 0.05 on `model.significance`
        (kept from the original implementation).
    """
    model.significance = model.pvalues[model.pvalues < 0.05].index

    # Drop the intercept by name rather than by position, so a real feature is
    # never discarded when no 'const' column was added.
    importances = model.params.drop('const', errors='ignore')

    # Keep the 10 largest coefficients by absolute value, sorted for plotting.
    top_features = importances.abs().sort_values(ascending=False).head(10).index
    importances = importances[top_features].sort_values()

    significant_color = '#009682'
    other_color = 'grey'
    colors = [significant_color if feature in model.significance else other_color
              for feature in importances.index]

    plt.figure(figsize=(10, 5))
    plt.barh(importances.index, importances.values, color=colors)

    # Standard errors of the plotted coefficients as horizontal error bars.
    plt.errorbar(importances, importances.index, xerr=model.bse[importances.index], fmt='|', color='black')

    # Explicit legend handles: plt.legend(['Significant']) attached the label to
    # whichever bar was drawn first, even if that bar was grey.
    handles = [
        plt.Rectangle((0, 0), 1, 1, color=significant_color),
        plt.Rectangle((0, 0), 1, 1, color=other_color),
    ]
    plt.legend(handles, ['Significant', 'Not significant'])
    plt.xlabel('Impact')
    plt.ylabel('Features')
    plt.title('Features Impact on Recovery Rate')
    return plt.show()

In [133]:
# Snapshot of the merged trade/call data (pipe-delimited).
# The export line is left commented out; this cell reloads a previously exported
# file and replaces the in-memory dealer_df built in the cells above.
# NOTE(review): a fresh "Run All" needs the CSV to exist already, or the export
# line to be re-enabled once — confirm the intended workflow.
#dealer_df.to_csv('dealer/dealer_data_llm_output.csv', index=False, sep='|')
dealer_df = pd.read_csv('dealer/dealer_data_llm_output.csv', delimiter= '|')

In [None]:
# Descriptive statistics of the trade sample, split by dealer type.
primary_trades = dealer_df[dealer_df['PrimaryDealer'] == 1]
non_primary_trades = dealer_df[dealer_df['PrimaryDealer'] == 0]

# Sample size and share of trades done by a primary dealer.
print(f"Transactions: {len(dealer_df)}")
print(f"Primary Transactions: {len(primary_trades)/len(dealer_df)}")

# Number of distinct bonds, overall and among primary-dealer trades.
print(f"No Bonds: {len(dealer_df['CUSIP'].unique())}")
print(f"No Bonds traded by Primary: {len(primary_trades['CUSIP'].unique())}")

# Mean number of trades per bond.
trades_per_bond = dealer_df['CUSIP'].value_counts()
print(f"Avg trades per bond: {trades_per_bond.mean()}")

# Mean recovery rate: overall, primary dealers, non-primary dealers.
print(f"Avg RR: {dealer_df['dealer_RR'].mean()}")
print(f"Avg RR Primary: {primary_trades['dealer_RR'].mean()}")
print(f"Avg RR Non-Primary: {non_primary_trades['dealer_RR'].mean()}")

In [None]:
# Kernel density estimate of dealer_RR, one curve per dealer type.
plt.figure(figsize=(10, 6))

# (PrimaryDealer value, legend label, line colour), drawn in this order.
density_groups = [
    (1, 'Primary Dealer', '#009682'),
    (0, 'Non-Primary Dealer', 'grey'),
]
for dealer_flag, curve_label, curve_color in density_groups:
    group_rr = dealer_df[dealer_df['PrimaryDealer'] == dealer_flag]['dealer_RR']
    group_rr.plot.kde(label=curve_label, linewidth=2, color=curve_color)

plt.xlabel('Recovery Rate')
# Restrict the x-axis to 0..125.
plt.xlim(0, 125)
plt.ylabel('Density')
plt.title('Distribution of Recovery Rates by Dealer Type')
plt.legend()
plt.grid(True)

In [124]:
'''SET FEATURES'''

# The first three columns are the two targets (dealer_RR, transaction_chain_length)
# and the dealer flag; all model features follow. The scaling cell relies on this
# ordering (it scales columns[3:]).
leading_columns = ['dealer_RR', 'PrimaryDealer', 'transaction_chain_length']
model_feature_columns = (
    supporting_features_1
    + supporting_features_2
    + presentation_labels
    + qna_labels
    + emotions
    + analyst_emotions
)

# Select the columns from dealer_df and start from a clean 0..n-1 index.
selected_df = dealer_df[leading_columns + model_feature_columns].reset_index(drop=True)

In [125]:
# Min-max scale every feature column. The first three columns (dealer_RR,
# PrimaryDealer, transaction_chain_length) are left untouched.
scaler = MinMaxScaler()
columns_to_scale = selected_df.columns[3:]
scaled_values = scaler.fit_transform(selected_df[columns_to_scale])
selected_df[columns_to_scale] = scaled_values

In [126]:
# Building block shared by most feature sets.
financial_features = supporting_features_1 + supporting_features_2

# Maps a feature-set name to the list of columns used as regressors.
# Entries that are commented out are inactive alternatives.
feature_sets = {
    #'Financial Data 1': supporting_features_1,
    'Financial Data 1&2': list(financial_features),
    ## Earnings call features
    #'Presentation Labels': presentation_labels,
    #'Q&A Labels': qna_labels,
    #'LLM Labels': presentation_labels + qna_labels,
    #'Management Emotions': emotions,
    #'Analyst Emotions': analyst_emotions,
    #'All Emotions': emotions + analyst_emotions,
    'All Earnings call': presentation_labels + qna_labels + emotions + analyst_emotions,
    #'Management': emotions + presentation_labels + qna_mgmt,
    #'Analysts': analyst_emotions + qna_analysts,
    ## Earnings call features and financial data
    #'Financial Data 1&2 & Presentation Labels': financial_features + presentation_labels,
    #'Financial Data 1&2 & Q&A Labels': financial_features + qna_labels,
    #'Financial Data 1&2 & Management Emotions': financial_features + emotions,
    #'Financial Data 1&2 & Analyst Emotions': financial_features + analyst_emotions,
    #'Financial Data 1&2 & LLM Labels': financial_features + presentation_labels + qna_labels,
    #'Financial Data 1&2 & All Emotions': financial_features + emotions + analyst_emotions,
    'Financial Data 1&2 & All Earnings call': financial_features + presentation_labels + qna_labels + analyst_emotions + emotions,
    'Financial Data 1&2 & Management': financial_features + emotions + presentation_labels + qna_mgmt,
    'Financial Data 1&2 & Analysts': financial_features + analyst_emotions + qna_analysts,
}

In [None]:
'''Dealer Recovery Rate Prediction'''
'''PRIMARY DEALER'''

# One OLS regression of dealer_RR per feature set, on primary-dealer trades only.
for key, feature_columns in feature_sets.items():
    print(f"Feature Set: {key}")

    # Rows: primary dealers. Columns: target, dealer flag and this set's features.
    primary_rows = selected_df['PrimaryDealer'] == 1
    final_df = selected_df.loc[primary_rows, ['dealer_RR', 'PrimaryDealer'] + feature_columns]

    y = final_df['dealer_RR']
    # Regressors plus the intercept column added by statsmodels.
    X = sm.add_constant(final_df.drop(columns=['dealer_RR', 'PrimaryDealer']))

    model = sm.OLS(y, X).fit()

    # Coefficient table with p-values.
    print(model.summary())

    # In-sample fit: predictions are made on the rows used for fitting.
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

    create_significant_chart(model)
    create_colored_chart(model, key)

In [None]:
'''Dealer Recovery Rate Prediction'''
'''NON-PRIMARY DEALER'''

# One OLS regression of dealer_RR per feature set, on non-primary-dealer trades only.
for key, feature_columns in feature_sets.items():
    print(f"Feature Set: {key}")

    # Rows: non-primary dealers. Columns: target, dealer flag and this set's features.
    non_primary_rows = selected_df['PrimaryDealer'] == 0
    final_df = selected_df.loc[non_primary_rows, ['dealer_RR', 'PrimaryDealer'] + feature_columns]

    y = final_df['dealer_RR']
    # Regressors plus the intercept column added by statsmodels.
    X = sm.add_constant(final_df.drop(columns=['dealer_RR', 'PrimaryDealer']))

    model = sm.OLS(y, X).fit()

    # Coefficient table with p-values.
    print(model.summary())

    # In-sample fit: predictions are made on the rows used for fitting.
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

    create_significant_chart(model)
    create_colored_chart(model, key)

In [None]:
'''Transaction Chain Length Prediction'''
# Primary-dealer trades only; all feature columns of selected_df are regressors.
final_df = selected_df[selected_df['PrimaryDealer'] == 1]

# In-sample evaluation: the "test" data are the very rows used for fitting, so
# the metrics below describe goodness of fit, not out-of-sample performance.
y_train = y_test = final_df['transaction_chain_length']
X_train = sm.add_constant(
    final_df.drop(columns=['dealer_RR', 'PrimaryDealer', 'transaction_chain_length'])
)
X_test = X_train

# Fit the model
model = sm.OLS(y_train, X_train).fit()

# Coefficient table with p-values.
print(model.summary())

# Make predictions
y_pred = model.predict(X_test)

# Compute and print evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {np.sqrt(mse)}")
print(f"R-squared: {r2}")

create_significant_chart(model)
# create_colored_chart needs the feature-set name to build its legend; calling it
# without one raised a TypeError. This model uses financial data plus all
# earnings-call features, so the matching legend type is passed.
create_colored_chart(model, 'Financial Data & All Earnings call')

In [128]:
"""COMPOSITE METRICS"""

''''Trade Level Analysis'''

# Trade-level data: the exported merge of dealer trades and call features.
trade_df = pd.read_csv('dealer/dealer_data_llm_output.csv', delimiter='|')

# Min-max scale the financial feature columns; all other columns stay as loaded.
financial_columns = supporting_features_1 + supporting_features_2
scaler = MinMaxScaler()
trade_df[financial_columns] = scaler.fit_transform(trade_df[financial_columns])

def create_loadings(df, feature_groups, best, dealer=False):
    """
    Add one PCA-weighted score column per feature group and, optionally, the
    composite metrics MTS, CAMI and ASS.

    For every group the first principal component of its features is fitted; if
    its weights sum to a negative value the sign is flipped. The group score is
    the dot product of the (uncentred) feature values with these weights.

    Parameters:
        df (pd.DataFrame): Input frame with the feature columns. It is modified
            in place and also returned.
        feature_groups (dict): Group name -> list of feature column names.
        best (bool): If true, add 'MTS', 'CAMI' and 'ASS'. This requires the
            columns 'Openess', 'Missing_Transparency', 'Finance', 'General_Tone',
            'Mgmt_Emotions', 'Operations', 'Analyst_Confirmation',
            'Analyst_Positive' and 'Analyst_Negative' to exist.
        dealer (bool): If true, flip the sign of 'Analyst_Confirmation' before
            the composites are built.

    Returns:
        pd.DataFrame: The same frame with the score columns added.
    """
    for group_name, features in feature_groups.items():
        # Weights of the first principal component of this group.
        first_component = PCA(n_components=1).fit(df[features]).components_[0]

        # Orient the component so that its weights sum to a non-negative value.
        if sum(first_component) < 0:
            first_component = -first_component

        df[group_name] = np.dot(df[features], first_component)

    if dealer:
        # The sign of Analyst_Confirmation is reversed for the dealer data.
        df['Analyst_Confirmation'] = -df['Analyst_Confirmation']

    if not best:
        return df

    # Composite scores built from the group scores.
    df['MTS'] = df['Openess'] - df['Missing_Transparency']
    df['CAMI'] = df['Finance'] - df['General_Tone'] + df['Mgmt_Emotions'] - df['Operations']
    df['ASS'] = df['Analyst_Confirmation'] + df['Analyst_Positive'] - df['Analyst_Negative']
    return df

def build_score_overview(df, window=30):
    """
    Plot MTS, CAMI and ASS over time, both as individual points and as a rolling mean.

    Parameters:
        df (pd.DataFrame): Frame with the columns 'Date', 'MTS', 'CAMI', 'ASS'
            and 'call_ID'. It is not modified.
        window (int): Number of consecutive observations in the rolling mean
            (default 30, as before).

    Returns:
        The result of plt.show().
    """
    # One row per (Date, call_ID), with parsed dates, in chronological order.
    # Sorting is required: without it the rolling mean averages rows in
    # arbitrary order and the lines zig-zag across the time axis.
    graph_df = (
        df[['Date', 'MTS', 'CAMI', 'ASS', 'call_ID']]
        .drop_duplicates(subset=['Date', 'call_ID'])
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .sort_values('Date', kind='stable')
    )

    # Set Seaborn style for a cleaner look (global setting).
    sns.set(style="whitegrid")

    # Line style and colour per score.
    score_styles = {
        'MTS': ('-', 'steelblue'),
        'CAMI': ('--', 'darkorange'),
        'ASS': (':', 'seagreen'),
    }

    plt.figure(figsize=(14, 8))

    # Smoothed trends. NOTE: the window counts rows (one per call and date),
    # which equals calendar days only if there is exactly one row per day.
    for score, (linestyle, color) in score_styles.items():
        rolling_score = graph_df[score].rolling(window=window).mean()
        plt.plot(graph_df['Date'], rolling_score, label=f'{score} ({window}-day avg)',
                 linestyle=linestyle, color=color, linewidth=2)

    # Original data as lighter points.
    for score, (_, color) in score_styles.items():
        plt.scatter(graph_df['Date'], graph_df[score], color=color, alpha=0.4, s=10)

    # Major ticks every 3 months, formatted as year-month.
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Composite Score', fontsize=12)
    # The title now reports the window actually used (it said 7-day while 30 was used).
    plt.title(f'Trends of MTS, CAMI, and ASS Over Time (with {window}-day Rolling Average)', fontsize=14)
    plt.legend()

    # Rotate first so tight_layout can account for the rotated tick labels.
    plt.xticks(rotation=45)
    plt.tight_layout()
    return plt.show()

def build_heatmap_macro(df):
    """Show a heatmap of the correlations between the scores and macro indicators.

    Rows of the heatmap are 'MTS', 'CAMI' and 'ASS'; columns are the four
    economic indicator columns listed below. Exact duplicate rows (over Date,
    call_ID, the scores and the indicators) are removed before correlating.

    Returns the result of ``plt.show()``.
    """
    score_names = ['MTS', 'CAMI', 'ASS']
    economic_factors = [
        'CBOE DJIA Volatility Index',
        'NASDAQ 100 Index return',
        'Manufacturers inventories to sales ratio',
        '30 year conventional mortgage rate',
    ]

    # Date and call_ID take part in the de-duplication only.
    unique_rows = df[['Date', *score_names, 'call_ID', *economic_factors]].drop_duplicates()

    # Full correlation matrix first, then keep only the score-vs-indicator part.
    full_corr = unique_rows[score_names + economic_factors].corr()
    score_vs_macro = full_corr.loc[score_names, economic_factors]

    plt.figure(figsize=(8, 5))
    sns.heatmap(score_vs_macro, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Correlation of MTS, CAMI, ASS with Economic Indicators")
    return plt.show()

def build_heatmap_industries(df):
    """Show a heatmap of the mean MTS, CAMI and ASS score for each industry.

    `df` needs the three score columns, 'call_ID', and one indicator column per
    industry listed below; a row counts towards an industry when that column
    equals 1. Only the first row per call_ID is used.

    Returns the result of ``plt.show()``.
    """
    score_names = ['MTS', 'CAMI', 'ASS']
    industries = [
        'Industrials', 'Consumer Staples', 'Financials', 'Energy',
        'Health Care', 'Utilities', 'Information Technology', 'Real Estate',
    ]

    per_call = df[score_names + ['call_ID'] + industries].drop_duplicates(subset=['call_ID'])

    # One column per industry, one row per score.
    industry_scores_df = pd.DataFrame({
        industry: per_call.loc[per_call[industry] == 1, score_names].mean()
        for industry in industries
    })

    plt.figure(figsize=(10, 6))
    sns.heatmap(industry_scores_df, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Average MTS, CAMI, ASS Scores by Industry")
    return plt.show()

In [129]:
# Feature-group definitions used by the regression cells below.
# Each dict maps a group name to a list of feature column names. The dicts are
# handed to create_loadings(...), and the group names are afterwards selected
# as columns of the frame it returns (presumably one composite column per
# group -- confirm in create_loadings).
# NOTE(review): the key 'Openess' is spelled this way on purpose here; it is
# used as a column name downstream, so renaming it requires updating consumers.
best_features = {
    'Missing_Transparency': ['vagueness', 'evasiveness', 'responsiveness'],
    'Openess' : ['language_accessibility_presentation', 'language_accessibility_qna', 'depth', 'debt_leverage_stress', 'legal_issues'],
    'Finance' : ['liquidity_position', 'asset_quality'],
    'Operations' : ['operational_trends', 'industry_positioning'],
    'General_Tone': ['optimistic', 'positive_sentiment', 'uncertainty', 'macroeconomic'],
    'Mgmt_Emotions': ['excitement', 'admiration', 'confusion', 'relief'],
    'Analyst_Confirmation': ['analyst_concerns', 'analyst_satisfaction', 'remorse_analysts', 'disapproval_analysts'],
    'Analyst_Negative': ['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
    'Analyst_Positive': ['joy_analysts', 'optimism_analysts']
}

# Reduced set: three groups only.
most_insightful = {
    'Missing_Transparency': ['vagueness', 'evasiveness', 'positive_sentiment'],
    'Operations' : ['operational_trends', 'industry_positioning', 'legal_issues'],
    'Analyst_Confirmation': ['analyst_concerns', 'analyst_satisfaction', 'remorse_analysts', 'disapproval_analysts'],
}

# Same three groups as `most_insightful`, plus management and negative-analyst emotion groups.
most_insightful_emotion = {
    'Missing_Transparency': ['vagueness', 'evasiveness', 'positive_sentiment'],
    'Operations' : ['operational_trends', 'industry_positioning', 'legal_issues'],
    'Mgmt_Emotions': ['excitement', 'admiration', 'relief', 'confusion', 'optimistic'],
    'Analyst_Confirmation': ['analyst_concerns', 'analyst_satisfaction', 'remorse_analysts', 'disapproval_analysts'],
    'Analyst_Negative': ['sadness_analysts', 'embarrassment_analysts', 'anger_analysts'],
}

# Run name -> [feature_groups, flag].
# The cells below read element 0 as the feature groups and element 1 as a
# boolean that is passed on to create_loadings(...). When the flag is True the
# regressions use the 'MTS', 'CAMI' and 'ASS' columns; otherwise they use the
# group names of the feature-group dict as regressors.
analysis = {
    'best': [best_features, False],
    'best_composite': [best_features, True],
    'most_insightful': [most_insightful, False],
    'most_insightful_emotion': [most_insightful_emotion, False],
}

In [None]:
# Primary Dealer: OLS of dealer_RR on the supporting features plus the
# earnings-call scores, fitted on primary-dealer trades only.

# Set to True to additionally fit the same regression on the individual
# (ungrouped) features for comparison. Off by default: this block used to be
# disabled by wrapping it in a string literal.
RUN_ORIGINAL_FEATURES_COMPARISON = False

df = trade_df.copy()

for key in analysis:
    print('')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('')
    print(key)
    feature_groups, best = analysis[key]
    composite_df = create_loadings(df, feature_groups, best, True)

    # With the flag set the three scores are the regressors, otherwise one
    # column per feature group.
    score_columns = ['MTS', 'CAMI', 'ASS'] if best else list(feature_groups.keys())

    # (section header or None, regressor columns, chart title, draw industry heatmap?)
    model_specs = [(None, score_columns, 'New Metrics', best)]
    if RUN_ORIGINAL_FEATURES_COMPARISON:
        # Flattened without reusing `key`, so the outer loop variable is not rebound.
        individual_features = [feature for group in feature_groups.values() for feature in group]
        model_specs.append(
            ('Original Features', individual_features, 'All Earnings call & Financial Data', False)
        )

    for header, feature_columns, chart_title, show_heatmap in model_specs:
        if header is not None:
            print('')
            print('________________________________________________________________')
            print('')
            print(header)

        # Target, dealer flag, supporting features and the regressors of this spec.
        final_df = composite_df[['dealer_RR', 'PrimaryDealer']
                                + supporting_features_1
                                + supporting_features_2
                                + feature_columns]

        # Keep primary-dealer trades only (the original comment wrongly called
        # this "drop duplicates"), then renumber the remaining rows.
        final_df = final_df[final_df['PrimaryDealer'] == 1].reset_index(drop=True)

        y = final_df['dealer_RR']
        X = sm.add_constant(final_df.drop(columns=['dealer_RR', 'PrimaryDealer']))

        # Fit the model and print the summary (p-values, significance levels)
        model = sm.OLS(y, X).fit()
        print(model.summary())

        # In-sample predictions: the metrics below describe fit, not
        # out-of-sample performance.
        y_pred = model.predict(X)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {np.sqrt(mse)}")
        print(f"R-squared: {r2}")

        create_colored_chart(model, chart_title)
        if show_heatmap:
            chart_df = composite_df[composite_df['PrimaryDealer'] == 1]
            build_heatmap_industries(chart_df)

In [None]:
# NON Primary Dealer: OLS of dealer_RR on the supporting features plus the
# earnings-call scores, fitted on non-primary-dealer trades only.

# Set to True to additionally fit the same regression on the individual
# (ungrouped) features for comparison. Off by default: this block used to be
# disabled by wrapping it in a string literal.
RUN_ORIGINAL_FEATURES_COMPARISON = False

df = trade_df.copy()

for key in analysis:
    print('')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('')
    print(key)
    feature_groups, best = analysis[key]
    composite_df = create_loadings(df, feature_groups, best, True)

    # With the flag set the three scores are the regressors, otherwise one
    # column per feature group.
    score_columns = ['MTS', 'CAMI', 'ASS'] if best else list(feature_groups.keys())

    # (section header or None, regressor columns, chart title, draw industry heatmap?)
    model_specs = [(None, score_columns, 'New Metrics', best)]
    if RUN_ORIGINAL_FEATURES_COMPARISON:
        # Flattened without reusing `key`, so the outer loop variable is not rebound.
        individual_features = [feature for group in feature_groups.values() for feature in group]
        model_specs.append(
            ('Original Features', individual_features, 'All Earnings call & Financial Data', False)
        )

    for header, feature_columns, chart_title, show_heatmap in model_specs:
        if header is not None:
            print('')
            print('________________________________________________________________')
            print('')
            print(header)

        # Target, dealer flag, supporting features and the regressors of this spec.
        final_df = composite_df[['dealer_RR', 'PrimaryDealer']
                                + supporting_features_1
                                + supporting_features_2
                                + feature_columns]

        # Keep non-primary-dealer trades only (the original comment wrongly
        # called this "drop duplicates"), then renumber the remaining rows.
        final_df = final_df[final_df['PrimaryDealer'] == 0].reset_index(drop=True)

        y = final_df['dealer_RR']
        X = sm.add_constant(final_df.drop(columns=['dealer_RR', 'PrimaryDealer']))

        # Fit the model and print the summary (p-values, significance levels)
        model = sm.OLS(y, X).fit()
        print(model.summary())

        # In-sample predictions: the metrics below describe fit, not
        # out-of-sample performance.
        y_pred = model.predict(X)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {np.sqrt(mse)}")
        print(f"R-squared: {r2}")

        create_colored_chart(model, chart_title)
        if show_heatmap:
            chart_df = composite_df[composite_df['PrimaryDealer'] == 0]
            build_heatmap_industries(chart_df)

In [None]:
# Ignoring dealer type: OLS of dealer_RR on the supporting features plus the
# earnings-call scores over all trades, followed by the same regression on the
# individual (ungrouped) features for comparison.
df = trade_df.copy()

for key in analysis:
    print('')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('')
    print(key)
    feature_groups, best = analysis[key]
    composite_df = create_loadings(df, feature_groups, best, True)

    # With the flag set the three scores are the regressors, otherwise one
    # column per feature group.
    score_columns = ['MTS', 'CAMI', 'ASS'] if best else list(feature_groups.keys())

    # All individual features of the groups, flattened. Built without reusing
    # `key`, so the outer loop variable is no longer rebound inside its own body.
    individual_features = [feature for group in feature_groups.values() for feature in group]

    # (section header or None, regressor columns, chart title, draw industry heatmap?)
    model_specs = [
        (None, score_columns, 'New Metrics', best),
        ('Original Features', individual_features, 'All Earnings call & Financial Data', False),
    ]

    for header, feature_columns, chart_title, show_heatmap in model_specs:
        if header is not None:
            print('')
            print('________________________________________________________________')
            print('')
            print(header)

        # Target, dealer flag, supporting features and the regressors of this spec.
        final_df = composite_df[['dealer_RR', 'PrimaryDealer']
                                + supporting_features_1
                                + supporting_features_2
                                + feature_columns]
        final_df = final_df.reset_index(drop=True)

        y = final_df['dealer_RR']
        # PrimaryDealer is selected above but not used as a regressor.
        X = sm.add_constant(final_df.drop(columns=['dealer_RR', 'PrimaryDealer']))

        # Fit the model and print the summary (p-values, significance levels)
        model = sm.OLS(y, X).fit()
        print(model.summary())

        # In-sample predictions: the metrics below describe fit, not
        # out-of-sample performance.
        y_pred = model.predict(X)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {np.sqrt(mse)}")
        print(f"R-squared: {r2}")

        create_colored_chart(model, chart_title)
        if show_heatmap:
            chart_df = composite_df
            build_heatmap_industries(chart_df)

In [None]:
# Calculate absolute changes in recovery rate for each metric.
# For every metric and dealer type, compare the mean dealer_RR of
#   * the top / bottom 20% of trades ranked by the metric, and
#   * the trades at least one standard deviation above / below the metric mean
# with the mean dealer_RR of all trades of that dealer type.
results = []
for key in analysis:
    # Only these two feature sets are summarised here.
    if key not in ('best_composite', 'most_insightful'):
        continue

    feature_groups, composite = analysis[key]
    df = trade_df.copy()
    composite_df = create_loadings(df, feature_groups, composite)

    # Split by dealer type
    primary_df = composite_df[composite_df['PrimaryDealer'] == 1]
    non_primary_df = composite_df[composite_df['PrimaryDealer'] == 0]
    df_dict = {
        'Primary': primary_df,
        'Non-Primary': non_primary_df
    }

    # Composite runs are evaluated on the three scores, the others on one
    # column per feature group.
    metrics = ['MTS', 'CAMI', 'ASS'] if composite else list(feature_groups)

    for df_key, subset in df_dict.items():
        for m in metrics:
            # Ascending sort; pandas places NaN metric values last.
            data_sorted = subset.sort_values(by=m)
            metric_values = data_sorted[m]

            # Number of rows that make up 20% of the data.
            n = len(data_sorted)
            tail_size = int(n * 0.2)

            # head()/tail() return an empty frame for a size of 0. The former
            # `iloc[-0:]` returned *every* row when n < 5, silently reporting
            # the overall mean as the "top 20%".
            top_20_percent = data_sorted.tail(tail_size)
            bottom_20_percent = data_sorted.head(tail_size)

            average_rr = data_sorted['dealer_RR'].mean()
            average_rr_top_20 = top_20_percent['dealer_RR'].mean()
            average_rr_bottom_20 = bottom_20_percent['dealer_RR'].mean()

            # One-standard-deviation thresholds around the metric mean
            mean_metric = metric_values.mean()
            std_dev = metric_values.std()
            upper_threshold = mean_metric + std_dev
            lower_threshold = mean_metric - std_dev

            # Rows beyond the thresholds. If none qualify, fall back to the
            # rows holding the extreme metric value. (The former fallback,
            # `data_sorted.max()` / `.min()`, took column-wise extremes and so
            # reported the largest / smallest dealer_RR of the whole subset,
            # unrelated to the metric.)
            upper_data = data_sorted[metric_values >= upper_threshold]
            if upper_data.empty:
                upper_data = data_sorted[metric_values == metric_values.max()]
            lower_data = data_sorted[metric_values <= lower_threshold]
            if lower_data.empty:
                lower_data = data_sorted[metric_values == metric_values.min()]

            average_rr_upper = upper_data['dealer_RR'].mean()
            average_rr_lower = lower_data['dealer_RR'].mean()

            # Store the changes relative to the overall mean RR
            results.append({
                "Metric": f'{m}_{df_key}',
                "Top 20% Change in RR (%)": average_rr_top_20 - average_rr,
                "Bottom 20% Change in RR (%)": average_rr_bottom_20 - average_rr,
                "1 Std Above Mean Change in RR (%)": average_rr_upper - average_rr,
                "1 Std Below Mean Change in RR (%)": average_rr_lower - average_rr
            })

# Convert results to DataFrame and display
change_df = pd.DataFrame(results)
change_df

In [None]:
# Primary Dealer Classification
# Is the probability of being a primary dealer related to earnings call features?
#
# NOTE: `dealer_df` must already be loaded by an earlier cell, e.g. via
#   pd.read_csv('dealer/dealer_data_llm_output.csv', delimiter='|')

# Share of trades per CUSIP that are primary-dealer trades (sum / non-null
# count), computed without temporary helper columns.
primary_by_cusip = dealer_df.groupby('CUSIP')['PrimaryDealer']
dealer_df['percentage_primary_dealer'] = (
    primary_by_cusip.transform('sum') / primary_by_cusip.transform('count')
)

# show distribution of the new feature
dealer_df['percentage_primary_dealer'].hist(bins=50)

# Binary version: 1 when more than half of a CUSIP's trades are primary-dealer trades.
dealer_df['by_primary_dealer'] = (dealer_df['percentage_primary_dealer'] > 0.5).astype(int)

# Drop Rating_y column and rename Rating_x to Rating.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
dealer_df.drop(columns=['Rating_y'], inplace=True, errors='ignore')
dealer_df.rename(columns={'Rating_x': 'Rating'}, inplace=True)

feature_sets = {'Financial Data': primary_dealer_features,
                ## Earnings call features
                'Presentation Labels': presentation_labels,
                'Q&A Labels': qna_labels,
                'Management Emotions': emotions,
                'Analyst Emotions': analyst_emotions,
                'LLM Labels': presentation_labels + qna_labels,
                'All Emotions': emotions + analyst_emotions,
                'All Earnings call': presentation_labels + qna_labels + emotions + analyst_emotions,
                ## Earnings call features and financial data
                'Financial Data & Management': primary_dealer_features + presentation_labels + emotions + qna_mgmt,
                'Financial Data & Analysts': primary_dealer_features + qna_analysts + analyst_emotions,
                'Financial Data & All Earnings call': primary_dealer_features + presentation_labels + qna_labels + emotions + analyst_emotions
                }

# scale primary dealer features to [0, 1]
scaler = MinMaxScaler()
dealer_df[primary_dealer_features] = scaler.fit_transform(dealer_df[primary_dealer_features])

for key in feature_sets:

    # Target plus the features of this set, with NaN replaced by 0.
    # fillna() returns a new frame, so nothing is written into a selection of
    # dealer_df (the former inplace call triggered SettingWithCopyWarning).
    final_df = dealer_df[['PrimaryDealer'] + feature_sets[key]].fillna(0)

    # drop single value columns
    final_df = final_df.loc[:, final_df.nunique() != 1]

    # Drop X90DayDR when present. errors='ignore' replaces the former bare
    # `except: pass`, which would also have hidden unrelated failures.
    final_df = final_df.drop(columns=['X90DayDR'], errors='ignore')

    # Define features and target variable
    X = final_df.drop(columns='PrimaryDealer')
    y = final_df['PrimaryDealer']

    # NOTE(review): the model is fitted without an intercept (sm.add_constant
    # is not applied, as in the original) -- confirm this is intended.
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Print model summary to view p-values, coefficients, etc.
    print(f"Features: {key}")
    print(result.summary())

    # In-sample predicted probabilities (same rows the model was fitted on),
    # converted to class labels with a 0.5 threshold.
    y_pred_prob = result.predict(X)
    y_pred = (y_pred_prob >= 0.5).astype(int)

    # Compute and print evaluation metrics
    accuracy = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_pred_prob)
    true_positives = np.sum((y == 1) & (y_pred == 1))
    predicted_positives = np.sum(y_pred == 1)
    actual_positives = np.sum(y == 1)
    # Report NaN explicitly instead of dividing by zero when a class is
    # never predicted / never present.
    precision = true_positives / predicted_positives if predicted_positives else float('nan')
    recall = true_positives / actual_positives if actual_positives else float('nan')
    print(f"Accuracy: {accuracy}")
    print(f"AUC: {auc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1Score {f1_score(y, y_pred)}")

    create_colored_chart(result, key)

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Logistic regression of the PrimaryDealer flag on the earnings-call scores
# (or feature-group columns) for every entry of `analysis`.
for key in analysis:

    print('')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('________________________________________________________________')
    print('')
    print(key)
    feature_groups, best = analysis[key]
    composite_df = create_loadings(dealer_df, feature_groups, best, True)

    # With the flag set the three scores are the regressors, otherwise one
    # column per feature group. The financial features are deliberately left out.
    score_columns = ['MTS', 'CAMI', 'ASS'] if best else list(feature_groups.keys())

    # fillna() returns a new frame, so nothing is written into a selection of
    # composite_df (the former inplace call triggered SettingWithCopyWarning).
    final_df = composite_df[['PrimaryDealer'] + score_columns].fillna(0)

    # drop single value columns
    final_df = final_df.loc[:, final_df.nunique() != 1]

    # Drop X90DayDR when present. errors='ignore' replaces the former bare
    # `except: pass`, which would also have hidden unrelated failures.
    final_df = final_df.drop(columns=['X90DayDR'], errors='ignore')

    # Define features and target variable
    X = final_df.drop(columns='PrimaryDealer')
    y = final_df['PrimaryDealer']

    # NOTE(review): the model is fitted without an intercept (sm.add_constant
    # is not applied, as in the original) -- confirm this is intended.
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Print model summary to view p-values, coefficients, etc.
    print(f"Features: {key}")
    print(result.summary())

    # In-sample predicted probabilities (same rows the model was fitted on),
    # converted to class labels with a 0.5 threshold.
    y_pred_prob = result.predict(X)
    y_pred = (y_pred_prob >= 0.5).astype(int)

    # Compute and print evaluation metrics
    accuracy = accuracy_score(y, y_pred)
    auc = roc_auc_score(y, y_pred_prob)
    true_positives = np.sum((y == 1) & (y_pred == 1))
    predicted_positives = np.sum(y_pred == 1)
    actual_positives = np.sum(y == 1)
    # Report NaN explicitly instead of dividing by zero when a class is
    # never predicted / never present.
    precision = true_positives / predicted_positives if predicted_positives else float('nan')
    recall = true_positives / actual_positives if actual_positives else float('nan')
    print(f"Accuracy: {accuracy}")
    print(f"AUC: {auc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1Score {f1_score(y, y_pred)}")

    # print matrix
    print(f"Confusion Matrix: {confusion_matrix(y, y_pred)}")

    create_colored_chart(result, key)

In [None]:
# apply 