In [2]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.dates as mdates

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
# Optional Colab setup, disabled by default. When enabled it mounts Google Drive and
# changes the working directory, so the relative paths used by the read/write cells
# below ('dealer/...', 'transcripts/...') resolve against that folder.
# NOTE(review): the Drive path is user-specific — confirm it before enabling.
#from google.colab import drive
#drive.mount('/content/drive')

#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [4]:
# Column names of the scores attached to the presentation part of a call.
presentation_labels = [
    'negative_sentiment',
    'positive_sentiment',
    'uncertainty',
    'optimistic',
    'pessimistic',
    'vagueness',
    'language_accessibility_presentation',
    'liquidity_position',
    'debt_leverage_stress',
    'operational_trends',
    'industry_positioning',
    'asset_quality',
    'recovery_strategies',
    'legal_issues',
    'macroeconomic',
]

# Column names of the scores attached to the Q&A part of a call.
qna_labels = [
    'analyst_concerns',
    'responsiveness',
    'confidence',
    'evasiveness',
    'depth',
    'analyst_satisfaction',
    'language_accessibility_qna',
]

# The Q&A labels again, split into two subsets (named for management and analysts).
qna_mgmt = [
    'responsiveness',
    'confidence',
    'depth',
    'evasiveness',
    'language_accessibility_qna',
]
qna_analysts = [
    'analyst_concerns',
    'analyst_satisfaction',
]

# Emotion column names used in this notebook.
# Deliberately disabled labels (kept here for reference):
#   'amusement', 'disgust', 'gratitude', 'grief', 'love'
emotions = [
    'admiration', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'embarrassment', 'excitement', 'fear', 'joy', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise',
]

# Same labels with an '_analysts' suffix, in the same order.
analyst_emotions = [f'{emotion}_analysts' for emotion in emotions]

# Four market/macro series followed by ten sector column names.
macro_industry = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services',
    'Consumer Discretionary',
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Columns pulled from the dealer data as model features. This list is extended
# in later cells with the Seniority dummy column names.
primary_dealer_features = [
    'TimeToMaturity',
    'TimeSinceOffering',
    'Offering_amount',
    'SinkingFund',
    'BOND_COUPON',
    'IQ_CDS_availability',
    'AvgTransVol',
    'TotalVolume',
    'TRADES_VOL',
    'EquityValue',
    'DefaultBarrier2',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
    'NumberEmployees',
    'IndDis1',
    'IndDis2',
]

In [None]:
# Read the dealer data CSV; the path is relative to the current working directory.
dealer_data = pd.read_csv(filepath_or_buffer='dealer/dealer_data.csv')

In [None]:
# Columns taken from the raw dealer data in addition to `primary_dealer_features`:
# date, identifier and outcome columns first, then the trade-chain columns.
columns_needed = [
    'Date', 'DealCSP', "RR_Price", "PrimaryDealer", "Seniority", "C2D_Dealer",
    # Trade chain
    'TRADE_CHAIN2', 'TRADE_CHAIN3', 'TRADE_CHAIN4', 'TRADE_CHAIN5', 'TRADE_CHAIN6',
    'TRADE_CHAIN7', 'TRADE_CHAIN8', 'TRADE_CHAIN9', 'TRADE_CHAIN10',
]

# Build the working frame as one chain. `.rename()` and `.assign()` each return a
# new DataFrame, so nothing is written into a slice of `dealer_data`. The previous
# `inplace=True` renames and column assignments on the slice raise
# SettingWithCopyWarning and are not guaranteed to take effect.
dealer_df_limited = (
    dealer_data[columns_needed + primary_dealer_features]
    # DealCSP -> CUSIP (the key the later merge joins on), RR_Price -> dealer_RR
    .rename(columns={'DealCSP': 'CUSIP', 'RR_Price': 'dealer_RR'})
    # Parse the trade date and ignore the time of day. `.dt.date` yields Python
    # `date` objects (object dtype); a later cell converts the merged column
    # back to datetime64.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.date)
)

In [7]:
# Convert categorical columns to dummy variables (first level dropped as the baseline).
categorical_cols = ["Seniority"]
dealer_df_limited = pd.get_dummies(dealer_df_limited, columns=categorical_cols, drop_first=True)

# The resulting Seniority_* column names are added to `primary_dealer_features`
# one by one in the next cell. The former
# `primary_dealer_features.append([...])` here inserted the whole list as a
# single nested element, which breaks any use of the feature list as a column
# selector (`columns_needed + primary_dealer_features`), so it was removed.

In [8]:
# Register the Seniority dummy columns as features.
# First drop any non-string entry (e.g. a whole list appended as one element) so
# the feature list stays a flat list of column names.
primary_dealer_features[:] = [feat for feat in primary_dealer_features if isinstance(feat, str)]

for dummy_col in ('Seniority_SeniorSubordinate', 'Seniority_SeniorUnsecured', 'Seniority_SubordinateJunior'):
    # Guarded so re-running this cell does not add the same column twice
    # (duplicate names would select duplicate columns later).
    if dummy_col not in primary_dealer_features:
        primary_dealer_features.append(dummy_col)

In [9]:
# Read the pipe-delimited LLM output file (path relative to the working directory).
llm_output = pd.read_csv('transcripts/LLM_outputs_final.csv', sep='|')

In [10]:
# Keep the date/identifier columns plus every label, macro/industry and emotion column.
llm_id_cols = ['Date', 'Ddate', 'CUSIP', 'RR', 'call_ID']
llm_feature_cols = presentation_labels + qna_labels + macro_industry + emotions + analyst_emotions

# `.copy()` makes the selection an independent frame: the following cell assigns
# columns on `llm_output`, which on a plain slice raises SettingWithCopyWarning.
llm_output = llm_output[llm_id_cols + llm_feature_cols].copy()

In [11]:
# Parse both date columns and derive t_delta = Ddate - Date.
# `.assign()` returns a new frame, so no value is written into a possible slice
# (avoids SettingWithCopyWarning). The stray bare `llm_output` expression that sat
# in the middle of this cell was a no-op (only a cell's last expression is
# displayed) and has been removed.
llm_output = (
    llm_output
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Ddate=lambda frame: pd.to_datetime(frame['Ddate']),
    )
    .assign(t_delta=lambda frame: frame['Ddate'] - frame['Date'])
)

# Drop all rows with t_delta > 180 days. Rows whose t_delta is NaT fail the
# comparison and are dropped as well.
# NOTE(review): negative t_delta (Date after Ddate) passes this filter — confirm
# that is intended.
llm_output = llm_output[llm_output['t_delta'] <= pd.Timedelta('180 days')].copy()

In [12]:
# Inner join on CUSIP. The 'Date' column exists on both sides, so pandas suffixes
# it as Date_x (left, dealer frame) and Date_y (right, LLM output).
dealer_df = dealer_df_limited.merge(right=llm_output, how='inner', on='CUSIP')

In [None]:
"""DATA EXPLORATION"""

In [45]:
# Give the two merge-suffixed date columns descriptive names:
# Date_x (dealer side) -> Date_Trade, Date_y (LLM-output side) -> Date_Call.
merge_date_names = {'Date_x': 'Date_Trade', 'Date_y': 'Date_Call'}
dealer_df = dealer_df.rename(columns=merge_date_names)

In [46]:
# Make sure the three date columns are datetime64 before any date arithmetic.
# `pd.api.types.is_datetime64_any_dtype` is used rather than `np.issubdtype`,
# which raises TypeError on pandas extension dtypes (tz-aware datetimes, string
# dtype). Unparseable values become NaT (`errors='coerce'`).
for date_col in ('Date_Trade', 'Date_Call', 'Ddate'):
    if not pd.api.types.is_datetime64_any_dtype(dealer_df[date_col]):
        dealer_df[date_col] = pd.to_datetime(dealer_df[date_col], errors='coerce')

# Order rows by bond and trade date; `.copy()` keeps the result an independent frame.
dealer_df = dealer_df.sort_values(['CUSIP', 'Date_Trade']).copy()

In [None]:
# Drop all trades later than Ddate + 30 days; the two prints show the row count
# before and after the filter.
print(len(dealer_df))
latest_allowed_trade = dealer_df['Ddate'] + pd.Timedelta('30 days')
# `.copy()` detaches the filtered rows from the parent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
dealer_df = dealer_df[dealer_df['Date_Trade'] <= latest_allowed_trade].copy()
print(len(dealer_df))

In [None]:
# Time between the call and Ddate for every row, and its mean over all rows.
call_to_default = dealer_df['Ddate'].sub(dealer_df['Date_Call'])
dealer_df['call_default'] = call_to_default
print(dealer_df['call_default'].mean())

# Same mean once rows with a gap of exactly 84 days are left out.
# NOTE(review): 84 days is a hard-coded value; its origin is not visible here.
excluded_gap = pd.Timedelta('84 days')
test_df = dealer_df.loc[dealer_df['call_default'].ne(excluded_gap)]
print(test_df['call_default'].mean())

In [None]:
# Distinct CUSIPs among the rows whose call-to-Ddate gap is exactly 84 days.
gap_is_84_days = dealer_df['call_default'].eq(pd.Timedelta('84 days'))
test_df = dealer_df.loc[gap_is_84_days]
test_df['CUSIP'].unique()

In [None]:
def flag_event_window(df, call_date_col='Date_Call', trade_date_col='Date_Trade', window=7):
    """
    Label each row by where its trade date falls relative to its call date.

    The distance is measured in whole calendar days: both columns are normalised
    to midnight before subtracting. Without that, `.dt.days` floors the raw
    timestamp difference, so a trade stamped 00:00 on the call day would count
    as one day *before* a call stamped 10:00 the same day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both date columns as datetime64 (the `.dt` accessor is used).
    call_date_col : str
        Name of the call-date column.
    trade_date_col : str
        Name of the trade-date column.
    window : int
        Half-width of the event window in days (inclusive on both sides).

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row: -1.0 if the trade is 1..window days
        before the call, 0.0 if on the call date, 1.0 if 1..window days after,
        and NaN otherwise (outside the window, or either date missing).
    """
    trade_day = df[trade_date_col].dt.normalize()
    call_day = df[call_date_col].dt.normalize()
    delta = (trade_day - call_day).dt.days

    conditions = [
        (delta >= -window) & (delta < 0),   # pre-event window
        delta == 0,                         # event date
        (delta > 0) & (delta <= window),    # post-event window
    ]
    choices = [-1, 0, 1]
    # The NaN default forces a float result even though the choices are integers.
    return np.select(conditions, choices, default=np.nan)

# Flag every row for several window lengths and report, per window, how many
# observations fall before, after and on the call date.
for t in (7, 14, 30):
    flag_col = f'event_flag_{t}d'
    dealer_df[flag_col] = flag_event_window(
        dealer_df, call_date_col='Date_Call', trade_date_col='Date_Trade', window=t
    )

    # Split by flag value: -1 = before the call, 1 = after, 0 = on the call date.
    window_flags = dealer_df[flag_col]
    pre_event = dealer_df[window_flags == -1]
    post_event = dealer_df[window_flags == 1]
    on_event = dealer_df[window_flags == 0]

    # One print with the four report lines joined by newlines.
    report_lines = [
        f"\n\n{t}-day window:",
        f"Pre-event: {len(pre_event)}",
        f"Post-event: {len(post_event)}",
        f"On-event: {len(on_event)}",
    ]
    print('\n'.join(report_lines))

In [None]:
# Signed distance of each trade from its earnings call, in whole calendar days.
# Both dates are normalised to midnight first: `.dt.days` floors a raw timestamp
# difference, so a call carrying a time of day would otherwise push same-day
# trades (stamped at midnight) to -1.
dealer_df['delta_days'] = (
    dealer_df['Date_Trade'].dt.normalize() - dealer_df['Date_Call'].dt.normalize()
).dt.days

# Number of trades per day offset relative to the earnings call (raw counts).
avg_trades = dealer_df.groupby('delta_days').size()
avg_trades.name = 'avg_trades'

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(avg_trades, label='Trades per Day', color='black')

# Dotted vertical line at x=0 marks the call date.
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')
ax.set_xlabel('Days from Earnings Call Date')
# Show 30 days on either side of the call.
ax.set_xlim(-30, 30)
ax.set_ylabel('Number of Trades')
# NOTE(review): fixed y-range; counts above 2000 are clipped from view.
ax.set_ylim(0, 2000)
# NOTE(review): the title says "Average" but the series is a plain count per day.
ax.set_title('Average Number of Trades per Day')
ax.legend()
plt.show()

In [None]:
# Signed distance of each trade from the default date (Ddate), in whole calendar
# days. Both dates are normalised to midnight first: `.dt.days` floors a raw
# timestamp difference, so a Ddate carrying a time of day would otherwise push
# same-day trades (stamped at midnight) to -1.
dealer_df['delta_default'] = (
    dealer_df['Date_Trade'].dt.normalize() - dealer_df['Ddate'].dt.normalize()
).dt.days

# Number of trades per day offset relative to the default date (raw counts).
avg_trades = dealer_df.groupby('delta_default').size()
avg_trades.name = 'avg_trades'

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(avg_trades, label='Trades per Day', color='black')

# Dotted vertical line at x=0 marks the default date.
ax.axvline(0, color='red', linestyle='--', label='Default Date')
ax.set_xlabel('Days from Default Date')
# Show 30 days on either side of the default date.
ax.set_xlim(-30, 30)
ax.set_ylabel('Number of Trades')
# NOTE(review): fixed y-range; counts above 20000 are clipped from view.
ax.set_ylim(0, 20000)
# NOTE(review): the title says "Average" but the series is a plain count per day.
ax.set_title('Average Number of Trades per Day')
ax.legend()
plt.show()

In [None]:
# Share of trades flagged PrimaryDealer == 1 among trades 1..30 days after the
# call, compared with the same share over all trades. Values are fractions (0-1).
days_after_call = dealer_df['delta_days']
after_30 = dealer_df.loc[days_after_call.gt(0) & days_after_call.le(30)]

primary_after_30 = after_30[after_30['PrimaryDealer'] == 1]
print('Percentage Primary 30 days after call')
print(len(primary_after_30) / len(after_30))

primary_overall = dealer_df[dealer_df['PrimaryDealer'] == 1]
print('Precentage Primary general')
print(len(primary_overall) / len(dealer_df))

In [54]:
# Make sure Date_Call is datetime64 (a no-op when it already is).
dealer_df['Date_Call'] = pd.to_datetime(dealer_df['Date_Call'])

# Keep only trades on or after the call, i.e. drop rows where Date_Trade < Date_Call.
# `.copy()` detaches the filtered rows so that columns added in later cells
# (e.g. transaction_chain_length) do not raise SettingWithCopyWarning.
# NOTE(review): raw timestamps are compared; if Date_Call carries a time of day,
# trades stamped at midnight on the call day are dropped — confirm that is intended.
dealer_df = dealer_df[dealer_df['Date_Trade'] >= dealer_df['Date_Call']].copy()

In [None]:
# Mean dealer_RR for every day offset relative to the call.
daily_avg_rr = dealer_df.groupby(['delta_days'])['dealer_RR'].mean().reset_index()
day_offsets = daily_avg_rr['delta_days']

# Smooth over three consecutive rows (i.e. three observed day offsets).
rolling_mean = daily_avg_rr['dealer_RR'].rolling(window=3).mean()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(day_offsets, rolling_mean, label='3-day Rolling Mean RR', color='grey')

# Reference lines: the call date at x=0 and a hard-coded marker at day 52.
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')
ax.axvline(52, color='black', linestyle='--', label='Mean Time to Default')

# NOTE(review): the x axis holds day offsets from the call, although it is labelled 'Date'.
ax.set_xlabel('Date')
ax.set_ylabel('Average RR')
ax.set_title('Daily Average RR of all Bonds')
# Visible range: just left of the call date up to 120 days after it.
ax.set_xlim(-0.3, 120)
ax.legend()
plt.show()

In [None]:
# Mean dealer_RR for every day offset relative to the call.
daily_avg_rr = dealer_df.groupby(['delta_days'])['dealer_RR'].mean().reset_index()
day_offsets = daily_avg_rr['delta_days']
mean_rr = daily_avg_rr['dealer_RR']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(day_offsets, mean_rr, label='Daily Average RR', color='black')

# Reference lines: the call date at x=0 and a hard-coded marker at day 52.
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')
ax.axvline(52, color='black', linestyle='--', label='Mean Time to Default')

# NOTE(review): the x axis holds day offsets from the call, although it is labelled 'Date'.
ax.set_xlabel('Date')
ax.set_ylabel('Average RR')
ax.set_title('Daily Average RR of all Bonds')
# Visible range: just left of the call date up to 120 days after it.
ax.set_xlim(-0.3, 120)
ax.legend()
plt.show()

In [None]:
# Mean dealer_RR for every day offset relative to the call.
daily_avg_rr = dealer_df.groupby(['delta_days'])['dealer_RR'].mean().reset_index()
day_offsets = daily_avg_rr['delta_days']
mean_rr = daily_avg_rr['dealer_RR']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(day_offsets, mean_rr, label='Daily Average RR', color='black')

# Overlay a smoothed series: mean over three consecutive rows (observed day offsets).
rolling_mean = mean_rr.rolling(window=3).mean()
ax.plot(day_offsets, rolling_mean, label='3-day Rolling Mean RR', color='grey')

# Reference lines: the call date at x=0 and a hard-coded marker at day 52.
ax.axvline(0, color='red', linestyle='--', label='Earnings Call Date')
ax.axvline(52, color='black', linestyle='--', label='Mean Time to Default')

# NOTE(review): the x axis holds day offsets from the call, although it is labelled 'Date'.
ax.set_xlabel('Date')
ax.set_ylabel('Average RR')
ax.set_title('Daily Average RR of all Bonds')
# Visible range: just left of the call date up to 120 days after it.
ax.set_xlim(-0.3, 120)
ax.legend()
plt.show()

In [None]:
# Mean dealer_RR for every day offset relative to the default date.
daily_avg_rr = dealer_df.groupby(['delta_default'])['dealer_RR'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_avg_rr['delta_default'], daily_avg_rr['dealer_RR'], label='Daily Average RR', color='black')

# Smoothed overlay. The legend text is derived from the window size so the two
# cannot drift apart (the label used to read "3-day" while the window was 5).
rolling_window = 5
rolling_mean = daily_avg_rr['dealer_RR'].rolling(window=rolling_window).mean()
ax.plot(daily_avg_rr['delta_default'], rolling_mean, label=f'{rolling_window}-day Rolling Mean', color='grey')

# NOTE(review): the x axis holds day offsets from the default date, although it is labelled 'Date'.
ax.set_xlabel('Date')
# Show 30 days on either side of the default date.
ax.set_xlim(-30, 30)
# Dotted vertical line at x=0 marks the default date.
ax.axvline(0, color='black', linestyle='--', label='Default Date')
ax.set_ylabel('Average RR')
ax.set_title('Daily Average RR of all Bonds')
ax.legend()
plt.show()

In [None]:
# Number of trades per CUSIP that fall 1..7 days after the call (event_flag_7d == 1),
# sorted from most to fewest.
# The flag is signed (-1 before, 0 on, 1 after the call), so summing it is a
# trade count only when no pre-event rows are present. Counting `== 1`
# explicitly gives the post-call count regardless of earlier filtering.
trades_7d = (
    dealer_df['event_flag_7d'].eq(1)
    .groupby(dealer_df['CUSIP'])
    .sum()
    .rename('trades_7d')
    .reset_index()
    .sort_values('trades_7d', ascending=False)
)
trades_7d

In [None]:
# Attach the per-CUSIP 7-day trade count to every row, then keep the bonds with
# more than 10 such trades.
high_trades = dealer_df.merge(trades_7d, how='inner', on='CUSIP')
high_trades = high_trades.loc[high_trades['trades_7d'].gt(10)]

# Number of distinct calls left after the filter.
high_trades['call_ID'].nunique()

In [None]:
# Show the call date and Ddate of the high-activity rows.
high_trades.loc[:, ['Date_Call', 'Ddate']]

In [None]:
# Rows traded strictly after the call and strictly before the default date;
# the cell output is the number of distinct calls among them (not the row count).
before_default = dealer_df['delta_default'].lt(0)
after_call = dealer_df['delta_days'].gt(0)
trades_between = dealer_df.loc[before_default & after_call]
trades_between['call_ID'].nunique()

In [28]:
# Trade-chain columns TRADE_CHAIN2 ... TRADE_CHAIN10
trade_chain_cols = [f'TRADE_CHAIN{step}' for step in range(2, 11)]

# Chain length = number of non-missing chain columns in the row, plus one for
# the initial trade.
chain_links_present = dealer_df[trade_chain_cols].notna().sum(axis=1)
dealer_df['transaction_chain_length'] = chain_links_present + 1

In [29]:
# Write the merged frame to a pipe-delimited CSV without the index column.
dealer_df.to_csv(path_or_buf='dealer/dealer_data_llm_output.csv', sep='|', index=False)

In [None]:
# Descriptive summary of the merged data, split by the PrimaryDealer flag.
n_trades = len(dealer_df)
primary_trades = dealer_df[dealer_df['PrimaryDealer'] == 1]
non_primary_trades = dealer_df[dealer_df['PrimaryDealer'] == 0]

# Row count and share of rows flagged as primary dealer (a fraction, 0-1).
print(f"Transactions: {n_trades}")
print(f"Primary Transactions: {len(primary_trades)/n_trades}")

# Distinct bonds overall and among primary-dealer rows.
print(f"No Bonds: {len(dealer_df['CUSIP'].unique())}")
print(f"No Bonds traded by Primary: {len(primary_trades['CUSIP'].unique())}")

# Mean number of rows per bond.
print(f"Avg trades per bond: {dealer_df['CUSIP'].value_counts().mean()}")

# Mean recovery rate: overall, primary dealer, non-primary dealer.
print(f"Avg RR: {dealer_df['dealer_RR'].mean()}")
print(f"Avg RR Primary: {primary_trades['dealer_RR'].mean()}")
print(f"Avg RR Non-Primary: {non_primary_trades['dealer_RR'].mean()}")

In [None]:
# Kernel density estimate of dealer_RR, one curve per PrimaryDealer value.
plt.figure(figsize=(10, 6))

# (PrimaryDealer value, legend label, line colour) — primary first, then non-primary.
for dealer_flag, curve_label, curve_color in (
    (1, 'Primary Dealer', '#009682'),
    (0, 'Non-Primary Dealer', 'grey'),
):
    subset_rr = dealer_df.loc[dealer_df['PrimaryDealer'] == dealer_flag, 'dealer_RR']
    subset_rr.plot.kde(label=curve_label, linewidth=2, color=curve_color)

plt.xlabel('Recovery Rate')
plt.ylabel('Density')
# Restrict the x axis to 0-125.
plt.xlim(0, 125)
plt.title('Distribution of Recovery Rates by Dealer Type')
plt.legend()