In [None]:
import pandas as pd

In [None]:
'''If executed in Google Colab, uncomment the following lines'''
# Optional Colab setup (disabled by default).
# Step 1: mount Google Drive inside the Colab runtime.
#from google.colab import drive
#drive.mount('/content/drive')

# Step 2: change into the project folder so the relative 'transcripts/...' and
# 'data/...' paths used by the rest of the notebook resolve.
#import os
#os.chdir('/content/drive/MyDrive/LLM_CreditorRRPrediction')

In [None]:
# Presentation transcripts and Q&A transcripts come from two pipe-delimited files
transcript = pd.read_csv('transcripts/transcripts.csv', sep='|')
qna = pd.read_csv('transcripts/QnA.csv', sep='|')

# Join the two on 'filename' (inner join, the pandas default). The rename relies on
# both inputs carrying a 'transcript' column, which the merge suffixes with
# _x (left / presentation file) and _y (right / Q&A file).
df = (
    transcript
    .merge(qna[['transcript', 'filename']], on='filename')
    .rename(columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'})
)

In [None]:
mapping = pd.read_csv('data/mapping.csv')

# 'AllNames' repeats, on every row, the comma-separated list of all
# 'Transcript_Mapping' values that share that row's 'RR_CompanyName'
mapping['AllNames'] = (
    mapping
    .groupby('RR_CompanyName')['Transcript_Mapping']
    .transform(', '.join)
)
mapping.head()

In [None]:
# Load recovery rates and attach them to the preprocessed bond features.
#
# Both files are read without an index column, so each frame gets a default
# positional index and the column copy below pairs row i of RR_Bonds.csv with
# row i of preprocessed_bond_data.csv.
# NOTE(review): this assumes both files list the same bonds in the same order - confirm.
RR_COLUMNS = ['RR', 'Ddate', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']

rr = pd.read_csv('data/RR_Bonds.csv')
preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Column assignment aligns on the index: with differing row counts pandas would
# silently fill NaN (or drop the surplus rows) instead of failing, so check first.
if len(rr) != len(preprocessed_df):
    raise ValueError(
        f"Row count mismatch: RR_Bonds.csv has {len(rr)} rows but "
        f"preprocessed_bond_data.csv has {len(preprocessed_df)}; "
        "the files cannot be combined row by row."
    )

# Copy the recovery-rate columns over, in the same order as before so that the
# resulting column layout is unchanged
for column in RR_COLUMNS:
    preprocessed_df[column] = rr[column]

# From here on `rr` is the combined frame (same object as preprocessed_df)
rr = preprocessed_df

# Parse the default date; values that cannot be parsed become NaT
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')
rr.head()

In [None]:
# Attach the mapping rows to each bond: rr.CompanyName is matched against
# mapping.RR_CompanyName (inner join, so bonds without a mapping row are dropped)
rr = pd.merge(rr, mapping, left_on='CompanyName', right_on='RR_CompanyName')

In [None]:
'''Get last earnings call before default'''

# Pair every bond row with every transcript whose 'Company' equals the bond's
# 'Transcript_Mapping'
merged_df = rr.merge(df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

# Both date columns as datetimes, then the time from the call to the default
merged_df = merged_df.assign(
    Date=pd.to_datetime(merged_df['Date']),
    Ddate=pd.to_datetime(merged_df['Ddate']),
)
merged_df = merged_df.assign(t_delta=merged_df['Ddate'] - merged_df['Date'])

# Keep only calls held on or before the default date
merged_df = merged_df.loc[merged_df['Ddate'] >= merged_df['Date']]
# Per CUSIP, keep the row with the latest call date
merged_df = merged_df.sort_values('Date').groupby('CUSIP').tail(1)

print(merged_df['CompanyName'].value_counts())

merged_df = merged_df.reset_index(drop=True)
# Rows sharing the same (Date, CompanyName) pair get the same call_ID
merged_df['call_ID'] = merged_df.groupby(['Date', 'CompanyName']).ngroup()

print(merged_df['call_ID'].nunique())

In [None]:
# Write merged_df to a pipe-delimited CSV without the row index
merged_df.to_csv('transcripts/credit_df.csv', sep='|', index=False)

In [None]:
# Reduce to the call-level columns, drop repeated rows and order by call_ID
aggregated_df = (
    merged_df[['call_ID', 'presentation', 'QnA', 'CompanyName', 'Ddate']]
    .drop_duplicates()
    .sort_values('call_ID')
    .reset_index(drop=True)
)

# Write the aggregated table to a pipe-delimited CSV without the row index
aggregated_df.to_csv('transcripts/aggregated_credit_df.csv', sep='|', index=False)

In [None]:
'''Get first earnings call after default'''
'''Within first 30 days'''

# Pair every bond row with every transcript whose 'Company' equals the bond's
# 'Transcript_Mapping'
merged_df = rr.merge(df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

# Both date columns as datetimes, then the time from the default to the call
merged_df = merged_df.assign(
    Date=pd.to_datetime(merged_df['Date']),
    Ddate=pd.to_datetime(merged_df['Ddate']),
)
merged_df = merged_df.assign(t_delta=merged_df['Date'] - merged_df['Ddate'])

# Keep only calls held strictly after the default date and at most 30 days after it
merged_df = merged_df.loc[
    (merged_df['Ddate'] < merged_df['Date'])
    & (merged_df['t_delta'] <= pd.Timedelta(days=30))
]
# Per CUSIP, keep the row with the earliest call date
merged_df = merged_df.sort_values('Date').groupby('CUSIP').head(1)

print(merged_df['CompanyName'].value_counts())

merged_df = merged_df.reset_index(drop=True)
# Rows sharing the same (Date, CompanyName) pair get the same call_ID
merged_df['call_ID'] = merged_df.groupby(['Date', 'CompanyName']).ngroup()

print(merged_df['call_ID'].nunique())

In [None]:
# Write merged_df to a pipe-delimited CSV without the row index
merged_df.to_csv('transcripts/post_credit_df.csv', sep='|', index=False)

In [None]:
# Reduce to the call-level columns, drop repeated rows and order by call_ID
aggregated_df = (
    merged_df[['call_ID', 'presentation', 'QnA', 'CompanyName', 'Ddate']]
    .drop_duplicates()
    .sort_values('call_ID')
    .reset_index(drop=True)
)

# Write the aggregated table to a pipe-delimited CSV without the row index
aggregated_df.to_csv('transcripts/post_aggregated_credit_df.csv', sep='|', index=False)

In [14]:
'''Data Exploration'''
# Read the pipe-delimited credit_df.csv back into merged_df
merged_df = pd.read_csv('transcripts/credit_df.csv', sep='|')

In [15]:
# Convert both date columns to datetimes
merged_df = merged_df.assign(
    Date=pd.to_datetime(merged_df['Date']),
    Ddate=pd.to_datetime(merged_df['Ddate']),
)
# Time from the earnings call to the default date
merged_df = merged_df.assign(t_delta=merged_df['Ddate'] - merged_df['Date'])

# Keep only rows whose call lies at most 180 days before the default date
merged_df = merged_df.loc[merged_df['t_delta'] <= pd.Timedelta(days=180)]

In [None]:
# Print the number of distinct companies, bonds (CUSIPs) and earnings calls:
# the label goes on one line, the count on the next
for label, column in (
    ('Unique Companies', 'CompanyName'),
    ('Unique Bonds', 'CUSIP'),
    ('Unique Earnings Calls', 'call_ID'),
):
    print(label)
    print(merged_df[column].nunique())

In [17]:
# Count occurrences of each sector.
# Every name below is a column of merged_df; the stored value is that column's sum
# (presumably 0/1 indicator columns, so the sum is a row count - TODO confirm).
sector_dict = {
    sector: merged_df[sector].sum()
    for sector in (
        'Industrials',
        'Consumer Staples',
        'Financials',
        'Energy',
        'Health Care',
        'Utilities',
        'Information Technology',
        'Real Estate',
    )
}

In [None]:
# Show the per-sector column sums stored in sector_dict
print(sector_dict)

In [None]:
# Rows without any of the tracked sector flags, followed by a bar chart of all buckets.
# NOTE(review): the subtraction assumes each row sets at most one sector column - confirm.
import matplotlib.pyplot as plt

# Leave the 'None' bucket out of the subtotal: once this cell has run, sector_dict
# already contains 'None', and including it would turn the count into 0 on a re-run.
tracked_total = sum(count for name, count in sector_dict.items() if name != 'None')
sector_dict['None'] = len(merged_df) - tracked_total

# Print the value together with its label (previously only the label was shown)
print('Without Sector')
print(sector_dict['None'])

# create bar plot
plt.bar(list(sector_dict.keys()), list(sector_dict.values()))
plt.title('Sector Distribution')
# make it wider
plt.gcf().set_size_inches(16, 5)
plt.show()

In [None]:
# Column names collected under `financials`
# (the list is not referenced by any of the cells visible in this part of the notebook)
financials = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services',
    'Consumer Discretionary',
    'Senior secured',
    'Time to maturity',
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
    'RR',
]

# Summary statistics of the 'RR' column (displayed as the cell output)
merged_df['RR'].describe()

In [None]:
# Histogram of the 'RR' column using 20 bins
fig, ax = plt.subplots()
ax.hist(merged_df['RR'], bins=20)
ax.set_title('Distribution of Recovery Rates')
ax.set_xlabel('Recovery Rate')
ax.set_ylabel('Frequency')
plt.show()