In [1]:
import gensim
from gensim.models import Word2Vec

from openai import OpenAI, RateLimitError
import pandas as pd
import numpy as np

from dotenv import load_dotenv
import os
import re

from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

import nltk
from nltk.stem.snowball import SnowballStemmer


In [2]:
transcript = pd.read_csv('transcripts/transcripts.csv', delimiter='|')
qna =  pd.read_csv('transcripts/QnA.csv', delimiter='|')

# Merge the two dataframes
df = pd.merge(transcript, qna[['transcript','filename']], on='filename')

# rename transcript_x to presentation and transcript_y to QnA
df = df.rename(columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'})

In [3]:
mapping = pd.read_csv('data/mapping.csv')

# create a new column 'AllNames' that concatenates all versions of 'Company' for a 'CompanyName'
mapping['AllNames'] = mapping.groupby('RR_CompanyName')['Transcript_Mapping'].transform(lambda x: ', '.join(x))
mapping.head()

Unnamed: 0,RR_CompanyName,Transcript_Mapping,AllNames
0,"1-800-FLOWERS.COM, Inc.",1-800-Flowers.com Inc.,1-800-Flowers.com Inc.
1,3M Company,3M Company,"3M Company, 3M Co."
2,3M Company,3M Co.,"3M Company, 3M Co."
3,A.M. Castle & Co.,A. M. Castle Co.,"A. M. Castle Co., A.M. Castle Co., AM Castle..."
4,A.M. Castle & Co.,A.M. Castle Co.,"A. M. Castle Co., A.M. Castle Co., AM Castle..."


In [4]:
# Load recovery rates
rr = pd.read_csv('data/RR_Bonds.csv')
rr = rr[['Ddate', 'RR', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']]

preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Add rr columns to preprocessed_df on index
preprocessed_df['RR'] = rr['RR']
preprocessed_df['Ddate'] = rr['Ddate']
preprocessed_df['CompanyName'] = rr['CompanyName']
preprocessed_df['CUSIP'] = rr['CUSIP']
preprocessed_df['LTDIssuance2'] = rr['LTDIssuance2']
preprocessed_df['Intangibility'] = rr['Intangibility']
preprocessed_df['Receivables1'] = rr['Receivables1']

rr = preprocessed_df

# Convert 'Date' column to datetime
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')
rr.head()

Unnamed: 0,RR,ActIndustryDistress1,ActIndustryDistress2,Senior secured,Senior unsecured,Senior subordinated,Subordinated \& Junior,Equity value,Default barrier,Net income margin,...,Russell 2000 Price Index return,Russell 2000 Vol 1m,Wilshire US Small-Cap Price Index,Wilshire Small Cap Vol,Ddate,CompanyName,CUSIP,LTDIssuance2,Intangibility,Receivables1
0,0.18901,0,0,0,1,0,0,1.28712,0.258205,-0.776257,...,0.01903,21.04,3056.03,808.357714,2004-01-01,Bethlehem Steel Corp.,087509AL9,0.467834468,0.058009127,0.029416454
1,20.553472,0,1,0,1,0,0,-135.215,1.269706,-0.564199,...,0.01903,21.04,3137.1,974.74921,2004-05-01,"T-Mobile US, Inc.",45071TAD7,0.0,0.200428895,0.032214499
2,54.315958,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AC5,0.0,0.005146611,0.032214499
3,54.79887,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AD3,0.0,0.005146611,0.029416454
4,56.666288,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AG6,0.0,0.005146611,0.029416454


In [5]:
# merge rr with mapping on CompanyName and RR_CompanyName
rr = rr.merge(mapping, left_on='CompanyName', right_on='RR_CompanyName')

# join with df on Company and Transcripts_Mapping
merged_df = rr.merge(df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

Ally Financial Inc.               10317
CIT Group Inc.                    10185
Lehman Brothers Holdings, Inc.     2853
Charter Communications, Inc.       2144
Sempra Energy                      1147
                                  ...  
Frontier Group Holdings, Inc.         1
Dayton Superior Corporation           1
Franklin Bank Corp.                   1
Kellwood Company, LLC                 1
Turning Point Brands, Inc.            1
Name: CompanyName, Length: 210, dtype: int64


In [6]:
# Ensure the columns are in datetime format
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df['Ddate'] = pd.to_datetime(merged_df['Ddate'])

# Compute the difference in days
merged_df['diff'] = (merged_df['Ddate'] - merged_df['Date']).dt.days

merged_df = merged_df[merged_df['Ddate']>merged_df['Date']]
merged_df = merged_df.sort_values(by='Date').groupby(['CUSIP']).tail(1)

print(merged_df['CompanyName'].value_counts())

Lehman Brothers Holdings, Inc.      317
CIT Group Inc.                      291
Charter Communications, Inc.         28
Ford Motor Company                   19
iStar Inc.                           17
                                   ... 
Centrus Energy Corp.                  1
Education Management Corporation      1
Venoco, Inc.                          1
Exelon Corporation                    1
Kellwood Company, LLC                 1
Name: CompanyName, Length: 159, dtype: int64


In [7]:
# Initialize stemmer
stemmer = SnowballStemmer("english")

# Keywords for each credit factor
credit_keywords = {
    'Profitability': ['revenue', 'cost', 'profit', 'earnings', 'margins', 'performance', 'income', 'loss', 'decline', 'decrease', 'outlook', 'guidance'],
    'Liquidity': ['cash', 'liquidity', 'credit', 'flow', 'operations', 'expenditures', 'free cash', 'working capital', 'insolvency', 'crunch', 'flexibility', 'funding'],
    'Leverage': ['debt', 'leverage', 'refinancing', 'reduction', 'interest', 'coverage', 'repayments', 'compliance', 'rating', 'default', 'restructuring'],
    'Operating': ['sales', 'market share', 'efficiency', 'cost', 'position', 'conditions', 'production', 'challenges', 'decline', 'improvement'],
    'Market': ['stock', 'market', 'investor', 'volatility', 'shareholder', 'confidence', 'buybacks', 'dilution', 'perception'],
    'Management': ['management', 'strategic', 'restructuring', 'strategy', 'adaptability', 'leadership', 'initiatives', 'governance', 'organizational', 'CEO', 'board']
}

# Function to identify sections with potential bankruptcy indicators
def identify_bankruptcy_indicators(transcript, keywords):
    sentences = nltk.sent_tokenize(transcript)
    indicator_sentences = []
    for sentence in sentences:
        for key in keywords:
            if any(re.search(r'\b' + re.escape(word) + r'\b', sentence, re.IGNORECASE) for word in keywords[key]):
                indicator_sentences.append(sentence)
                break

    return ' '.join(indicator_sentences)

# Function to clean text by stemming and replacing numbers with magnitude tokens
def clean_text(text):
    # Replace numbers with tokens
    text = re.sub(r'\b\d+(\.\d+)?\s?(billion|bln)\b', 'bln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+(\.\d+)?\s?(million|mln)\b', 'mln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d{1,3}(,\d{3})*(\.\d+)?\b', 'num', text)  # Replace remaining numbers with 'num'
    
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize, stem, and rejoin
    words = nltk.word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words if len(word) > 1]  # Remove single characters that might be noise
    return ' '.join(stemmed_words)

# Transform transcript to lowercase
merged_df['presentation'] = merged_df['presentation'].str.lower()
merged_df['QnA'] = merged_df['QnA'].str.lower()

# Apply function to identify bankruptcy indicators
merged_df['presentation_cleaned'] = merged_df['presentation'].apply(lambda x: identify_bankruptcy_indicators(x, credit_keywords))
merged_df['QnA_cleaned'] = merged_df['QnA'].apply(lambda x: identify_bankruptcy_indicators(x, credit_keywords))

# Apply function to clean text
merged_df['presentation'] = merged_df['presentation'].apply(clean_text)
merged_df['QnA'] = merged_df['QnA'].apply(clean_text)

merged_df['presentation_cleaned'] = merged_df['presentation_cleaned'].apply(clean_text)
merged_df['QnA_cleaned'] = merged_df['QnA_cleaned'].apply(clean_text)

# reset index
merged_df.reset_index(drop=True, inplace=True)

In [8]:
# add a number to each transcript based on the 196 unique transcripts
merged_df.reset_index(drop=True, inplace=True)
merged_df['transcript_number'] = merged_df['presentation'].factorize()[0]

number_transcript = merged_df[['transcript_number', 'presentation', 'QnA', 'presentation_cleaned', 'QnA_cleaned']].drop_duplicates().sort_values('transcript_number')

number_transcript.head()

Unnamed: 0,transcript_number,presentation,QnA,presentation_cleaned,QnA_cleaned
0,0,present oper good morn ladi and gentlemen welc...,question and answer oper oper instruct our fir...,oper instruct would now like to turn the call ...,question and answer oper oper instruct our fir...
1,1,present oper ladi and gentlemen thank you for ...,question and answer oper oper instruct thank y...,as remind this confer is be record today tuesd...,your gross margin were all abov expect and how...
2,2,present oper greet ladi and gentlemen and welc...,question and answer oper thank you ladi and ge...,present oper greet ladi and gentlemen and welc...,guess just to clarifi on the casm guidanc so i...
3,3,present oper thank you all parti for stand by ...,question and answer oper oper instruct our fir...,with me today are dick bond our presid and ceo...,dian geissler merril lynch just want to sort o...
4,4,present oper thank you for join the silgan hol...,question and answer oper oper instruct well ta...,present oper thank you for join the silgan hol...,georg stapho banc of america secur guess first...


In [9]:
#Create Embeddings for each presentation
def get_embedding(words):
    # Compute the mean of the Word2Vec embeddings for the words in the sentence
    embedding = np.mean([w2v_model.wv[word] for word in words if word in w2v_model.wv], axis=0)
    
    # If the embedding is not computed (due to no valid words), return a zero vector
    if isinstance(embedding, np.ndarray):
        return embedding
    else:
        return np.zeros(w2v_model.vector_size)
    
sentences_dict = {
    'presentation': number_transcript['presentation'].tolist(),
    'QnA': number_transcript['QnA'].tolist(),
    'presentation_cleaned': number_transcript['presentation_cleaned'].tolist(),
    'QnA_cleaned': number_transcript['QnA_cleaned'].tolist()
}
    
for key in sentences_dict:
    sentences = sentences_dict[key]
    # Initialize the Word2Vec model
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

    # Train the model
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10)

    # Apply the get_embedding function to each row
    number_transcript[key+'_embedding'] = number_transcript[key].apply(get_embedding)

In [11]:
# merge output_df with transcripts on 'transcript_number' and index
merged_df = pd.merge(merged_df,
                          number_transcript[['transcript_number', 'presentation_embedding', 'QnA_embedding', 'presentation_cleaned_embedding', 'QnA_cleaned_embedding']],  
                          on='transcript_number',
                          how='left')

checkpoint = merged_df.copy()

In [22]:
# List of selected supporting features
import numpy as np


supporting_features_1 = [
    #'CBOE DJIA Volatility Index',
    #'NASDAQ 100 Index return',
    #'Manufacturers inventories to sales ratio',
    #'30 year conventional mortgage rate',
    #'Communication Services', 
    #'Consumer Discretionary', 
    #'Senior secured',  
    #'Time to maturity',  
    #'Equity value',
    #'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials','Consumer Staples','Financials','Energy','Health Care','Utilities','Information Technology','Real Estate'
]

supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

embeddings_columns = ['presentation_embedding', 'QnA_embedding', 'presentation_cleaned_embedding', 'QnA_cleaned_embedding']

# Select the supporting features, nlp_lables, and RR from final_df
final_df = checkpoint[['Date'] 
                    #+ supporting_features_1
                    #+ supporting_features_2
                    + embeddings_columns 
                    + ['RR']]

# replace #DIV/0! & Nan with 0
# Replace '#DIV/0!' with NaN
final_df.replace('#DIV/0!', np.nan, inplace=True)
final_df = final_df.fillna(0)

# make sure all values are numeric except for the Date column
final_df = final_df.apply(pd.to_numeric, errors='ignore')

final_df.head()

  mask |= arr == x
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


Unnamed: 0,Date,presentation_embedding,QnA_embedding,presentation_cleaned_embedding,QnA_cleaned_embedding,RR
0,1164931200000000000,"[-0.13621469, -0.03863703, 0.024582852, 0.0642...","[0.006389262, -0.06428379, 0.039283425, -0.030...","[0.0330176, -0.04302061, -0.17677653, 0.051578...","[-0.06587043, 0.047376696, -0.055016145, 0.060...",29.023972
1,1194307200000000000,"[-0.13627845, -0.039064318, 0.01957788, 0.0709...","[0.0022695146, -0.0684128, 0.047639284, -0.031...","[0.03032807, -0.03633119, -0.16206034, 0.06960...","[-0.069002956, 0.048829224, -0.053251397, 0.05...",8.86942
2,1201219200000000000,"[-0.13572867, -0.045439865, 0.032690413, 0.070...","[0.0068305586, -0.0673575, 0.056243017, -0.022...","[0.05018964, -0.031179843, -0.19656299, 0.0548...","[-0.07196874, 0.045851048, -0.04380939, 0.0832...",33.210455
3,1201478400000000000,"[-0.1359667, -0.04059977, 0.024010217, 0.07212...","[0.009730226, -0.06818771, 0.050679263, -0.034...","[0.044179264, -0.03884086, -0.17875871, 0.0646...","[-0.07140504, 0.046560545, -0.039972365, 0.083...",11.875
4,1201651200000000000,"[-0.1362379, -0.043872066, 0.029845119, 0.0659...","[0.006883542, -0.06958085, 0.056686647, -0.028...","[0.04450196, -0.04885385, -0.18468922, 0.06653...","[-0.06751867, 0.046309773, -0.041060567, 0.081...",18.34


In [25]:
from sklearn.linear_model import Ridge


for key in embeddings_columns:

    print(f'Running regression for {key}')

    # transform final_df['presentation_embeddings'] to columns
    key_df = pd.concat([final_df['RR'], final_df[key].apply(pd.Series)], axis=1)

    # In-Sample-Regression
    y_train, y_test = key_df['RR'], key_df['RR']
    X_train, X_test = key_df.drop(columns=['RR']), key_df.drop(columns=['RR'])

    # Fit the model
    model = sm.OLS(y_train, sm.add_constant(X_train)).fit()

    # Make predictions
    y_pred = model.predict(sm.add_constant(X_test))

    # Compute the R^2 score
    r2 = r2_score(y_test, y_pred)
    print(f'R^2 Score for {key}: {r2}')

    # Apply ridge regression
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train, y_train)
    y_pred_ridge = ridge_model.predict(X_test)
    r2_ridge = r2_score(y_test, y_pred_ridge)
    print(f'R^2 Score for Ridge Regression on {key}: {r2_ridge}')
    

Running regression for presentation_embedding
R^2 Score for presentation_embedding: 0.6610568311343281
R^2 Score for Ridge Regression on presentation_embedding: 0.06029612992229616
Running regression for QnA_embedding
R^2 Score for QnA_embedding: 0.6527253029901321
R^2 Score for Ridge Regression on QnA_embedding: 0.1950835491909897
Running regression for presentation_cleaned_embedding
R^2 Score for presentation_cleaned_embedding: 0.6793402251543508
R^2 Score for Ridge Regression on presentation_cleaned_embedding: 0.07821713741419944
Running regression for QnA_cleaned_embedding
R^2 Score for QnA_cleaned_embedding: 0.6674648820220185
R^2 Score for Ridge Regression on QnA_cleaned_embedding: 0.18373720178239705
