In [None]:
from openai import OpenAI, RateLimitError
import pandas as pd
import numpy as np

from dotenv import load_dotenv
import os
import re

from sklearn.metrics import r2_score
import time
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

import nltk
from nltk.stem.snowball import SnowballStemmer

# Pull the variables defined in a local .env file into the process environment
load_dotenv()

# The API key is read from the environment instead of being written into the notebook
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Both transcript files are pipe-delimited
transcript = pd.read_csv('transcripts/transcripts.csv', sep='|')
qna = pd.read_csv('transcripts/QnA.csv', sep='|')

# Join the two frames on filename. The clashing 'transcript' columns receive the
# default _x / _y suffixes, which are then renamed to presentation / QnA.
qna_text = qna[['transcript', 'filename']]
df = transcript.merge(qna_text, on='filename').rename(
    columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'}
)

In [None]:
mapping = pd.read_csv('data/mapping.csv')

# For each RR_CompanyName, join all of its Transcript_Mapping values into one
# comma-separated string and write that string on every row of the company.
names_per_company = mapping.groupby('RR_CompanyName')['Transcript_Mapping']
mapping['AllNames'] = names_per_company.transform(lambda names: ', '.join(names))
mapping.head()

In [None]:
# Load recovery rates and keep only the columns that are attached below
rr = pd.read_csv('data/RR_Bonds.csv')
rr = rr[['Ddate', 'RR', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']]

preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Copy the rr columns onto preprocessed_df; rows are matched by index label.
# NOTE(review): this assumes both CSV files list the same bonds in the same order - confirm.
rr_columns = ['RR', 'Ddate', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']
for column in rr_columns:
    preprocessed_df[column] = rr[column]

# From here on `rr` is the enriched preprocessed frame (same object, not a copy)
rr = preprocessed_df

# Convert 'Ddate' to datetime; unparseable values become NaT
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')
rr.head()

In [None]:
# Inner join: keep bond rows whose CompanyName equals a RR_CompanyName in the mapping table
rr = pd.merge(rr, mapping, left_on='CompanyName', right_on='RR_CompanyName')

In [None]:
# Inner join of bond rows with transcripts: rr.Transcript_Mapping must equal df.Company
merged_df = pd.merge(rr, df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

In [None]:
# Both date columns must be datetime before they can be compared or subtracted
for date_col in ('Date', 'Ddate'):
    merged_df[date_col] = pd.to_datetime(merged_df[date_col])

# Number of days from Date to Ddate
merged_df['diff'] = (merged_df['Ddate'] - merged_df['Date']).dt.days

# Keep rows whose Date lies strictly before Ddate, then for every CUSIP keep only
# the row with the latest Date.
date_before_ddate = merged_df['Date'] < merged_df['Ddate']
merged_df = (
    merged_df[date_before_ddate]
    .sort_values(by='Date')
    .groupby(['CUSIP'])
    .tail(1)
)

print(merged_df['CompanyName'].value_counts())

In [None]:
# English Snowball stemmer shared by the text-cleaning step
stemmer = SnowballStemmer("english")

# Keywords (single words and multi-word phrases) for each credit factor.
# Some terms appear under more than one factor.
credit_keywords = {
    'Profitability': [
        'revenue', 'cost', 'profit', 'earnings', 'margins', 'performance',
        'income', 'loss', 'decline', 'decrease', 'outlook', 'guidance',
    ],
    'Liquidity': [
        'cash', 'liquidity', 'credit', 'flow', 'operations', 'expenditures',
        'free cash', 'working capital', 'insolvency', 'crunch', 'flexibility', 'funding',
    ],
    'Leverage': [
        'debt', 'leverage', 'refinancing', 'reduction', 'interest', 'coverage',
        'repayments', 'compliance', 'rating', 'default', 'restructuring',
    ],
    'Operating': [
        'sales', 'market share', 'efficiency', 'cost', 'position',
        'conditions', 'production', 'challenges', 'decline', 'improvement',
    ],
    'Market': [
        'stock', 'market', 'investor', 'volatility', 'shareholder',
        'confidence', 'buybacks', 'dilution', 'perception',
    ],
    'Management': [
        'management', 'strategic', 'restructuring', 'strategy', 'adaptability', 'leadership',
        'initiatives', 'governance', 'organizational', 'CEO', 'board',
    ],
}

# Function to identify sections with potential bankruptcy indicators
def identify_bankruptcy_indicators(transcript, keywords):
    """Return the sentences of ``transcript`` that mention at least one keyword.

    Matching is case-insensitive and restricted to whole words/phrases
    (regex word boundaries on both sides of each keyword).

    Args:
        transcript: Transcript text. Non-string values (e.g. NaN from an empty
            CSV cell) and empty strings yield ``''`` instead of raising.
        keywords: Mapping of category name -> list of keywords or phrases.
            The category names are not used; a sentence is kept as soon as
            any keyword of any category matches.

    Returns:
        The matching sentences in their original order, joined by single
        spaces; ``''`` when nothing matches.
    """
    if not isinstance(transcript, str) or not transcript:
        return ''

    terms = [re.escape(word) for words in keywords.values() for word in words]
    if not terms:
        return ''

    # One compiled alternation replaces building a separate regex for every
    # keyword on every sentence; the set of matching sentences is the same.
    pattern = re.compile(r'\b(?:' + '|'.join(terms) + r')\b', re.IGNORECASE)

    sentences = nltk.sent_tokenize(transcript)
    return ' '.join(sentence for sentence in sentences if pattern.search(sentence))

# Function to clean text by stemming and replacing numbers with magnitude tokens
def clean_text(text):
    """Normalise numbers to tokens, strip non-letters and stem every word.

    Steps:
      1. ``<number> billion|bln`` -> ``bln`` and ``<number> million|mln`` -> ``mln``.
      2. Every remaining number -> ``num``.
      3. Remove every character that is not an ASCII letter or whitespace.
      4. Tokenise, drop one-character tokens, stem with the module-level
         ``stemmer`` and join with single spaces.

    A "number" is a run of digits with optional ``,ddd`` thousands groups and
    an optional decimal part, so plain multi-digit values such as ``2023`` and
    grouped values such as ``1,500`` are recognised as a whole.

    Args:
        text: Text to clean. Non-string values (e.g. NaN) and empty strings
            yield ``''`` instead of raising.

    Returns:
        The cleaned, stemmed text.
    """
    if not isinstance(text, str) or not text:
        return ''

    # \d+ (rather than \d{1,3}) so that ungrouped numbers with four or more
    # digits are replaced by a token instead of being silently deleted in step 3.
    number = r'\d+(?:,\d{3})*(?:\.\d+)?'

    # Replace numbers with tokens (magnitudes first, so their digits are consumed)
    text = re.sub(r'\b' + number + r'\s?(?:billion|bln)\b', 'bln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b' + number + r'\s?(?:million|mln)\b', 'mln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b' + number + r'\b', 'num', text)

    # Remove everything that is not a letter or whitespace (this also drops stray digits)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, stem, and rejoin; single characters are treated as noise
    words = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(word) for word in words if len(word) > 1)

# Run the same three steps on both text columns:
# lowercase -> keep only sentences containing a credit keyword -> normalise numbers and stem
for text_col in ('presentation', 'QnA'):
    lowered = merged_df[text_col].str.lower()
    keyword_sentences = lowered.apply(lambda x: identify_bankruptcy_indicators(x, credit_keywords))
    merged_df[text_col] = keyword_sentences.apply(clean_text)

# reset index
merged_df = merged_df.reset_index(drop=True)

In [None]:
# Number of distinct cleaned texts in each of the two transcript columns
for text_col in ('presentation', 'QnA'):
    print(merged_df[text_col].nunique())

In [None]:
# Give every distinct cleaned presentation text an integer id; rows that share the
# same text share the same id.
merged_df = merged_df.reset_index(drop=True)
merged_df['transcript_number'] = pd.factorize(merged_df['presentation'])[0]

# One row per distinct (id, presentation, QnA) combination, ordered by id
transcript_cols = ['transcript_number', 'presentation', 'QnA']
number_transcript = (
    merged_df[transcript_cols]
    .drop_duplicates()
    .sort_values('transcript_number')
)

number_transcript.head()

In [None]:
# Total number of whitespace-separated words in each text column of number_transcript
for text_col in ('presentation', 'QnA'):
    words_per_text = number_transcript[text_col].apply(lambda text: len(text.split()))
    print(words_per_text.sum())

In [None]:
# Disk cache for the embedding input. When the file exists it is loaded (as before);
# when it does not, the frame built above is written out, so a fresh run no longer
# depends on un-commenting an export line by hand.
embeddings_input_path = 'transcripts/embeddings_input.csv'
if os.path.exists(embeddings_input_path):
    number_transcript = pd.read_csv(embeddings_input_path)
else:
    number_transcript.to_csv(embeddings_input_path, index=False)

In [None]:
# Plain Python lists holding the text of each section, one entry per number_transcript row
presentation_list = number_transcript['presentation'].tolist()
QnA_list = number_transcript['QnA'].tolist()

# Number of leading entries of presentation_list the embedding loop skips
start = 0

In [None]:
import time

# Keep the responses collected by an earlier, interrupted run of this cell so that
# it can resume; on a fresh kernel start with an empty list.
try:
    all_embeddings
except NameError:
    all_embeddings = []

max_attempts = 5           # give up on one text after this many rate-limit errors
retry_delay_seconds = 60   # pause before the same text is requested again

# all_embeddings[k] has to belong to presentation_list[k]: continue right after the
# last stored response and retry a rate-limited text instead of moving on, otherwise
# every later embedding would be attached to the wrong transcript.
for text in presentation_list[len(all_embeddings):]:
    for _attempt in range(max_attempts):
        try:
            response = client.embeddings.create(
                model="text-embedding-3-large",
                input=text,
                encoding_format="float",
            )
        except RateLimitError as e:
            print(f"Rate limit exceeded: {e}")
            time.sleep(retry_delay_seconds)
        else:
            all_embeddings.append(response)
            break
    else:
        # Stop here so the list stays aligned; re-running the cell resumes at this text
        raise RuntimeError(
            f"Embedding request for text {len(all_embeddings)} was still rate-limited "
            f"after {max_attempts} attempts"
        )

In [None]:
# Record how many embedding responses have been collected so far. The embedding
# loop treats `start` as the number of leading texts to skip, so re-running that
# loop after this cell continues after the last stored response.
start = len(all_embeddings)
print(f"Start: {start}")

In [None]:
number_transcript = number_transcript.reset_index(drop=True)

# Exactly one response per transcript row is required; with any other count the
# vectors and the rows cannot be paired up by position.
if len(all_embeddings) != len(number_transcript):
    raise ValueError(
        f"Expected {len(number_transcript)} embedding responses, got {len(all_embeddings)}"
    )

# Each response holds its vector at .data[0].embedding. Building the column in one
# assignment avoids chained indexing (number_transcript[col][i] = ...), which may
# write to a temporary copy, and removes the hard-coded row count.
number_transcript['presentation_embeddings'] = pd.Series(
    [response.data[0].embedding for response in all_embeddings],
    index=number_transcript.index,
    dtype=object,
)

# Show a preview only; the vectors are far too long to print in full
number_transcript.head()

In [None]:
# Left-join the embedding vectors onto merged_df through transcript_number
embedding_lookup = number_transcript[['transcript_number', 'presentation_embeddings']]
merged_df = merged_df.merge(embedding_lookup, on='transcript_number', how='left')

# Independent snapshot of the merged frame for the modelling cells
checkpoint = merged_df.copy()

In [None]:
#checkpoint.to_csv('transcripts/LLM_embeddings_presentation.csv', index=False)
#checkpoint = pd.read_csv('transcripts/LLM_embeddings_presentation.csv')

In [None]:
# List of selected supporting features
import numpy as np


# First group of supporting bond features; the commented-out names are currently excluded
supporting_features_1 = [
    #'CBOE DJIA Volatility Index',
    #'NASDAQ 100 Index return',
    #'Manufacturers inventories to sales ratio',
    #'30 year conventional mortgage rate',
    #'Communication Services',
    #'Consumer Discretionary',
    #'Senior secured',
    #'Time to maturity',
    #'Equity value',
    #'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials',
    'Consumer Staples',
    'Financials',
    'Energy',
    'Health Care',
    'Utilities',
    'Information Technology',
    'Real Estate',
]

# Second group of supporting bond features
supporting_features_2 = ['Default barrier', 'LTDIssuance2', 'Intangibility', 'Receivables1']

# Column(s) that hold one embedding vector per row
embeddings_columns = ['presentation_embeddings']

# Select Date, the embedding column(s) and RR from the checkpoint (the supporting
# feature groups are currently switched off). The .copy() makes final_df independent
# of checkpoint, so the clean-up below does not operate on a slice of another frame.
final_df = checkpoint[['Date']
                    #+ supporting_features_1
                    #+ supporting_features_2
                    + embeddings_columns
                    + ['RR']].copy()

# Replace '#DIV/0!' with NaN, then replace every missing value with 0
final_df = final_df.replace('#DIV/0!', np.nan).fillna(0)

# Make all columns numeric except Date and the embedding vectors. Converting column
# by column keeps Date as a datetime (pd.to_numeric would turn it into integers) and
# avoids the deprecated errors='ignore' option.
non_numeric_columns = ['Date'] + embeddings_columns
for column in final_df.columns.difference(non_numeric_columns):
    try:
        final_df[column] = pd.to_numeric(final_df[column])
    except (ValueError, TypeError):
        # Same outcome as the former errors='ignore': an unparseable column stays as it is
        pass

In [None]:
# Turn each embedding vector into one column per dimension.
# - Building the frame from the list of vectors does the same job as
#   .apply(pd.Series) without constructing one Series object per row.
# - The 'emb_' prefix gives the new columns string labels. Integer labels next to
#   string ones such as 'Date', 'RR' or a later 'const' produce mixed-type column
#   names, which scikit-learn estimators reject.
embedding_matrix = pd.DataFrame(
    final_df['presentation_embeddings'].tolist(),
    index=final_df.index,
).add_prefix('emb_')

final_df = pd.concat(
    [final_df.drop(columns='presentation_embeddings'), embedding_matrix],
    axis=1,
)
final_df.head()

In [None]:
# In-Sample-Regression: the model is fitted and evaluated on the same, complete data
y_train = final_df['RR']
y_test = final_df['RR']
X_train = final_df.drop(columns=['RR', 'Date'])
X_test = final_df.drop(columns=['RR', 'Date'])

In [None]:
# out of sample regression
# Positional split without shuffling: the first 80% of the rows train the model,
# the remaining rows are held out for testing.
train_size = int(0.8 * len(final_df))

features = final_df.drop(columns=['RR', 'Date'])
target = final_df['RR']

X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]

In [None]:
# Bond data 1 + LLM features

# Add the intercept column under separate names and with has_constant='add':
# - With the default ('skip') add_constant adds nothing when the data already holds
#   a constant column (e.g. a dummy that is all zeros in a small test split), so the
#   train and test matrices could end up with different columns.
# - Leaving X_train / X_test untouched keeps this cell re-runnable and keeps the
#   'const' column out of the estimators fitted in later cells.
X_train_ols = sm.add_constant(X_train, has_constant='add')
X_test_ols = sm.add_constant(X_test, has_constant='add')

# Fit the model
model = sm.OLS(y_train, X_train_ols).fit()

# Print the summary of the model which includes p-values and significance levels
print(model.summary())

# Make predictions
y_pred = model.predict(X_test_ols)

# Compute and print evaluation metrics
# NOTE(review): mean_squared_error and plt are not imported explicitly in this
# notebook; presumably they arrive through the `from source... import *` lines - confirm.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {np.sqrt(mse)}")
print(f"R-squared: {r2}")

# Residuals: observed minus predicted recovery rate
residuals = y_test - y_pred

# Plot the residuals against the observed recovery rate
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Recovery Rate')
ax.set_ylabel('Residuals')
ax.set_title('OLS residuals vs. observed recovery rate')

# Display the plot
plt.show()

In [None]:
# Apply ridge regression
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0)

# scikit-learn rejects frames whose column labels mix strings with other types.
# X_train can look like that here (a string 'const' column next to integer-labelled
# embedding columns), so all labels are converted to strings for the fit and the
# prediction. X_train / X_test themselves are left unchanged.
X_train_ridge = X_train.rename(columns=str)
X_test_ridge = X_test.rename(columns=str)

ridge_model.fit(X_train_ridge, y_train)
y_pred_ridge = ridge_model.predict(X_test_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f'R^2 Score for Ridge Regression: {r2_ridge}')

In [None]:
# True: fit and evaluate on all rows. False: positional 70/30 train/test split.
in_sample = True

features = final_df.drop(columns=['RR', 'Date'])
target = final_df['RR']

if not in_sample:
    # out of sample regression
    # Split the data into training and testing sets (first 70% of the rows train)
    train_size = int(0.7 * len(final_df))
    X_train, X_test = features.iloc[:train_size], features.iloc[train_size:]
    y_train, y_test = target.iloc[:train_size], target.iloc[train_size:]
else:
    # In-Sample-Regression
    X_train, X_test = features, features.copy()
    y_train, y_test = target, target

# change y to binary based on >< 50
#y_train = np.where(y_train > 50, 0, 1)
#y_test = np.where(y_test > 50, 0, 1)

In [None]:
# Function to train Model 1: Sentiment Prediction using Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

def train_sentiment_model(X_train, y_train):
    """Fit a logistic regression on standardised features.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Tuple ``(model, scaler)``: the fitted LogisticRegression and the
        StandardScaler fitted on ``X_train``, which must also be applied to
        any data passed to the model later.
    """
    scaler = StandardScaler().fit(X_train)
    classifier = LogisticRegression().fit(scaler.transform(X_train), y_train)
    return classifier, scaler

# LogisticRegression needs discrete class labels, while RR is a continuous target
# unless it was binarised beforehand. The labels are derived here with the rule
# noted in the notebook (RR > 50 -> 0, otherwise 1) and stored under separate names,
# so y_train / y_test keep their values for the regression cells.
def _as_binary_labels(y, threshold=50):
    """Return 0/1 labels: pass through a target that is already 0/1, else threshold it."""
    values = np.asarray(y)
    if np.isin(values, (0, 1)).all():
        return values.astype(int)
    return np.where(values > threshold, 0, 1)


y_train_cls = _as_binary_labels(y_train)
y_test_cls = _as_binary_labels(y_test)

# Train the sentiment model
sentiment_model, scaler_sentiment = train_sentiment_model(X_train, y_train_cls)

# Predict on test data, scaled with the statistics learned from the training data
X_test_scaled = scaler_sentiment.transform(X_test)
y_pred_sentiment = sentiment_model.predict(X_test_scaled)

# Evaluate the accuracy of the sentiment model
accuracy = accuracy_score(y_test_cls, y_pred_sentiment)
print(f'Sentiment Model Accuracy: {accuracy:.4f}')


In [None]:
# Function to train Model 2: Recovery Rate Prediction using Ridge Regression
from sklearn.linear_model import Ridge

# Independent copy of the modelling frame.
# NOTE(review): bonds_df is not referenced again in the cells visible here - confirm it is still needed.
bonds_df = final_df.copy()


def train_recovery_rate_model(X_train, y_train, alpha=1.0):
    """Fit a ridge regression on standardised features.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        alpha: Regularisation strength passed to ``Ridge``. Defaults to 1.0,
            the value that was previously fixed inside the function.

    Returns:
        Tuple ``(model, scaler)``: the fitted Ridge model and the
        StandardScaler fitted on ``X_train``, which must also be applied to
        any data passed to the model later.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    ridge_reg = Ridge(alpha=alpha)
    ridge_reg.fit(X_train_scaled, y_train)

    return ridge_reg, scaler

# Hold out 30% of the current X_train / y_train for evaluation (seed fixed at 42)
X_train_bonds, X_test_bonds, y_train_bonds, y_test_bonds = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler and the ridge model on the training part only
recovery_rate_model, scaler_recovery = train_recovery_rate_model(X_train_bonds, y_train_bonds)

# Scale the held-out part with the training statistics, then predict
X_test_bonds_scaled = scaler_recovery.transform(X_test_bonds)
y_pred_recovery = recovery_rate_model.predict(X_test_bonds_scaled)

# Report the error and the explained variance on the held-out part
mse = mean_squared_error(y_test_bonds, y_pred_recovery)
r2 = r2_score(y_test_bonds, y_pred_recovery)
print(f'Recovery Rate Model MSE: {mse:.4f}')
print(f'Recovery Rate Model R-squared: {r2:.4f}')
