In [1]:
import os
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from nltk.stem.snowball import SnowballStemmer
from sklearn.decomposition import LatentDirichletAllocation
import statsmodels.api as sm
import matplotlib.pyplot as plt
import re
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import matplotlib.pyplot as plt

from source.models import *
from source.preprocessing import *
from source.variables import *
from source.helpers import *

from interpret.glassbox import ExplainableBoostingRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
transcript = pd.read_csv('transcripts/transcripts.csv', delimiter='|')
qna =  pd.read_csv('transcripts/QnA.csv', delimiter='|')

# Merge the two dataframes
df = pd.merge(transcript, qna[['transcript','filename']], on='filename')

# rename transcript_x to presentation and transcript_y to QnA
df = df.rename(columns={'transcript_x': 'presentation', 'transcript_y': 'QnA'})

In [3]:
mapping = pd.read_csv('data/mapping.csv')

# create a new column 'AllNames' that concatenates all versions of 'Company' for a 'CompanyName'
mapping['AllNames'] = mapping.groupby('RR_CompanyName')['Transcript_Mapping'].transform(lambda x: ', '.join(x))
mapping.head()

Unnamed: 0,RR_CompanyName,Transcript_Mapping,AllNames
0,"1-800-FLOWERS.COM, Inc.",1-800-Flowers.com Inc.,1-800-Flowers.com Inc.
1,3M Company,3M Company,"3M Company, 3M Co."
2,3M Company,3M Co.,"3M Company, 3M Co."
3,A.M. Castle & Co.,A. M. Castle Co.,"A. M. Castle Co., A.M. Castle Co., AM Castle..."
4,A.M. Castle & Co.,A.M. Castle Co.,"A. M. Castle Co., A.M. Castle Co., AM Castle..."


In [4]:
# Load recovery rates
rr = pd.read_csv('data/RR_Bonds.csv')
rr = rr[['Ddate', 'RR', 'CompanyName', 'CUSIP', 'LTDIssuance2', 'Intangibility', 'Receivables1']]

preprocessed_df = pd.read_csv('data/preprocessed_bond_data.csv')

# Add rr columns to preprocessed_df on index
preprocessed_df['RR'] = rr['RR']
preprocessed_df['Ddate'] = rr['Ddate']
preprocessed_df['CompanyName'] = rr['CompanyName']
preprocessed_df['CUSIP'] = rr['CUSIP']
preprocessed_df['LTDIssuance2'] = rr['LTDIssuance2']
preprocessed_df['Intangibility'] = rr['Intangibility']
preprocessed_df['Receivables1'] = rr['Receivables1']

rr = preprocessed_df

# Convert 'Date' column to datetime
rr['Ddate'] = pd.to_datetime(rr['Ddate'], errors='coerce')
rr.head()

Unnamed: 0,RR,ActIndustryDistress1,ActIndustryDistress2,Senior secured,Senior unsecured,Senior subordinated,Subordinated \& Junior,Equity value,Default barrier,Net income margin,...,Russell 2000 Price Index return,Russell 2000 Vol 1m,Wilshire US Small-Cap Price Index,Wilshire Small Cap Vol,Ddate,CompanyName,CUSIP,LTDIssuance2,Intangibility,Receivables1
0,0.18901,0,0,0,1,0,0,1.28712,0.258205,-0.776257,...,0.01903,21.04,3056.03,808.357714,2004-01-01,Bethlehem Steel Corp.,087509AL9,0.467834468,0.058009127,0.029416454
1,20.553472,0,1,0,1,0,0,-135.215,1.269706,-0.564199,...,0.01903,21.04,3137.1,974.74921,2004-05-01,"T-Mobile US, Inc.",45071TAD7,0.0,0.200428895,0.032214499
2,54.315958,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AC5,0.0,0.005146611,0.032214499
3,54.79887,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AD3,0.0,0.005146611,0.029416454
4,56.666288,0,1,0,1,0,0,-366.575,1.081883,-0.671751,...,0.01903,21.05,3178.04,825.987663,2004-01-15,RCN Corporation,749361AG6,0.0,0.005146611,0.029416454


In [5]:
# merge rr with mapping on CompanyName and RR_CompanyName
rr = rr.merge(mapping, left_on='CompanyName', right_on='RR_CompanyName')

In [6]:
# join with df on Company and Transcripts_Mapping
merged_df = rr.merge(df, left_on='Transcript_Mapping', right_on='Company')
print(merged_df['CompanyName'].value_counts())

Ally Financial Inc.               10317
CIT Group Inc.                    10185
Lehman Brothers Holdings, Inc.     2853
Charter Communications, Inc.       2144
Sempra Energy                      1147
                                  ...  
Frontier Group Holdings, Inc.         1
Dayton Superior Corporation           1
Franklin Bank Corp.                   1
Kellwood Company, LLC                 1
Turning Point Brands, Inc.            1
Name: CompanyName, Length: 210, dtype: int64


In [7]:
# Ensure the columns are in datetime format
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df['Ddate'] = pd.to_datetime(merged_df['Ddate'])

# Compute the difference in days
merged_df['diff'] = (merged_df['Ddate'] - merged_df['Date']).dt.days

merged_df = merged_df[merged_df['Ddate']>merged_df['Date']]
merged_df = merged_df.sort_values(by='Date').groupby(['CUSIP']).tail(1)

print(merged_df['CompanyName'].value_counts())

Lehman Brothers Holdings, Inc.      317
CIT Group Inc.                      291
Charter Communications, Inc.         28
Ford Motor Company                   19
iStar Inc.                           17
                                   ... 
Centrus Energy Corp.                  1
Education Management Corporation      1
Venoco, Inc.                          1
Exelon Corporation                    1
Kellwood Company, LLC                 1
Name: CompanyName, Length: 159, dtype: int64


In [8]:
# Initialize stemmer
stemmer = SnowballStemmer("english")

# Keywords for each credit factor
credit_keywords = {
    'Profitability': ['revenue', 'cost', 'profit', 'earnings', 'margins', 'performance', 'income', 'loss', 'decline', 'decrease', 'outlook', 'guidance'],
    'Liquidity': ['cash', 'liquidity', 'credit', 'flow', 'operations', 'expenditures', 'free cash', 'working capital', 'insolvency', 'crunch', 'flexibility', 'funding'],
    'Leverage': ['debt', 'leverage', 'refinancing', 'reduction', 'interest', 'coverage', 'repayments', 'compliance', 'rating', 'default', 'restructuring'],
    'Operating': ['sales', 'market share', 'efficiency', 'cost', 'position', 'conditions', 'production', 'challenges', 'decline', 'improvement'],
    'Market': ['stock', 'market', 'investor', 'volatility', 'shareholder', 'confidence', 'buybacks', 'dilution', 'perception'],
    'Management': ['management', 'strategic', 'restructuring', 'strategy', 'adaptability', 'leadership', 'initiatives', 'governance', 'organizational', 'CEO', 'board']
}

# Function to identify sections with potential bankruptcy indicators
def identify_bankruptcy_indicators(transcript, keywords):
    sentences = nltk.sent_tokenize(transcript)
    indicator_sentences = []
    for sentence in sentences:
        for key in keywords:
            if any(re.search(r'\b' + re.escape(word) + r'\b', sentence, re.IGNORECASE) for word in keywords[key]):
                indicator_sentences.append(sentence)
                break

    return ' '.join(indicator_sentences)

# Function to clean text by stemming and replacing numbers with magnitude tokens
def clean_text(text):
    # Replace numbers with tokens
    text = re.sub(r'\b\d+(\.\d+)?\s?(billion|bln)\b', 'bln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+(\.\d+)?\s?(million|mln)\b', 'mln', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d{1,3}(,\d{3})*(\.\d+)?\b', 'num', text)  # Replace remaining numbers with 'num'
    
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize, stem, and rejoin
    words = nltk.word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words if len(word) > 1]  # Remove single characters that might be noise
    return ' '.join(stemmed_words)

# Transform transcript to lowercase
merged_df['presentation'] = merged_df['presentation'].str.lower()
merged_df['QnA'] = merged_df['QnA'].str.lower()

# Apply function to identify bankruptcy indicators
merged_df['presentation_cleaned'] = merged_df['presentation'].apply(lambda x: identify_bankruptcy_indicators(x, credit_keywords))
merged_df['QnA_cleaned'] = merged_df['QnA'].apply(lambda x: identify_bankruptcy_indicators(x, credit_keywords))

# Apply function to clean text
merged_df['presentation'] = merged_df['presentation'].apply(clean_text)
merged_df['QnA'] = merged_df['QnA'].apply(clean_text)

merged_df['presentation_cleaned'] = merged_df['presentation_cleaned'].apply(clean_text)
merged_df['QnA_cleaned'] = merged_df['QnA_cleaned'].apply(clean_text)

# reset index
merged_df.reset_index(drop=True, inplace=True)

In [9]:
sentiment = pd.read_csv('data/sentiment.csv')

# drop all columns except Word, Negative, Positive, Uncertainty, Litigious, Strong_Modal, Weak_Modal
sentiment = sentiment[['Word', 'Negative', 'Positive', 'Uncertainty', 'Litigious', 'StrongModal', 'WeakModal']]

# change any number in Negative, Positive, Uncertainty, Litigious, Strong_Modal, Weak_Modal to 1
sentiment.loc[sentiment['Negative'] != 0, 'Negative'] = 1
sentiment.loc[sentiment['Positive'] != 0, 'Positive'] = 1
sentiment.loc[sentiment['Uncertainty'] != 0, 'Uncertainty'] = 1
sentiment.loc[sentiment['Litigious'] != 0, 'Litigious'] = 1
sentiment.loc[sentiment['StrongModal'] != 0, 'StrongModal'] = 1
sentiment.loc[sentiment['WeakModal'] != 0, 'WeakModal'] = 1

# drop all rows where all columns are 0
sentiment = sentiment[(sentiment[['Negative', 'Positive', 'Uncertainty', 'Litigious', 'StrongModal', 'WeakModal']] != 0).any(axis=1)]

sentiment.head()

Unnamed: 0,Word,Negative,Positive,Uncertainty,Litigious,StrongModal,WeakModal
9,ABANDON,1,0,0,0,0,0
10,ABANDONED,1,0,0,0,0,0
11,ABANDONING,1,0,0,0,0,0
12,ABANDONMENT,1,0,0,0,0,0
13,ABANDONMENTS,1,0,0,0,0,0


In [10]:
# make all words in Word column lowercase and stem them
sentiment['Word'] = sentiment['Word'].str.lower()
sentiment['Word'] = sentiment['Word'].apply(lambda x: stemmer.stem(x))

# drop duplicates
sentiment.drop_duplicates(subset='Word', keep='first', inplace=True)

In [11]:
# in lower case
financial_lexicon = sentiment['Word'].tolist()

def filter_tokens_by_lexicon(text, lexicon):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word in lexicon]
    return ' '.join(filtered_tokens)

filtered_presentation = [filter_tokens_by_lexicon(transcript, financial_lexicon) for transcript in merged_df['presentation']]
filtered_presentation_cleaned = [filter_tokens_by_lexicon(transcript, financial_lexicon) for transcript in merged_df['presentation_cleaned']]
filtered_qna = [filter_tokens_by_lexicon(transcript, financial_lexicon) for transcript in merged_df['QnA']]
filtered_qna_cleaned = [filter_tokens_by_lexicon(transcript, financial_lexicon) for transcript in merged_df['QnA_cleaned']]

# Build the DTM
vectorizer = CountVectorizer()

presentation_dtm = vectorizer.fit_transform(filtered_presentation)
presentation_dtm_df = pd.DataFrame(presentation_dtm.toarray(), columns=vectorizer.get_feature_names_out())

presentation_cleaned_dtm = vectorizer.fit_transform(filtered_presentation_cleaned)
presentation_cleaned_dtm_df = pd.DataFrame(presentation_cleaned_dtm.toarray(), columns=vectorizer.get_feature_names_out())

qna_dtm = vectorizer.fit_transform(filtered_qna)
qna_dtm_df = pd.DataFrame(qna_dtm.toarray(), columns=vectorizer.get_feature_names_out())

qna_cleaned_dtm = vectorizer.fit_transform(filtered_qna_cleaned)
qna_cleaned_dtm_df = pd.DataFrame(qna_cleaned_dtm.toarray(), columns=vectorizer.get_feature_names_out())

In [21]:
# List of selected supporting features
supporting_features_1 = [
    'CBOE DJIA Volatility Index',
    'NASDAQ 100 Index return',
    'Manufacturers inventories to sales ratio',
    '30 year conventional mortgage rate',
    'Communication Services', 
    'Consumer Discretionary', 
    'Senior secured',  
    'Time to maturity',  
    'Equity value',
    'CDS availability',
    'ActIndustryDistress1',
    'ActIndustryDistress2',
    'Offering amount',
    'Volume',
    'Industrials','Consumer Staples','Financials','Energy','Health Care','Utilities','Information Technology','Real Estate'
]

supporting_features_2 = [
    'Default barrier',
    'LTDIssuance2',
    'Intangibility',
    'Receivables1',
]

In [65]:
# create a dictionary of the dataframes
dtm_dict = {
    'presentation': presentation_dtm_df,
    'presentation_cleaned': presentation_cleaned_dtm_df,
    'QnA': qna_dtm_df,
    'QnA_cleaned': qna_cleaned_dtm_df
}

# create a dictionary of dataframes for the 4 types which based on merged_df['RR']
dict_rr = {
    'presentation': merged_df[['RR','Date', 'presentation'] + supporting_features_1 + supporting_features_2].reset_index(drop=True),
    'presentation_cleaned': merged_df[['RR','Date', 'presentation_cleaned'] + supporting_features_1 + supporting_features_2].reset_index(drop=True),
    'QnA': merged_df[['RR','Date', 'QnA'] + supporting_features_1 + supporting_features_2].reset_index(drop=True),
    'QnA_cleaned': merged_df[['RR','Date', 'QnA_cleaned'] + supporting_features_1 + supporting_features_2].reset_index(drop=True),
    'all': merged_df[['RR','Date', 'presentation', 'QnA'] + supporting_features_1 + supporting_features_2].reset_index(drop=True),
    'all_cleaned': merged_df[['RR','Date', 'presentation_cleaned', 'QnA_cleaned'] + supporting_features_1 + supporting_features_2].reset_index(drop=True)
}

In [66]:
# create lists of categories
negative_words = sentiment[sentiment['Negative'] == 1]['Word'].str.lower().tolist()
positive_words = sentiment[sentiment['Positive'] == 1]['Word'].str.lower().tolist()
uncertainty_words = sentiment[sentiment['Uncertainty'] == 1]['Word'].str.lower().tolist()
litigious_words = sentiment[sentiment['Litigious'] == 1]['Word'].str.lower().tolist()
strong_modal_words = sentiment[sentiment['StrongModal'] == 1]['Word'].str.lower().tolist()
weak_modal_words = sentiment[sentiment['WeakModal'] == 1]['Word'].str.lower().tolist()

In [67]:
for key in dtm_dict:

    # add sentiment scores to the dataframe
    dict_rr[key]['NegativeScore'] = 0
    dict_rr[key]['PositiveScore'] = 0
    dict_rr[key]['UncertaintyScore'] = 0
    dict_rr[key]['LitigiousScore'] = 0
    dict_rr[key]['StrongModalScore'] = 0
    dict_rr[key]['WeakModalScore'] = 0

    # iterate of columns in the DTM and add sentiment scores to the dataframe based on categories
    for column in dtm_dict[key].columns:
        if column in negative_words:
            dict_rr[key]['NegativeScore'] += dtm_dict[key][column]
        if column in positive_words:
            dict_rr[key]['PositiveScore'] += dtm_dict[key][column]
        if column in uncertainty_words:
            dict_rr[key]['UncertaintyScore'] += dtm_dict[key][column]
        if column in litigious_words:
            dict_rr[key]['LitigiousScore'] += dtm_dict[key][column]
        if column in strong_modal_words:
            dict_rr[key]['StrongModalScore'] += dtm_dict[key][column]
        if column in weak_modal_words:
            dict_rr[key]['WeakModalScore'] += dtm_dict[key][column]
    
    # compute sentiment = PositiveScore - NegativeScore / Total word count
    dict_rr[key]['Sentiment'] = (dict_rr[key]['PositiveScore'] - dict_rr[key]['NegativeScore']) / len(dict_rr[key][key])
    dict_rr[key]['OptimismScore'] = (dict_rr[key]['StrongModalScore'] - dict_rr[key]['UncertaintyScore']) / len(dict_rr[key][key])
    dict_rr[key]['RiskScore'] = (dict_rr[key]['LitigiousScore'] + dict_rr[key]['WeakModalScore']) / len(dict_rr[key][key])

In [69]:
dict_rr['all']['NegativeScore'] = dict_rr['presentation']['NegativeScore'] + dict_rr['QnA']['NegativeScore']
dict_rr['all']['PositiveScore'] = dict_rr['presentation']['PositiveScore'] + dict_rr['QnA']['PositiveScore']
dict_rr['all']['UncertaintyScore'] = dict_rr['presentation']['UncertaintyScore'] + dict_rr['QnA']['UncertaintyScore']
dict_rr['all']['LitigiousScore'] = dict_rr['presentation']['LitigiousScore'] + dict_rr['QnA']['LitigiousScore']
dict_rr['all']['StrongModalScore'] = dict_rr['presentation']['StrongModalScore'] + dict_rr['QnA']['StrongModalScore']
dict_rr['all']['WeakModalScore'] = dict_rr['presentation']['WeakModalScore'] + dict_rr['QnA']['WeakModalScore']
dict_rr['all']['Sentiment'] = (dict_rr['all']['PositiveScore'] - dict_rr['all']['NegativeScore']) / len(dict_rr['all']['presentation'] + dict_rr['all']['QnA'])
dict_rr['all']['OptimismScore'] = (dict_rr['all']['StrongModalScore'] - dict_rr['all']['UncertaintyScore']) / len(dict_rr['all']['presentation'] + dict_rr['all']['QnA'])
dict_rr['all']['RiskScore'] = (dict_rr['all']['LitigiousScore'] + dict_rr['all']['WeakModalScore']) / len(dict_rr['all']['presentation'] + dict_rr['all']['QnA'])

dict_rr['all_cleaned']['NegativeScore'] = dict_rr['presentation_cleaned']['NegativeScore'] + dict_rr['QnA_cleaned']['NegativeScore']
dict_rr['all_cleaned']['PositiveScore'] = dict_rr['presentation_cleaned']['PositiveScore'] + dict_rr['QnA_cleaned']['PositiveScore']
dict_rr['all_cleaned']['UncertaintyScore'] = dict_rr['presentation_cleaned']['UncertaintyScore'] + dict_rr['QnA_cleaned']['UncertaintyScore']
dict_rr['all_cleaned']['LitigiousScore'] = dict_rr['presentation_cleaned']['LitigiousScore'] + dict_rr['QnA_cleaned']['LitigiousScore']
dict_rr['all_cleaned']['StrongModalScore'] = dict_rr['presentation_cleaned']['StrongModalScore'] + dict_rr['QnA_cleaned']['StrongModalScore']
dict_rr['all_cleaned']['WeakModalScore'] = dict_rr['presentation_cleaned']['WeakModalScore'] + dict_rr['QnA_cleaned']['WeakModalScore']
dict_rr['all_cleaned']['Sentiment'] = (dict_rr['all_cleaned']['PositiveScore'] - dict_rr['all_cleaned']['NegativeScore']) / len(dict_rr['all_cleaned']['presentation_cleaned'] + dict_rr['all_cleaned']['QnA_cleaned'])
dict_rr['all_cleaned']['OptimismScore'] = (dict_rr['all_cleaned']['StrongModalScore'] - dict_rr['all_cleaned']['UncertaintyScore']) / len(dict_rr['all_cleaned']['presentation_cleaned'] + dict_rr['all_cleaned']['QnA_cleaned'])
dict_rr['all_cleaned']['RiskScore'] = (dict_rr['all_cleaned']['LitigiousScore'] + dict_rr['all_cleaned']['WeakModalScore']) / len(dict_rr['all_cleaned']['presentation_cleaned'] + dict_rr['all_cleaned']['QnA_cleaned'])

In [70]:
for key in dict_rr:
    # transform RR to label 1 if RR > 50 else 0
    dict_rr[key]['label'] = dict_rr[key]['RR'].apply(lambda x: 1 if x > 40 else 0)

    # if label == 1 and Sentiment > 0 then TP
    dict_rr[key]['Prediction'] = dict_rr[key].apply(lambda x: 1 if x['label'] == 1 and x['Sentiment'] > 0 else 0, axis=1)
    # if label == 0 and Sentiment < 0 then TN
    dict_rr[key]['Prediction'] = dict_rr[key].apply(lambda x: 1 if x['label'] == 0 and x['Sentiment'] < 0 else 0, axis=1)
    # if label == 1 and Sentiment < 0 then FN
    dict_rr[key]['Prediction'] = dict_rr[key].apply(lambda x: 1 if x['label'] == 1 and x['Sentiment'] < 0 else 0, axis=1)
    # if label == 0 and Sentiment > 0 then FP
    dict_rr[key]['Prediction'] = dict_rr[key].apply(lambda x: 1 if x['label'] == 0 and x['Sentiment'] > 0 else 0, axis=1)

In [71]:
# Compute accuracy for each type
for key in dict_rr:
    accuracy = dict_rr[key]['Prediction'].sum() / len(dict_rr[key])
    print(f'Accuracy for {key}: {accuracy}')

Accuracy for presentation: 0.14046511627906977
Accuracy for presentation_cleaned: 0.06511627906976744
Accuracy for QnA: 0.025116279069767444
Accuracy for QnA_cleaned: 0.03255813953488372
Accuracy for all: 0.062325581395348835
Accuracy for all_cleaned: 0.05116279069767442


In [74]:
for key in dict_rr:
    X = dict_rr[key][['Sentiment', 'OptimismScore', 'RiskScore']]
    y = dict_rr[key]['RR']

    X_train, X_test = X, X
    y_train, y_test = y, y

    # Fit the model
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)

    # Fit the model
    model = sm.OLS(y_train, X_train).fit()

    print(f"Score: {key}")
    # Print the summary of the model which includes p-values and significance levels
    print(model.summary())

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute and print evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

Score: presentation
                            OLS Regression Results                            
Dep. Variable:                     RR   R-squared:                       0.415
Model:                            OLS   Adj. R-squared:                  0.414
Method:                 Least Squares   F-statistic:                     253.7
Date:                Tue, 10 Sep 2024   Prob (F-statistic):          2.44e-124
Time:                        19:25:30   Log-Likelihood:                -4733.2
No. Observations:                1075   AIC:                             9474.
Df Residuals:                    1071   BIC:                             9494.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            35.6721  

In [72]:
for key in dict_rr:
    X = dict_rr[key][['Sentiment', 'OptimismScore', 'RiskScore'] + supporting_features_1]
    y = dict_rr[key]['RR']

    X_train, X_test = X, X
    y_train, y_test = y, y

    # Fit the model
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)

    # Fit the model
    model = sm.OLS(y_train, X_train).fit()

    print(f"Score: {key}")
    # Print the summary of the model which includes p-values and significance levels
    print(model.summary())

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute and print evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

Score: presentation
                            OLS Regression Results                            
Dep. Variable:                     RR   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.667
Method:                 Least Squares   F-statistic:                     86.88
Date:                Tue, 10 Sep 2024   Prob (F-statistic):          4.89e-235
Time:                        19:24:53   Log-Likelihood:                -4418.8
No. Observations:                1075   AIC:                             8890.
Df Residuals:                    1049   BIC:                             9019.
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

In [73]:
for key in dict_rr:
    # replace #DIV/0! & Nan with 0
    # Replace '#DIV/0!' with NaN
    # Replace '#DIV/0!' with NaN
    dict_rr[key].replace('#DIV/0!', np.nan, inplace=True)
    # Replace NaN with 0
    dict_rr[key].fillna(0, inplace=True)

    # make sure all values are numeric except for the Date column
    dict_rr[key] = dict_rr[key].apply(pd.to_numeric, errors='ignore')

    X = dict_rr[key][['Sentiment', 'OptimismScore', 'RiskScore'] + supporting_features_1 + supporting_features_2]
    y = dict_rr[key]['RR']

    X_train, X_test = X, X
    y_train, y_test = y, y

    # Fit the model
    X_train = sm.add_constant(X_train)
    X_test = sm.add_constant(X_test)

    # Fit the model
    model = sm.OLS(y_train, X_train).fit()

    print(f"Score: {key}")
    # Print the summary of the model which includes p-values and significance levels
    print(model.summary())

    # Make predictions
    y_pred = model.predict(X_test)

    # Compute and print evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {np.sqrt(mse)}")
    print(f"R-squared: {r2}")

Score: presentation
                            OLS Regression Results                            
Dep. Variable:                     RR   R-squared:                       0.695
Model:                            OLS   Adj. R-squared:                  0.687
Method:                 Least Squares   F-statistic:                     82.25
Date:                Tue, 10 Sep 2024   Prob (F-statistic):          3.72e-246
Time:                        19:24:54   Log-Likelihood:                -4382.9
No. Observations:                1075   AIC:                             8826.
Df Residuals:                    1045   BIC:                             8975.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

In [18]:
for key in dict_rr:
    # avg RR with sentiment > 0
    avg_rr_sentiment_pos = dict_rr[key][dict_rr[key]['Sentiment'] > 0]['RR'].mean()
    # avg RR with sentiment < 0
    avg_rr_sentiment_neg = dict_rr[key][dict_rr[key]['Sentiment'] < 0]['RR'].mean()

    print(f"Average RR with Sentiment > 0 for {key}: {avg_rr_sentiment_pos}")
    print(f"Average RR with Sentiment < 0 for {key}: {avg_rr_sentiment_neg}")

Average RR with Sentiment > 0 for presentation: 42.490470640318954
Average RR with Sentiment < 0 for presentation: 31.451367202373117
Average RR with Sentiment > 0 for presentation_cleaned: 54.27253288231814
Average RR with Sentiment < 0 for presentation_cleaned: 31.871187690647446
Average RR with Sentiment > 0 for QnA: 58.76408225661973
Average RR with Sentiment < 0 for QnA: 32.54350156477682
Average RR with Sentiment > 0 for QnA_cleaned: 59.143283130644235
Average RR with Sentiment < 0 for QnA_cleaned: 31.61833899636329
