In [34]:
import pandas as pd
import re
! pip install nltk
import nltk
import numpy as np



In [35]:
dataset = pd.read_csv('AnnualReports16_processed.csv', encoding='latin-1')
# 计算每列的空缺值数量和百分比
missing_values = dataset.isnull().sum()
percentage_missing = (missing_values / len(dataset)) * 100

# 创建一个DataFrame来展示结果
missing_data_summary = pd.DataFrame({
    'Column Name': dataset.columns,
    'Missing Values': missing_values,
    'Percentage': percentage_missing
})

# 打印结果
print(missing_data_summary)

                                   Column Name  Missing Values  Percentage
cik                                        cik               0    0.000000
company_name                      company_name               0    0.000000
filed_date                          filed_date               0    0.000000
market_abnormal_return  market_abnormal_return               0    0.000000
nasdq                                    nasdq               0    0.000000
market_value                      market_value               0    0.000000
btm                                        btm             186    5.472198
pre_alpha                            pre_alpha               0    0.000000
pre_rmse                              pre_rmse               0    0.000000
InstOwn_Perc                      InstOwn_Perc               0    0.000000
log_share                            log_share               5    0.147102
processed_text                  processed_text             298    8.767284


In [36]:
# 删除包含空值的行
dataset_cleaned = dataset.dropna()

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [38]:
y = dataset_cleaned['market_abnormal_return']  # 因变量

In [53]:
lexicon_LM = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2023.csv')
# 将'Word'列的所有值转换为小写
lexicon_LM['Word'] = lexicon_LM['Word'].str.lower()

lexicon_LM.head()


Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,aardvark,1,664,2.69e-08,1.86e-08,4.05e-06,131,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.21e-10,8.23e-12,9.02e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,3.64e-10,1.11e-10,5.16e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,29,1.17e-09,6.33e-10,1.56e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,9349,3.79e-07,3.83e-07,3.46e-05,1239,0,0,0,0,0,0,0,0,3,12of12inf


In [54]:
# 从词典中提取每个分类的词汇列表
Negative = set(lexicon_LM[lexicon_LM['Negative'] != 0]['Word'])
Positive = set(lexicon_LM[lexicon_LM['Positive'] != 0]['Word'])
Uncertainty = set(lexicon_LM[lexicon_LM['Uncertainty'] != 0]['Word'])
Litigious = set(lexicon_LM[lexicon_LM['Litigious'] != 0]['Word'])
Strong_Modal = set(lexicon_LM[lexicon_LM['Strong_Modal'] != 0]['Word'])
Weak_Modal = set(lexicon_LM[lexicon_LM['Weak_Modal'] != 0]['Word'])
Constraining = set(lexicon_LM[lexicon_LM['Constraining'] != 0]['Word'])
Complexity = set(lexicon_LM[lexicon_LM['Complexity'] != 0]['Word'])


# 打印每个分类的词汇数量
for category, words in dictionaries.items():
    print(f"{category}: {len(words)} words")

Negative: 2355 words
Positive: 354 words
Uncertainty: 297 words
Litigious: 905 words
Strong_Modal: 19 words
Weak_Modal: 27 words
Constraining: 184 words
Complexity: 53 words


In [55]:
def sentiment_score(text, sen_list):
    # 确保文本不为空或None
    if pd.isnull(text) or text == "":
        return 0
    # 计算文本中情感词汇的总数
    total_count = sum(text.lower().count(word) for word in sen_list if word in text.lower())
    # 归一化得分，使用文本长度作为分母
    return total_count / max(len(text), 1)

In [56]:
for sentiment, score_name in zip(
    [Negative, Positive, Uncertainty, Litigious, Strong_Modal, Weak_Modal, Constraining, Complexity],
    ['Negative_score', 'Positive_score', 'Uncertainty_score', 'Litigious_score', 'Strong_Modal_score', 'Weak_Modal_score', 'Constraining_score', 'Complexity_score']
):
    dataset_cleaned.loc[:, score_name] = dataset_cleaned['processed_text'].apply(lambda x: sentiment_score(x, list(sentiment)))

print(dataset_cleaned.head())

         cik               company_name  filed_date  market_abnormal_return  \
0   804212.0                 airgas inc  20160510.0               -0.008756   
1   880460.0  perfumania holdings, inc.  20160429.0               -0.004723   
2  1276591.0         hansen medical inc  20160425.0               -0.007461   
4    12659.0              h&r block inc  20160617.0               -0.001475   
5  1050825.0              steelcase inc  20160415.0               -0.051923   

   nasdq  market_value       btm  pre_alpha  pre_rmse  InstOwn_Perc  \
0      0      9.238859 -1.615560   0.168271  2.084189      0.846428   
1      1      3.546929  0.883620  -0.149365  4.478955      0.097400   
2      1      3.784871 -3.415026  -0.183296  7.257763      0.305207   
4      0      8.403640 -5.263362  -0.113956  1.942857      0.957540   
5      0      7.307156 -0.704711  -0.055984  1.959769      0.860600   

   log_share                                     processed_text  \
0   1.137248  item 7. managemen

In [112]:
X_LM = dataset_cleaned[['nasdq', 'market_value', 'btm', 'pre_alpha', 'pre_rmse', 'InstOwn_Perc', 'log_share','Negative_score', 'Positive_score', 'Uncertainty_score', 'Litigious_score', 'Strong_Modal_score', 'Weak_Modal_score','Constraining_score','Complexity_score']]

In [113]:
# Divide the data set into training set and test set
x_LM_train, x_LM_test, y_LM_train, y_LM_test = train_test_split(X_LM, y, test_size=0.2, random_state=42)

In [122]:
# Create a linear regression model and train it
model_LM = LinearRegression()
model_LM.fit(x_LM_train, y_LM_train)

In [123]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error, median_absolute_error
# Predict test set
y_LM_pred = model_LM.predict(x_LM_test)


# Evaluate model performance
mse_LM = mean_squared_error(y_LM_test, y_LM_pred)
mae_LM = mean_absolute_error(y_LM_test, y_LM_pred)
r2_LM= r2_score(y_LM_test, y_LM_pred)
medae_LM = median_absolute_error(y_LM_test, y_LM_pred)


print(f"Model_count_vector reply count MSE: {mse_LM}")
print(f'MAE_reply count: {mae_LM}')
print(f'R²__reply count: {r2_LM}')
print(f'MedAE_reply count: {medae_LM}')

Model_count_vector reply count MSE: 0.0034225666801598834
MAE_reply count: 0.029988428227333084
R²__reply count: -0.0018657976564100398
MedAE_reply count: 0.01665407105826146


In [124]:
print(f"model_LM.coef_: {model_LM.coef_}")

model_LM.coef_: [-1.94648504e-03 -1.30237296e-03 -7.79017182e-04 -6.44730736e-03
 -6.08590713e-04  3.09593225e-03 -1.22580472e-05 -2.31028479e-01
 -6.76478481e-02  5.29701305e-01 -7.76614467e-01 -3.71465537e+00
 -1.34047219e+00 -3.26660724e-01  4.31501586e-01]


In [132]:
import statsmodels.api as sm
X_LM = sm.add_constant(X_LM)
model = sm.OLS(y, X_LM).fit()
print(model.summary())

                              OLS Regression Results                              
Dep. Variable:     market_abnormal_return   R-squared:                       0.003
Model:                                OLS   Adj. R-squared:                 -0.002
Method:                     Least Squares   F-statistic:                    0.6284
Date:                    Sun, 28 Apr 2024   Prob (F-statistic):              0.854
Time:                            02:04:23   Log-Likelihood:                 4357.0
No. Observations:                    2919   AIC:                            -8682.
Df Residuals:                        2903   BIC:                            -8586.
Df Model:                              15                                         
Covariance Type:                nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [133]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# 计算每个变量的VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X_LM.columns
vif_data["VIF"] = [variance_inflation_factor(X_LM.values, i) for i in range(len(X_LM.columns))]

print(vif_data)

               feature        VIF
0                const  71.392491
1                nasdq   1.203330
2         market_value   2.659915
3                  btm   1.285010
4            pre_alpha   1.094241
5             pre_rmse   1.999623
6         InstOwn_Perc   1.638120
7            log_share   1.830058
8       Negative_score   1.302296
9       Positive_score   1.261898
10   Uncertainty_score   1.967444
11     Litigious_score   1.138363
12  Strong_Modal_score   1.260705
13    Weak_Modal_score   2.011450
14  Constraining_score   1.164908
15    Complexity_score   1.205204


In [139]:
def stepwise_regression(X_LM, y, significance_level=0.1):
    features = X_LM.columns.tolist()
    while (len(features) > 0):
        # Add a constant term to the features
        features_with_const = sm.add_constant(X_LM[features])
        
        # Fit the model and get p-values for the features
        p_values = sm.OLS(y, features_with_const).fit().pvalues[1:]  # Exclude constant
        
        # Check if p_values contains only NaNs or is empty
        if p_values.isnull().all() or p_values.empty:
            break
        
        # Get the feature with the maximum p-value
        max_p_feature = p_values.idxmax() if not p_values.isna().any() else None
        
        # If the maximum p-value is greater than or equal to the significance level, remove the feature
        if (max_p_feature is not None) and (p_values[max_p_feature] >= significance_level):
            features.remove(max_p_feature)
        else:
            break
    
    # Create a new DataFrame with the remaining features
    X_stepwise = X_LM[features]
    
    # Add a constant term to the final model
    X_stepwise = sm.add_constant(X_stepwise)
    
    return X_stepwise

# 使用自定义的逐步回归函数
X_LM_stepwise = stepwise_regression(X_LM, y)

# 拟合最终模型
model_stepwise = sm.OLS(y, X_LM_stepwise).fit()
print(model_stepwise.summary())

                              OLS Regression Results                              
Dep. Variable:     market_abnormal_return   R-squared:                      -0.000
Model:                                OLS   Adj. R-squared:                 -0.000
Method:                     Least Squares   F-statistic:                       nan
Date:                    Sun, 28 Apr 2024   Prob (F-statistic):                nan
Time:                            02:22:49   Log-Likelihood:                 4352.3
No. Observations:                    2919   AIC:                            -8703.
Df Residuals:                        2918   BIC:                            -8697.
Df Model:                               0                                         
Covariance Type:                nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       