In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
path = '../document/AnnualReports16_processed2.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share
0,804212.0,airgas inc,20160510.0,ITEM 7. MANAGEMENT S DISCUSSION AND ...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248
1,880460.0,"perfumania holdings, inc.",20160429.0,ITEM 7. MANAGEMENT S DISCUSSION AND ANALY...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746
2,1276591.0,hansen medical inc,20160425.0,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALY...,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817
3,1299969.0,"comstock holding companies, inc.",20160404.0,Item 7. Management s Disc ussion and Analysis...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309
4,12659.0,h&r block inc,20160617.0,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484


### test processing

In [4]:
import nltk
import spacy 
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer

In [5]:
# Function to count the number of sentences in a text
def count_sentences(text):
    # Handle NaN values by returning 0 sentences
    if pd.isnull(text):
        return 0
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

# Function to delete the first sentence in a text
def delete_first_sentence(text):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) > 2:
        return ' '.join(sentences[2:])
    else:
        return text

# Apply the count_sentences function to the 'item7' column and create a new column 'sentence_count'
df['sentence_count'] = df['item7'].apply(count_sentences)

# Filter the DataFrame to keep only rows with 10 or more sentences in 'item7'
df = df[df['sentence_count'] > 10]

# Delete the first sentence in each text in the 'item7' column
df['item7'] = df['item7'].apply(delete_first_sentence)

df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share,sentence_count
0,804212.0,airgas inc,20160510.0,RESULTS OF OPERATIONS: 2016 COMPARED TO ...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248,401
1,880460.0,"perfumania holdings, inc.",20160429.0,Retail sales decreased 14.8% compared to the p...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746,173
2,1276591.0,hansen medical inc,20160425.0,"In some cases, these statements may be identif...",-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817,296
3,1299969.0,"comstock holding companies, inc.",20160404.0,This discussion and analysis contains forward-...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309,162
4,12659.0,h&r block inc,20160617.0,Subsequent to the closing of the P A Transacti...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484,253


In [18]:
# Simple preprocessing by removing extra lines and lowercasing all text
df['item7'] = df['item7'].replace('\n','', regex=True)
df['item7'] = df['item7'].replace('\r','', regex=True)
df['item7'] = df['item7'].replace('\r','', regex=True)
df['item7'] = df['item7'].replace('[\d.,]+|[^\w\s]', '', regex=True)
df['item7'] = [x.lower() for x in df['item7']]
df['item7'] = df['item7'].replace('item 7.','', regex=True)

# Futher preprocessing by removing all stopwords and lemmatizing all text
documents = []

stemmer = WordNetLemmatizer()

for text in df['item7']:
    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English()

    #  "nlp" Object is used to create documents with linguistic annotations.
    my_doc = nlp(text)

    # Create list of word tokens
    token_list = []
    for token in my_doc:
        token_list.append(token.text)

    # Create list of word tokens after removing stopwords
    filtered_sentence =[] 

    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 

    document = [stemmer.lemmatize(word) for word in filtered_sentence]
    document = ' '.join(document)

    documents.append(document)

df['item7'] = documents
df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share,sentence_count
0,804212.0,airgas inc,20160510.0,organic sale decreased compared prior ye...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248,400
1,880460.0,"perfumania holdings, inc.",20160429.0,decrease retail sale lower mall traffic ...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746,172
2,1276591.0,hansen medical inc,20160425.0,statement involve known unknown risk uncertain...,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817,295
3,1299969.0,"comstock holding companies, inc.",20160404.0,actual result differ materially anticipated fo...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309,161
4,12659.0,h&r block inc,20160617.0,subsequent closing p transaction bofi began of...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484,252


### nrc list

In [5]:
nrc = pd.read_csv('NRC-Emotion-Lexicon.txt', sep = '\t', names = ['term', 'category', 'associated'])
nrc.head()

Unnamed: 0,term,category,associated
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [20]:
# rearrangement
category_list = nrc['category'] .unique().tolist()
filtered_df = lexicon[lexicon['associated'] == 1]
grouped_df = filtered_df.groupby('category')['term'].apply(list)
grouped_df

category
anger           [abandoned, abandonment, abhor, abhorrent, abo...
anticipation    [abundance, accelerate, accolade, accompanimen...
disgust         [aberration, abhor, abhorrent, abject, abnorma...
fear            [abandon, abandoned, abandonment, abduction, a...
joy             [absolution, abundance, abundant, accolade, ac...
negative        [abandon, abandoned, abandonment, abduction, a...
positive        [abba, ability, abovementioned, absolute, abso...
sadness         [abandon, abandoned, abandonment, abduction, a...
surprise        [abandonment, abduction, abrupt, accident, acc...
trust           [abacus, abbot, absolution, abundance, academi...
Name: term, dtype: object

In [74]:
anti_list = grouped_df.loc['anticipation']
nrc_pos_list = grouped_df.loc['positive']
nrc_neg_list = grouped_df.loc['negative']
ang_list = grouped_df.loc['anger']
anti_list =  grouped_df.loc['anticipation']
dis_list =  grouped_df.loc['disgust']
joy_list = grouped_df.loc['joy']
fear_list = grouped_df.loc['fear']
sad_list =  grouped_df.loc['sadness']
surp_list = grouped_df.loc['surprise']
tru_list =  grouped_df.loc['trust']

### McDonald list

In [9]:
mcd = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2023.csv')
mcd['Word'] = mcd['Word'].str.lower()
mcd.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,aardvark,1,664,2.69e-08,1.86e-08,4.05e-06,131,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.21e-10,8.23e-12,9.02e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,3.64e-10,1.11e-10,5.16e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,29,1.17e-09,6.33e-10,1.56e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,9349,3.79e-07,3.83e-07,3.46e-05,1239,0,0,0,0,0,0,0,0,3,12of12inf


In [75]:
neg_list = set(mcd[mcd['Negative'] != 0]['Word'])
pos_list = set(mcd[mcd['Positive'] != 0]['Word'])
unc_list= set(mcd[mcd['Uncertainty'] != 0]['Word'])
lit_list = set(mcd[mcd['Litigious'] != 0]['Word'])
stg_list = set(mcd[mcd['Strong_Modal'] != 0]['Word'])
weak_list = set(mcd[mcd['Weak_Modal'] != 0]['Word'])
ctr_list = set(mcd[mcd['Constraining'] != 0]['Word'])
Comp_list = set(mcd[mcd['Complexity'] != 0]['Word'])

### count sentiment increment

In [25]:
# A Function to Construct a Sentiment Variable Using a Lexicon-Based Approach
def sentiment_score(text, sen_list):
    temp_list = []
    for t in text:
        if isinstance(t, str):
            temp = 0
            for w in sen_list:
                temp += t.count(w)
            if len(t) != 0:
                temp_list.append(temp/len(t))
            else:
                temp_list.append(0)
        else:
            temp_list.append(0)
    return temp_list

In [38]:
sen_df = pd.DataFrame(df['item7']).copy()
sen_df['Pos_Dic'] = sentiment_score(df['item7'], pos_list)
sen_df['Neg_Dic'] = sentiment_score(df['item7'], neg_list)
sen_df['Anti_Dic'] = sentiment_score(df['item7'], anti_list)
sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic
0,organic sale decreased compared prior ye...,0.003957,0.012075,0.018113
1,decrease retail sale lower mall traffic ...,0.006885,0.009103,0.019084
2,statement involve known unknown risk uncertain...,0.003248,0.011573,0.01973
3,actual result differ materially anticipated fo...,0.004487,0.008776,0.017798
4,subsequent closing p transaction bofi began of...,0.003611,0.009907,0.019913


In [84]:
sen_df['pos_anti_increment'] = (sen_df['Pos_Dic'] + sen_df['Anti_Dic'])/ sen_df['Anti_Dic']
sen_df['neg_anti_increment'] = (sen_df['Anti_Dic'] - sen_df['Neg_Dic'])/ sen_df['Anti_Dic']
sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_increment,neg_anti_increment,result,posanti_increment
0,organic sale decreased compared prior ye...,0.003957,0.012075,0.018113,1.218487,0.333333,0.333333,-1.23
1,decrease retail sale lower mall traffic ...,0.006885,0.009103,0.019084,1.360775,0.523002,0.523002,-7.6
2,statement involve known unknown risk uncertain...,0.003248,0.011573,0.01973,1.164634,0.413415,0.413415,-1.37
3,actual result differ materially anticipated fo...,0.004487,0.008776,0.017798,1.252078,0.506925,0.506925,-3.15
4,subsequent closing p transaction bofi began of...,0.003611,0.009907,0.019913,1.181364,0.502496,0.502496,-2.16


In [47]:
sen_df['result'] = sen_df.apply(lambda row: row['pos_anti_increment'] if row['Pos_Dic'] > row['Neg_Dic'] else row['neg_anti_increment'], axis=1)
sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_increment,neg_anti_increment,result
0,organic sale decreased compared prior ye...,0.003957,0.012075,0.018113,1.218487,0.333333,0.333333
1,decrease retail sale lower mall traffic ...,0.006885,0.009103,0.019084,1.360775,0.523002,0.523002
2,statement involve known unknown risk uncertain...,0.003248,0.011573,0.01973,1.164634,0.413415,0.413415
3,actual result differ materially anticipated fo...,0.004487,0.008776,0.017798,1.252078,0.506925,0.506925
4,subsequent closing p transaction bofi began of...,0.003611,0.009907,0.019913,1.181364,0.502496,0.502496


### 前期回报与语气增量的影响

In [85]:
X = np.array(df['pre_alpha']).reshape(-1, 1)
y = np.array(sen_df['result']).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

y_train = np.array(y_train)
model.fit(X_train, y_train)

In [86]:
import statsmodels.api as sm
# Coefficient
coefficient = model.coef_[0]
print("Coefficient:", coefficient)

# Predict on the test set
y_pred = model.predict(X_test)

# MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# R2
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# P-value using statsmodels
X_train_sm = sm.add_constant(X_train)  # Add a constant term to X2_train
model_sm = sm.OLS(y_train, X_train_sm)
results = model_sm.fit()
p_value = results.pvalues[1]
print("P-value:", p_value)

Coefficient: [-0.02588891]
Mean Squared Error: 0.006231867386899484
R-squared: -0.0037744388457545686
P-value: 0.04193058219296836


前期回报情况不好的可能会对未来更积极

### 不稳定因素与预期增量的影响

In [97]:
df2 = pd.DataFrame(sen_df['result']).copy()
df2['unc_Dic'] = sentiment_score(df['item7'], unc_list)
df2['stg_Dic'] = sentiment_score(df['item7'], stg_list)
df2['weak_Dic'] = sentiment_score(df['item7'], weak_list)

df2['lit_Dic'] = sentiment_score(df['item7'], lit_list)
df2['ctr_Dic'] = sentiment_score(df['item7'], ctr_list)

df2['unc_risk'] = df2['unc_Dic'] + df2['weak_Dic'] - df2['stg_Dic']
df2['lit_risk'] = df2['lit_Dic'] + df2['ctr_Dic']


In [98]:
df2.head()

Unnamed: 0,result,unc_Dic,stg_Dic,weak_Dic,lit_Dic,ctr_Dic,unc_risk,lit_risk
0,0.333333,0.0023,0.000507,0.000169,0.002418,0.002554,0.001962,0.004972
1,0.523002,0.003882,0.000647,0.000277,0.001756,0.003928,0.003512,0.005684
2,0.413415,0.00332,7.2e-05,0.000289,0.004307,0.004379,0.003537,0.008686
3,0.506925,0.00212,9.9e-05,0.0,0.002416,0.00281,0.002021,0.005226
4,0.502496,0.003711,0.000166,0.00063,0.007256,0.00159,0.004175,0.008846


In [99]:
# Split the dataset into training and test sets
features = df2.loc[:, 'unc_Dic':'lit_risk']
X = features
y = df2['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
import statsmodels.api as sm

results = []

# Iterate over each feature and evaluate the linear regression model
for feature in features:
    # Create the linear regression model
    model = sm.OLS(y_train, sm.add_constant(X_train[[feature]]))
    results_single = model.fit()

    # Predict on the test set
    X_test_const = sm.add_constant(X_test[[feature]])
    y_pred = results_single.predict(X_test_const)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Get the regression coefficient and p-value
    coef = results_single.params[1]
    p_value = results_single.pvalues[1]

    # Append the results to the list
    results.append({'Feature': feature,'Coefficient': coef, 'P-value': p_value, 'MSE': mse, 'R2 Score': r2})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by=['P-value', 'R2 Score'], ascending=[True, True])
print(sorted_results)

    Feature  Coefficient       P-value       MSE  R2 Score
2  weak_Dic    88.785241  1.293601e-13  0.006221 -0.002046
5  unc_risk     6.488153  1.318133e-04  0.006266 -0.009239
1   stg_Dic   -29.633936  5.604604e-03  0.006232 -0.003721
0   unc_Dic     4.677905  1.110867e-02  0.006255 -0.007431
4   ctr_Dic    -4.402386  8.224329e-02  0.006227 -0.002931
6  lit_risk    -1.869038  2.288508e-01  0.006207  0.000311
3   lit_Dic    -0.374207  8.552431e-01  0.006223 -0.002411


弱语气，不稳定风险对期望增量有正向影响显著，强语气反而会弱化期望增量