In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
path = '../document/AnnualReports16_processed2.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share
0,804212.0,airgas inc,20160510.0,ITEM 7. MANAGEMENT S DISCUSSION AND ...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248
1,880460.0,"perfumania holdings, inc.",20160429.0,ITEM 7. MANAGEMENT S DISCUSSION AND ANALY...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746
2,1276591.0,hansen medical inc,20160425.0,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALY...,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817
3,1299969.0,"comstock holding companies, inc.",20160404.0,Item 7. Management s Disc ussion and Analysis...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309
4,12659.0,h&r block inc,20160617.0,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484


### test processing

In [3]:
import nltk
import spacy 
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import WordNetLemmatizer

In [4]:
# Function to count the number of sentences in a text
def count_sentences(text):
    # Handle NaN values by returning 0 sentences
    if pd.isnull(text):
        return 0
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

# Function to delete the first sentence in a text
def delete_first_sentence(text):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) > 2:
        return ' '.join(sentences[2:])
    else:
        return text

# Apply the count_sentences function to the 'item7' column and create a new column 'sentence_count'
df['sentence_count'] = df['item7'].apply(count_sentences)

# Filter the DataFrame to keep only rows with 10 or more sentences in 'item7'
df = df[df['sentence_count'] > 10]

# Delete the first sentence in each text in the 'item7' column
df['item7'] = df['item7'].apply(delete_first_sentence)

df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share,sentence_count
0,804212.0,airgas inc,20160510.0,RESULTS OF OPERATIONS: 2016 COMPARED TO ...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248,401
1,880460.0,"perfumania holdings, inc.",20160429.0,Retail sales decreased 14.8% compared to the p...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746,173
2,1276591.0,hansen medical inc,20160425.0,"In some cases, these statements may be identif...",-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817,296
3,1299969.0,"comstock holding companies, inc.",20160404.0,This discussion and analysis contains forward-...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309,162
4,12659.0,h&r block inc,20160617.0,Subsequent to the closing of the P A Transacti...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484,253


In [5]:
# Simple preprocessing by removing extra lines and lowercasing all text
df['item7'] = df['item7'].replace('\n','', regex=True)
df['item7'] = df['item7'].replace('\r','', regex=True)
df['item7'] = df['item7'].replace('\r','', regex=True)
df['item7'] = df['item7'].replace('[\d.,]+|[^\w\s]', '', regex=True)
df['item7'] = [x.lower() for x in df['item7']]
df['item7'] = df['item7'].replace('item 7.','', regex=True)

# Futher preprocessing by removing all stopwords and lemmatizing all text
documents = []

stemmer = WordNetLemmatizer()

for text in df['item7']:
    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English()

    #  "nlp" Object is used to create documents with linguistic annotations.
    my_doc = nlp(text)

    # Create list of word tokens
    token_list = []
    for token in my_doc:
        token_list.append(token.text)

    # Create list of word tokens after removing stopwords
    filtered_sentence =[] 

    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 

    document = [stemmer.lemmatize(word) for word in filtered_sentence]
    document = ' '.join(document)

    documents.append(document)

df['item7'] = documents
df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share,sentence_count
0,804212.0,airgas inc,20160510.0,result operation compared overvie...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248,401
1,880460.0,"perfumania holdings, inc.",20160429.0,retail sale decreased compared prior year re...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746,173
2,1276591.0,hansen medical inc,20160425.0,case statement identified terminology ...,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817,296
3,1299969.0,"comstock holding companies, inc.",20160404.0,discussion analysis contains forwardlooking st...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309,162
4,12659.0,h&r block inc,20160617.0,subsequent closing p transaction subsidiary su...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484,253


### nrc list

In [6]:
nrc = pd.read_csv('NRC-Emotion-Lexicon.txt', sep = '\t', names = ['term', 'category', 'associated'])
nrc.head()

Unnamed: 0,term,category,associated
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [8]:
# rearrangement
category_list = nrc['category'] .unique().tolist()
filtered_df = nrc[nrc['associated'] == 1]
grouped_df = filtered_df.groupby('category')['term'].apply(list)
grouped_df

category
anger           [abandoned, abandonment, abhor, abhorrent, abo...
anticipation    [abundance, accelerate, accolade, accompanimen...
disgust         [aberration, abhor, abhorrent, abject, abnorma...
fear            [abandon, abandoned, abandonment, abduction, a...
joy             [absolution, abundance, abundant, accolade, ac...
negative        [abandon, abandoned, abandonment, abduction, a...
positive        [abba, ability, abovementioned, absolute, abso...
sadness         [abandon, abandoned, abandonment, abduction, a...
surprise        [abandonment, abduction, abrupt, accident, acc...
trust           [abacus, abbot, absolution, abundance, academi...
Name: term, dtype: object

In [9]:
anti_list = grouped_df.loc['anticipation']
nrc_pos_list = grouped_df.loc['positive']
nrc_neg_list = grouped_df.loc['negative']
ang_list = grouped_df.loc['anger']
anti_list =  grouped_df.loc['anticipation']
dis_list =  grouped_df.loc['disgust']
joy_list = grouped_df.loc['joy']
fear_list = grouped_df.loc['fear']
sad_list =  grouped_df.loc['sadness']
surp_list = grouped_df.loc['surprise']
tru_list =  grouped_df.loc['trust']

### McDonald list

In [10]:
mcd = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2023.csv')
mcd['Word'] = mcd['Word'].str.lower()
mcd.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,aardvark,1,664,2.69e-08,1.86e-08,4.05e-06,131,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.21e-10,8.23e-12,9.02e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,3.64e-10,1.11e-10,5.16e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,29,1.17e-09,6.33e-10,1.56e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,9349,3.79e-07,3.83e-07,3.46e-05,1239,0,0,0,0,0,0,0,0,3,12of12inf


In [11]:
neg_list = set(mcd[mcd['Negative'] != 0]['Word'])
pos_list = set(mcd[mcd['Positive'] != 0]['Word'])
unc_list= set(mcd[mcd['Uncertainty'] != 0]['Word'])
lit_list = set(mcd[mcd['Litigious'] != 0]['Word'])
stg_list = set(mcd[mcd['Strong_Modal'] != 0]['Word'])
weak_list = set(mcd[mcd['Weak_Modal'] != 0]['Word'])
ctr_list = set(mcd[mcd['Constraining'] != 0]['Word'])
Comp_list = set(mcd[mcd['Complexity'] != 0]['Word'])

###  count positive anticipation sentiment **计算积极预期的方法（新）**

In [12]:
# A Function to Construct a Sentiment Variable Using a Lexicon-Based Approach
def sentiment_score(text, sen_list):
    temp_list = []
    for t in text:
        if isinstance(t, str):
            temp = 0
            for w in sen_list:
                temp += t.count(w)
            if len(t) != 0:
                temp_list.append(temp/len(t))
            else:
                temp_list.append(0)
        else:
            temp_list.append(0)
    return temp_list

In [13]:
sen_df = pd.DataFrame(df['item7']).copy()
sen_df['Pos_Dic'] = sentiment_score(df['item7'], pos_list)
sen_df['Neg_Dic'] = sentiment_score(df['item7'], neg_list)
sen_df['Anti_Dic'] = sentiment_score(df['item7'], anti_list)
sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic
0,result operation compared overvie...,0.00394,0.008099,0.0181
1,retail sale decreased compared prior year re...,0.00686,0.01243,0.019106
2,case statement identified terminology ...,0.003257,0.012932,0.019733
3,discussion analysis contains forwardlooking st...,0.004469,0.010215,0.017778
4,subsequent closing p transaction subsidiary su...,0.003619,0.014148,0.019874


In [14]:
sen_df.describe()

Unnamed: 0,Pos_Dic,Neg_Dic,Anti_Dic
count,2881.0,2881.0,2881.0
mean,0.004356,0.009945,0.019027
std,0.001128,0.002125,0.002314
min,0.000755,0.001866,0.00542
25%,0.003573,0.008569,0.017517
50%,0.004228,0.00992,0.019006
75%,0.004978,0.011302,0.020497
max,0.013711,0.027659,0.037185


In [16]:
# List 1: A_P_list - Intersection of A_list and P_list
A_P_list = list(set(anti_list) & set(pos_list))

# List 2: A_N_list - Intersection of A_list and N_list
A_N_list = list(set(anti_list) & set(neg_list))

# List 3: Remaining elements in A_list that do not appear in List 1 and List 2
Neutral_list = [x for x in anti_list if x not in A_P_list and x not in A_N_list]

In [17]:
sen_df['pos_anti_score'] = sentiment_score(df['item7'], A_P_list)
sen_df['neg_anti_score'] = sentiment_score(df['item7'], A_N_list)
sen_df['neutral_anti_score'] = sentiment_score(df['item7'], Neutral_list)


Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_score,neg_anti_score,neutral_anti_score
0,result operation compared overvie...,0.00394,0.008099,0.0181,0.001162,8.4e-05,0.016854
1,retail sale decreased compared prior year re...,0.00686,0.01243,0.019106,0.001059,0.000138,0.017909
2,case statement identified terminology ...,0.003257,0.012932,0.019733,0.000263,0.000216,0.019254
3,discussion analysis contains forwardlooking st...,0.004469,0.010215,0.017778,0.000786,0.000147,0.016845
4,subsequent closing p transaction subsidiary su...,0.003619,0.014148,0.019874,0.000559,0.000329,0.018985


In [19]:
sen_df['final_score'] = sen_df['neutral_anti_score'] + sen_df['pos_anti_score'] - sen_df['neg_anti_score']
sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_score,neg_anti_score,neutral_anti_score,final_score
0,result operation compared overvie...,0.00394,0.008099,0.0181,0.001162,8.4e-05,0.016854,0.017932
1,retail sale decreased compared prior year re...,0.00686,0.01243,0.019106,0.001059,0.000138,0.017909,0.01883
2,case statement identified terminology ...,0.003257,0.012932,0.019733,0.000263,0.000216,0.019254,0.019302
3,discussion analysis contains forwardlooking st...,0.004469,0.010215,0.017778,0.000786,0.000147,0.016845,0.017484
4,subsequent closing p transaction subsidiary su...,0.003619,0.014148,0.019874,0.000559,0.000329,0.018985,0.019216


#### 原增量计算方法（未采纳）

In [84]:
#sen_df['pos_anti_increment'] = (sen_df['Pos_Dic'] + sen_df['Anti_Dic'])/ sen_df['Anti_Dic']
#sen_df['neg_anti_increment'] = (sen_df['Anti_Dic'] - sen_df['Neg_Dic'])/ sen_df['Anti_Dic']
#sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_increment,neg_anti_increment,result,posanti_increment
0,organic sale decreased compared prior ye...,0.003957,0.012075,0.018113,1.218487,0.333333,0.333333,-1.23
1,decrease retail sale lower mall traffic ...,0.006885,0.009103,0.019084,1.360775,0.523002,0.523002,-7.6
2,statement involve known unknown risk uncertain...,0.003248,0.011573,0.01973,1.164634,0.413415,0.413415,-1.37
3,actual result differ materially anticipated fo...,0.004487,0.008776,0.017798,1.252078,0.506925,0.506925,-3.15
4,subsequent closing p transaction bofi began of...,0.003611,0.009907,0.019913,1.181364,0.502496,0.502496,-2.16


In [47]:
#sen_df['result'] = sen_df.apply(lambda row: row['pos_anti_increment'] if row['Pos_Dic'] > row['Neg_Dic'] else row['neg_anti_increment'], axis=1)
#sen_df.head()

Unnamed: 0,item7,Pos_Dic,Neg_Dic,Anti_Dic,pos_anti_increment,neg_anti_increment,result
0,organic sale decreased compared prior ye...,0.003957,0.012075,0.018113,1.218487,0.333333,0.333333
1,decrease retail sale lower mall traffic ...,0.006885,0.009103,0.019084,1.360775,0.523002,0.523002
2,statement involve known unknown risk uncertain...,0.003248,0.011573,0.01973,1.164634,0.413415,0.413415
3,actual result differ materially anticipated fo...,0.004487,0.008776,0.017798,1.252078,0.506925,0.506925
4,subsequent closing p transaction bofi began of...,0.003611,0.009907,0.019913,1.181364,0.502496,0.502496


In [121]:
#sen_df.isnull().sum()

item7                 0
Pos_Dic               0
Neg_Dic               0
Anti_Dic              0
pos_anti_increment    0
neg_anti_increment    0
result                0
posanti_increment     0
dtype: int64

### 前期回报与预期增量的影响

In [20]:
X = np.array(df['pre_alpha']).reshape(-1, 1)
y = np.array(sen_df['final_score']).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

y_train = np.array(y_train)
model.fit(X_train, y_train)

In [21]:
import statsmodels.api as sm
# Coefficient
coefficient = model.coef_[0]
print("Coefficient:", coefficient)

# Predict on the test set
y_pred = model.predict(X_test)

# MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# R2
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# P-value using statsmodels
X_train_sm = sm.add_constant(X_train)  # Add a constant term to X2_train
model_sm = sm.OLS(y_train, X_train_sm)
results = model_sm.fit()
p_value = results.pvalues[1]
print("P-value:", p_value)

Coefficient: [-0.00085832]
Mean Squared Error: 5.579389081910729e-06
R-squared: -0.001255950548003204
P-value: 0.0033075586621173314


前期回报情况不好的可能会对未来更积极

### 不稳定因素与预期增量的影响

In [23]:
df2 = pd.DataFrame(sen_df['final_score']).copy()
df2['unc_Dic'] = sentiment_score(df['item7'], unc_list)
df2['stg_Dic'] = sentiment_score(df['item7'], stg_list)
df2['weak_Dic'] = sentiment_score(df['item7'], weak_list)

df2['lit_Dic'] = sentiment_score(df['item7'], lit_list)
df2['ctr_Dic'] = sentiment_score(df['item7'], ctr_list)

df2['unc_risk'] = df2['unc_Dic'] + df2['weak_Dic'] - df2['stg_Dic']
df2['lit_risk'] = df2['lit_Dic'] + df2['ctr_Dic']


In [24]:
df2.head()

Unnamed: 0,final_score,unc_Dic,stg_Dic,weak_Dic,lit_Dic,ctr_Dic,unc_risk,lit_risk
0,0.017932,0.00229,0.000505,0.000168,0.002408,0.002542,0.001953,0.00495
1,0.01883,0.003867,0.000645,0.000276,0.001749,0.003913,0.003499,0.005663
2,0.019302,0.003401,7.2e-05,0.000287,0.004359,0.004359,0.003616,0.008717
3,0.017484,0.002259,9.8e-05,4.9e-05,0.002406,0.002799,0.00221,0.005206
4,0.019216,0.003685,0.000165,0.000625,0.007403,0.001711,0.004146,0.009114


不稳定风险和诉讼风险提高指向现状不满，与积极预期语气

### p2

In [28]:
df = pd.concat([df,sen_df['final_score']],axis = 1)
df.head()

Unnamed: 0,cik,company_name,filed_date,item7,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,log_share,sentence_count,final_score
0,804212.0,airgas inc,20160510.0,result operation compared overvie...,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,1.137248,401,0.017932
1,880460.0,"perfumania holdings, inc.",20160429.0,retail sale decreased compared prior year re...,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,-2.741746,173,0.01883
2,1276591.0,hansen medical inc,20160425.0,case statement identified terminology ...,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,0.852817,296,0.019302
3,1299969.0,"comstock holding companies, inc.",20160404.0,discussion analysis contains forwardlooking st...,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,0.403309,162,0.017484
4,12659.0,h&r block inc,20160617.0,subsequent closing p transaction subsidiary su...,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,1.339484,253,0.019216


In [38]:
df_clean = df.dropna()