In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
path = 'D:/study/poly/sem2/MM5427 Textual Analysis in Business/group project/groupcode/AnnualReports16_processed.csv'
dataset = pd.read_csv(path)

In [5]:
lexicon = pd.read_csv('NRC-Emotion-Lexicon.txt', sep = '\t', names = ['term', 'category', 'associated'])
lexicon.head()

Unnamed: 0,term,category,associated
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0


In [6]:
category_list = lexicon['category'].unique().tolist()
category_list

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'negative',
 'positive',
 'sadness',
 'surprise',
 'trust']

In [7]:
filtered_df = lexicon[lexicon['associated'] == 1]
filtered_df.head()

Unnamed: 0,term,category,associated
19,abacus,trust,1
23,abandon,fear,1
25,abandon,negative,1
27,abandon,sadness,1
30,abandoned,anger,1


In [8]:
grouped_df = filtered_df.groupby('category')['term'].apply(list)
grouped_df

category
anger           [abandoned, abandonment, abhor, abhorrent, abo...
anticipation    [abundance, accelerate, accolade, accompanimen...
disgust         [aberration, abhor, abhorrent, abject, abnorma...
fear            [abandon, abandoned, abandonment, abduction, a...
joy             [absolution, abundance, abundant, accolade, ac...
negative        [abandon, abandoned, abandonment, abduction, a...
positive        [abba, ability, abovementioned, absolute, abso...
sadness         [abandon, abandoned, abandonment, abduction, a...
surprise        [abandonment, abduction, abrupt, accident, acc...
trust           [abacus, abbot, absolution, abundance, academi...
Name: term, dtype: object

In [9]:
# A Function to Construct a Sentiment Variable Using a Lexicon-Based Approach
def sentiment_score(text, sen_list):
    temp_list = []
    for t in text:
        if isinstance(t, str):
            temp = 0
            for w in sen_list:
                temp += t.count(w)
            if len(t) != 0:
                temp_list.append(temp/len(t))
            else:
                temp_list.append(0)
        else:
            temp_list.append(0)
    return temp_list

In [10]:
dataset['Pos_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['positive'])
dataset['Neg_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['negative'])
dataset['Ang_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['anger'])
dataset['Anti_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['anticipation'])
dataset['Dis_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['disgust'])
dataset['Fear_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['fear'])
dataset['Joy_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['joy'])
dataset['Sad_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['sadness'])
dataset['Surp_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['surprise'])
dataset['Tru_Dic'] = sentiment_score(dataset['processed_text'], grouped_df.loc['trust'])
dataset.head()

Unnamed: 0,cik,company_name,filed_date,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,...,Pos_Dic,Neg_Dic,Ang_Dic,Anti_Dic,Dis_Dic,Fear_Dic,Joy_Dic,Sad_Dic,Surp_Dic,Tru_Dic
0,804212.0,airgas inc,20160510.0,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,...,0.043997,0.042388,0.018216,0.017884,0.013985,0.023342,0.011912,0.01941,0.005342,0.024586
1,880460.0,"perfumania holdings, inc.",20160429.0,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,...,0.045721,0.038646,0.015415,0.018472,0.012664,0.021266,0.008777,0.018079,0.005721,0.025633
2,1276591.0,hansen medical inc,20160425.0,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,...,0.045874,0.037581,0.016762,0.018314,0.010997,0.024323,0.010709,0.016341,0.004457,0.02787
3,1299969.0,"comstock holding companies, inc.",20160404.0,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,...,0.043749,0.033826,0.014072,0.016462,0.009968,0.018176,0.008479,0.014523,0.004014,0.023408
4,12659.0,h&r block inc,20160617.0,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,...,0.041072,0.042139,0.018355,0.019077,0.013021,0.025729,0.00957,0.021587,0.006056,0.02667


In [11]:
dataset['Sent_Dic_pos_surp'] = (dataset['Pos_Dic'] + dataset['Anti_Dic'] + dataset['Joy_Dic'] + dataset['Surp_Dic'] + dataset['Tru_Dic'] 
- dataset['Neg_Dic'] - dataset['Ang_Dic'] - dataset['Dis_Dic'] - dataset['Fear_Dic'] - dataset['Sad_Dic'] )

dataset['Sent_Dic_neg_surp'] = (dataset['Pos_Dic'] + dataset['Anti_Dic'] + dataset['Joy_Dic']  + dataset['Tru_Dic'] -  dataset['Surp_Dic']
- dataset['Neg_Dic'] - dataset['Ang_Dic'] - dataset['Dis_Dic'] - dataset['Fear_Dic'] - dataset['Sad_Dic'] )
dataset.head()

Unnamed: 0,cik,company_name,filed_date,market_abnormal_return,nasdq,market_value,btm,pre_alpha,pre_rmse,InstOwn_Perc,...,Ang_Dic,Anti_Dic,Dis_Dic,Fear_Dic,Joy_Dic,Sad_Dic,Surp_Dic,Tru_Dic,Sent_Dic_pos_surp,Sent_Dic_neg_surp
0,804212.0,airgas inc,20160510.0,-0.008756,0,9.238859,-1.61556,0.168271,2.084189,0.846428,...,0.018216,0.017884,0.013985,0.023342,0.011912,0.01941,0.005342,0.024586,-0.01362,-0.024304
1,880460.0,"perfumania holdings, inc.",20160429.0,-0.004723,1,3.546929,0.88362,-0.149365,4.478955,0.0974,...,0.015415,0.018472,0.012664,0.021266,0.008777,0.018079,0.005721,0.025633,-0.001747,-0.013188
2,1276591.0,hansen medical inc,20160425.0,-0.007461,1,3.784871,-3.415026,-0.183296,7.257763,0.305207,...,0.016762,0.018314,0.010997,0.024323,0.010709,0.016341,0.004457,0.02787,0.001219,-0.007694
3,1299969.0,"comstock holding companies, inc.",20160404.0,0.005079,1,1.573336,,-0.331217,6.174199,0.123161,...,0.014072,0.016462,0.009968,0.018176,0.008479,0.014523,0.004014,0.023408,0.005548,-0.002481
4,12659.0,h&r block inc,20160617.0,-0.001475,0,8.40364,-5.263362,-0.113956,1.942857,0.95754,...,0.018355,0.019077,0.013021,0.025729,0.00957,0.021587,0.006056,0.02667,-0.018387,-0.030498


In [31]:
# Split the dataset into training and test sets
features = dataset.loc[:, 'Pos_Dic':'Sent_Dic_neg_surp']
X = features
y = dataset['market_abnormal_return']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### single linear regression

In [53]:
import statsmodels.api as sm

results = []

# Iterate over each feature and evaluate the linear regression model
for feature in features:
    # Create the linear regression model
    model = sm.OLS(y_train, sm.add_constant(X_train[[feature]]))
    results_single = model.fit()

    # Predict on the test set
    X_test_const = sm.add_constant(X_test[[feature]])
    y_pred = results_single.predict(X_test_const)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Get the regression coefficient and p-value
    coef = results_single.params[1]
    p_value = results_single.pvalues[1]

    # Append the results to the list
    results.append({'Feature': feature,  'p-value': p_value,'Coefficient': coef, 'MSE': mse, 'R2 Score': r2})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by=['MSE', 'R2 Score'], ascending=[True, False])
print(sorted_results)

              Feature  Coefficient       MSE  R2 Score   p-value
9             Tru_Dic    -0.126047  0.002100  0.002091  0.316646
0             Pos_Dic    -0.086012  0.002100  0.001987  0.277689
7             Sad_Dic    -0.125231  0.002104  0.000062  0.522826
3            Anti_Dic    -0.099257  0.002105 -0.000057  0.586804
6             Joy_Dic    -0.118876  0.002105 -0.000103  0.661537
1             Neg_Dic    -0.026048  0.002105 -0.000160  0.786389
4             Dis_Dic    -0.085267  0.002105 -0.000296  0.756427
5            Fear_Dic    -0.051266  0.002105 -0.000358  0.724840
8            Surp_Dic    -0.642024  0.002105 -0.000396  0.198065
10  Sent_Dic_pos_surp    -0.105382  0.002106 -0.000530  0.159715
11  Sent_Dic_neg_surp    -0.085121  0.002106 -0.000739  0.282012
2             Ang_Dic    -0.019896  0.002106 -0.000755  0.925399


In [32]:
results = []

# Iterate over each feature and evaluate the linear regression model
for feature in features:
    # Create the linear regression model
    model = LinearRegression()
    model.fit(X_train[[feature]], y_train)

    # Predict on the test set
    y_pred = model.predict(X_test[[feature]])

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Get the regression coefficient
    coef = model.coef_[0]
    
    # Append the results to the list
    results.append({'Feature': feature,'Coefficient': coef, 'MSE': mse, 'R2 Score': r2})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by=['MSE', 'R2 Score'], ascending=[True, False])
sorted_results

Unnamed: 0,Feature,Coefficient,MSE,R2 Score
9,Tru_Dic,-0.126047,0.0021,0.002091
0,Pos_Dic,-0.086012,0.0021,0.001987
7,Sad_Dic,-0.125231,0.002104,6.2e-05
3,Anti_Dic,-0.099257,0.002105,-5.7e-05
6,Joy_Dic,-0.118876,0.002105,-0.000103
1,Neg_Dic,-0.026048,0.002105,-0.00016
4,Dis_Dic,-0.085267,0.002105,-0.000296
5,Fear_Dic,-0.051266,0.002105,-0.000358
8,Surp_Dic,-0.642024,0.002105,-0.000396
10,Sent_Dic_pos_surp,-0.105382,0.002106,-0.00053


### multiple linear regresson

In [46]:
new_X_train = X_train.drop(['Sent_Dic_pos_surp', 'Sent_Dic_neg_surp'], axis=1)
new_X_test = X_test.drop(['Sent_Dic_pos_surp', 'Sent_Dic_neg_surp'], axis=1)

In [52]:
import statsmodels.api as sm

# Add a constant column to the new_X_train data frame
X_train_const = sm.add_constant(new_X_train)

# Create and fit the linear regression model using statsmodels
model = sm.OLS(y_train, X_train_const)
results = model.fit()

# Predict on the test set
X_test_const = sm.add_constant(new_X_test)
y_pred = results.predict(X_test_const)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('MSE of multiple linear regression model:', mse)
print('R2 of multiple linear regression model:', r2)

# Get the coefficients and p-values from the results
coefficients = results.params[1:]
p_values = results.pvalues[1:]

# Create a DataFrame with coefficients and p-values
coeff_df = pd.DataFrame({'Coefficient': coefficients, 'p-value': p_values})

# Print the coefficients and p-values
print(coeff_df)

MSE of multiple linear regression model: 0.002118389264266675
R2 of multiple linear regression model: -0.006553544232399622
          Coefficient   p-value
Pos_Dic     -0.484324  0.106901
Neg_Dic      0.520680  0.163170
Ang_Dic      1.226681  0.124691
Anti_Dic     0.496728  0.437862
Dis_Dic      0.560509  0.550583
Fear_Dic    -0.395017  0.425990
Joy_Dic      1.077480  0.193601
Sad_Dic     -1.173209  0.063385
Surp_Dic    -1.772183  0.105159
Tru_Dic     -0.362393  0.380626
