In [1]:
import json
import numpy as np

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('sentiwordnet')

from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import custom_lexicons.senticnet.senticnet as sentic

import numpy as np 
import pandas as pd
import data_reader
import results_analyser
import itertools

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\phoec\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\phoec\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\phoec\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


In [2]:
# Root folder holding the third-party lexicon files; the paths below are relative to it.
FILE_DIR = "./custom_lexicons/"

NTUSD_FILEPATH = "ntusd/NTUSD_Fin_word_v1.0.json"
STOCKTWITLEXI_FILEPATH = "stocktwitlexi/domain_lexicon_raw_norm.csv"
SENTI_DD_FILEPATH = "sentidd/sentidd_data1.csv"
LM_FILEPATH = "sentidd/LM_Word_List.csv"

# AFINN scorer with the package's default settings.
afinn = Afinn()

# VADER analyser; its "compound" output is used in individual_scoring.
analyzer = SentimentIntensityAnalyzer()

# NTUSD-Fin: the JSON file is a list of records; build a token -> market_sentiment lookup.
# NOTE(review): the file is opened with the platform default encoding — confirm
# that is correct for this JSON on every machine the notebook runs on.
with open(FILE_DIR+NTUSD_FILEPATH, "r") as f:
    data = f.read()
    NTUSD = json.loads(data)
word_sent_dict = {}
for i in range(len(NTUSD)):
    word_sent_dict[NTUSD[i]["token"]] = NTUSD[i]["market_sentiment"]
    
# StockTwitLexi: header-less CSV; first column is the index (the term), and
# column 1 is kept as the value, giving a term -> score dict.
stocktwitlexi = pd.read_csv(FILE_DIR+STOCKTWITLEXI_FILEPATH, header=None, index_col=0)
stocktwitlexi = stocktwitlexi.to_dict()[1]

# Senti-DD: (entity, directional_word) -> sentiment label, plus the LM word list
# as word -> label. Both are consumed by senti_dd_polarity.
# NOTE(review): sentidd_data1.csv is used for all three datasets scored in this
# notebook — confirm that a single dictionary is intended.
sentidd = pd.read_csv(FILE_DIR+SENTI_DD_FILEPATH)
sentidd_dict = dict(zip(zip(sentidd.entity, sentidd.directional_word), sentidd.sentiment))
lm_df = pd.read_csv(FILE_DIR+LM_FILEPATH)
lm_dict = dict(zip(lm_df.word, lm_df.label))

# Shared stemmer / lemmatizer instances used by the scoring functions.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def remove_stopwords(data):
    """Split every sentence in ``data`` into tokens on single spaces.

    Despite the name, no stop words are filtered out: the function only
    tokenises. Consecutive spaces therefore produce empty-string tokens.

    Parameters
    ----------
    data : iterable of str
        Sentences to tokenise.

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order.
    """
    tokenised = []
    for sentence in data:
        tokenised.append(sentence.split(' '))
    return tokenised

def get_wordnet_tag(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: ``J`` -> ``wn.ADJ``,
    ``N`` -> ``wn.NOUN``, ``R`` -> ``wn.ADV``, ``V`` -> ``wn.VERB``.

    Parameters
    ----------
    tag : str
        POS tag as produced by ``nltk.pos_tag``.

    Returns
    -------
    The corresponding ``wn`` constant, or ``None`` when the tag starts with
    none of the four letters.
    """
    # Attribute names are resolved lazily so only the matching constant is read.
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr)
    return None

In [3]:
def standardise_scores(score, count):
    """Average an accumulated ``score`` over ``count`` contributions.

    Parameters
    ----------
    score : number
        Sum of the individual scores.
    count : number
        How many items contributed to ``score``.

    Returns
    -------
    ``score / count`` when ``count`` is positive; otherwise ``score`` is
    returned unchanged, which avoids a division by zero.
    """
    return score / count if count > 0 else score

def normalise_scores(pred_raw, lexicon):
    """Rescale one lexicon's raw scores towards a common [-1, 1] range.

    Parameters
    ----------
    pred_raw : list of number
        Raw per-text scores.
    lexicon : str
        One of "senticnet", "ntusd", "sentiwordnet", "stocktwitlexi",
        "afinn", "vader" or "sentidd".

    Returns
    -------
    ``pred_raw`` itself (same object) for lexicons that need no rescaling,
    a new list of divided scores for the others, and ``None`` for an
    unrecognised lexicon name.

    NOTE(review): the original comment gives the NTUSD range as -3.81 to 1.22;
    dividing that by 2.5 yields about -1.52 to 0.49, not [-1, 1]. Confirm the
    divisor is the intended one.
    """
    # (lexicon name, divisor); a divisor of None means "already in range".
    scale_by_lexicon = (
        ("senticnet", None),     # -1 to 1
        ("ntusd", 2.5),          # -3.81 to 1.22, range is 5 so /2.5
        ("sentiwordnet", None),  # -1 to 1 since pos-neg
        ("stocktwitlexi", None), # -1 to 1
        ("afinn", 5),            # -5 to 5
        ("vader", None),         # -1 to 1
        ("sentidd", 2),          # -2 to 2
    )
    for name, divisor in scale_by_lexicon:
        if lexicon == name:
            if divisor is None:
                return pred_raw
            return [pred / divisor for pred in pred_raw]
    return None

In [4]:
def senti_dd_polarity(text, sentidd_dict, lm_dict):
    """Score ``text`` with the LM word list, adjusted by Senti-DD context pairs.

    The LM score is the mean of +1/-1 votes over tokens labelled "positive" /
    "negative" in ``lm_dict``. The context score is computed the same way over
    every (lemmatised token, stemmed token) pair found in ``sentidd_dict``.
    The LM score is then shifted by +1 when the context score is positive and
    by -1 when it is negative, so the result lies in [-2, 2].

    Parameters
    ----------
    text : str
        Raw text; tokenised with ``word_tokenize``.
    sentidd_dict : dict
        Maps ``(entity, directional_word)`` tuples to a sentiment label.
    lm_dict : dict
        Maps a word to a sentiment label.

    Returns
    -------
    number
        The adjusted polarity score (``0`` when nothing matched).
    """
    def mean_polarity(labels):
        """Average +1/-1 votes; labels other than "positive"/"negative" are ignored."""
        score = 0
        count = 0
        for label in labels:
            if label == "positive":
                score += 1
                count += 1
            elif label == "negative":
                score -= 1
                count += 1
        return score/count if count > 0 else score

    def lm_score(tokens, lm_dict):
        """Mean LM polarity of the tokens; unknown tokens contribute nothing."""
        # dict.get replaces the former bare ``except:`` around a failed lookup,
        # so only missing keys are tolerated and real errors still surface.
        return mean_polarity(lm_dict.get(token) for token in tokens)

    def senti_dd_score(tokens, sentidd_dict):
        """Mean polarity over all (lemma, stem) token pairs present in ``sentidd_dict``."""
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # Every lemma is tried as the entity against every stem as the
        # directional word, matching the (entity, directional_word) key layout.
        return mean_polarity(
            sentidd_dict.get((lemmatized_token, stemmed_token))
            for stemmed_token in stemmed_tokens
            for lemmatized_token in lemmatized_tokens
        )

    # Tokenise once and share the result between both scorers.
    tokens = word_tokenize(text)
    score = lm_score(tokens, lm_dict)
    context_sentiment_score = senti_dd_score(tokens, sentidd_dict)
    if context_sentiment_score > 0:
        score += 1
    elif context_sentiment_score < 0:
        score -= 1

    return score

In [5]:
def individual_scoring(X, y_class):
    """Score every tokenised text in ``X`` with each of the seven lexicons.

    SenticNet, NTUSD-Fin, SentiWordNet, StockTwitLexi and AFINN are scored
    token by token and averaged with ``standardise_scores``; VADER and
    Senti-DD are applied to the space-joined text as a whole. Each lexicon's
    scores are finally rescaled with ``normalise_scores``.

    Parameters
    ----------
    X : iterable of list of str
        Tokenised texts; each item is passed to ``nltk.pos_tag`` and
        ``" ".join``.
    y_class : sequence
        Ground-truth labels, stored as the ``actual_class`` column.

    Returns
    -------
    pandas.DataFrame
        One row per text with the columns ``senticnet``, ``ntusd``,
        ``sentiwordnet``, ``afinn``, ``vader``, ``stocktwitlexi``,
        ``sentidd`` and ``actual_class`` (in that order).
    """
    sentic_pred_raw = []
    ntusd_pred_raw = []
    sentiwordnet_pred_raw = []
    stocktwitlexi_pred_raw = []
    afinn_pred_raw = []
    vader_pred_raw = []
    sentidd_pred_raw = []

    for test_tweet in X:
        sentic_score = 0
        ntusd_score = 0
        sentiwordnet_score = 0
        stocktwitlexi_score = 0
        afinn_score = 0

        sentic_count = 0
        ntusd_count = 0
        sentiwordnet_count = 0
        stocktwitlexi_count = 0
        afinn_count = 0

        # pos_tag already yields (word, tag) pairs, so they are unpacked
        # directly instead of being round-tripped through a NumPy array.
        for word, treebank_tag in nltk.pos_tag(test_tweet):
            wn_tag = get_wordnet_tag(treebank_tag)

            # SenticNet: element 7 of the entry is used as the score
            # (presumably the polarity value — confirm against the lexicon).
            # Only lookup-shaped failures are tolerated; anything else surfaces.
            try:
                sentic_score += sentic.senticnet[word][7]
                sentic_count += 1
            except (KeyError, IndexError, TypeError):
                pass

            # NTUSD-Fin: only tokens present in the lexicon are counted.
            if word in word_sent_dict:
                ntusd_score += word_sent_dict[word]
                ntusd_count += 1

            # SentiWordNet: positive minus negative score of the first synset
            # of the lemma for this part of speech.
            if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
                lemma = lemmatizer.lemmatize(word, pos=wn_tag)
                if lemma:
                    synsets = wn.synsets(lemma, pos=wn_tag)
                    if synsets:
                        swn_synset = swn.senti_synset(synsets[0].name())
                        sentiwordnet_score += swn_synset.pos_score() - swn_synset.neg_score()
                        sentiwordnet_count += 1

            # StockTwitLexi: only tokens present in the lexicon are counted.
            if word in stocktwitlexi:
                stocktwitlexi_score += stocktwitlexi[word]
                stocktwitlexi_count += 1

            # AFINN.
            # NOTE(review): Afinn.score appears to return 0.0 for unknown words
            # rather than raise, so every token is counted here and the average
            # is taken over the whole text, unlike the lexicons above which
            # average over matched tokens only. Kept as-is so results do not
            # change — confirm this is intended.
            afinn_score += afinn.score(word)
            afinn_count += 1

        # VADER and Senti-DD work on the whole text rather than on tokens.
        s = " ".join(test_tweet)

        # VADER: the compound score is used as-is.
        vader_output = analyzer.polarity_scores(s)
        vader_score = vader_output["compound"]

        # Senti-DD
        sentidd_score = senti_dd_polarity(s, sentidd_dict, lm_dict)

        # Per-token lexicons are averaged; VADER and Senti-DD need no averaging.
        sentic_pred_raw.append(standardise_scores(sentic_score, sentic_count))
        ntusd_pred_raw.append(standardise_scores(ntusd_score, ntusd_count))
        sentiwordnet_pred_raw.append(standardise_scores(sentiwordnet_score, sentiwordnet_count))
        stocktwitlexi_pred_raw.append(standardise_scores(stocktwitlexi_score, stocktwitlexi_count))
        afinn_pred_raw.append(standardise_scores(afinn_score, afinn_count))
        vader_pred_raw.append(vader_score)
        sentidd_pred_raw.append(sentidd_score)

    sentic_pred = normalise_scores(sentic_pred_raw, "senticnet")
    ntusd_pred = normalise_scores(ntusd_pred_raw, "ntusd")
    sentiwordnet_pred = normalise_scores(sentiwordnet_pred_raw, "sentiwordnet")
    stocktwitlexi_pred = normalise_scores(stocktwitlexi_pred_raw, "stocktwitlexi")
    afinn_pred = normalise_scores(afinn_pred_raw, "afinn")
    vader_pred = normalise_scores(vader_pred_raw, "vader")
    sentidd_pred = normalise_scores(sentidd_pred_raw, "sentidd")

    # Columns are added one at a time to keep the established column order.
    combined_df = pd.DataFrame()
    combined_df['senticnet'] = sentic_pred
    combined_df['ntusd'] = ntusd_pred
    combined_df['sentiwordnet'] = sentiwordnet_pred
    combined_df['afinn'] = afinn_pred
    combined_df['vader'] = vader_pred
    combined_df['stocktwitlexi'] = stocktwitlexi_pred
    combined_df['sentidd'] = sentidd_pred
    combined_df['actual_class'] = y_class

    return combined_df

In [6]:
def combine_voting_leave2soft(row):
    """Soft vote that leaves out the single lowest and single highest score.

    Also records which column supplied the lowest / highest value by
    incrementing the module-level ``lowest_dict`` and ``highest_dict``
    counters. Both must exist and contain a key for every label in ``row``
    before this function is called.

    The trimmed slice is ``iloc[1:-1]`` so exactly one value is dropped at each
    end whatever the number of lexicons. The previous fixed ``iloc[1:4]`` was
    only correct for five values; with seven it discarded the three highest.

    Parameters
    ----------
    row : pandas.Series
        Normalised scores of one text, indexed by lexicon name.

    Returns
    -------
    float
        Mean of the scores after removing the minimum and the maximum. With
        fewer than three scores nothing can be trimmed and the plain mean is
        returned.
    """
    ordered = row.sort_values()  # sort once and reuse
    lowest_col = ordered.idxmin()
    highest_col = ordered.idxmax()
    lowest_dict[lowest_col] += 1
    highest_dict[highest_col] += 1
    if len(ordered) < 3:
        return ordered.mean()
    return ordered.iloc[1:-1].mean()

def combine_voting_soft(row):
    """Soft vote: the mean of all lexicon scores in ``row``.

    Parameters
    ----------
    row : pandas.Series
        Normalised scores of one text, indexed by lexicon name.

    Returns
    -------
    float
        Mean of the values, computed on the ascending-sorted series.
    """
    ordered = row.sort_values()
    return ordered.mean()

def combine_voting_hard(row):
    """Hard (majority) vote over the lexicon scores in ``row``.

    A score above zero is a vote for the positive class; a score of zero or
    below is a vote against it.

    Parameters
    ----------
    row : pandas.Series
        Normalised scores of one text, indexed by lexicon name.

    Returns
    -------
    int
        ``1`` when positive votes strictly outnumber the rest, else ``0``
        (ties therefore resolve to ``0``).
    """
    votes_for = (row > 0).sum()
    votes_against = (row <= 0).sum()
    if votes_for > votes_against:
        return 1
    return 0

In [7]:
def generate_results(combined_df):
    """Evaluate every non-empty combination of the seven lexicons.

    Combinations are visited by increasing size, in ``itertools.combinations``
    order. For each one, metrics are appended to the results frame through
    ``results_analyser.calculate_metrics`` as follows:

    * one lexicon: its own scores, labelled with the lexicon name;
    * two or more: soft voting (suffix ``_soft``);
    * odd size: additionally hard voting (suffix ``_hard``);
    * odd size of five or more: additionally trimmed soft voting
      (suffix ``_leave2soft``).

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Frame with one column per lexicon plus ``actual_class``.

    Returns
    -------
    The results object built up by ``results_analyser.calculate_metrics``,
    starting from an empty DataFrame.
    """
    lexicons = ["senticnet", "ntusd", "sentiwordnet", "stocktwitlexi", "afinn", "vader", "sentidd"]

    actual = combined_df['actual_class']
    results_df = pd.DataFrame()

    for size in range(1, len(lexicons)+1):
        for combo in itertools.combinations(lexicons, size):
            combination = list(combo)
            label = '_'.join(combination)
            curr_df = combined_df[combination]

            # A single lexicon is evaluated on its own scores, without voting.
            if size <= 1:
                pred_class = results_analyser.probability_to_class(combined_df[combination[0]])
                results_df = results_analyser.calculate_metrics(results_df, actual, pred_class, label)
                continue

            voting_soft = curr_df.apply(combine_voting_soft, axis=1)
            pred_class = results_analyser.probability_to_class(voting_soft)
            results_df = results_analyser.calculate_metrics(results_df, actual, pred_class, label+"_soft")

            is_odd = size % 2 == 1
            if is_odd:
                # Hard voting already yields class labels, so no thresholding step.
                voting_hard = curr_df.apply(combine_voting_hard, axis=1)
                results_df = results_analyser.calculate_metrics(results_df, actual, voting_hard, label+"_hard")

            if is_odd and size >= 5:
                voting_leave2soft = curr_df.apply(combine_voting_leave2soft, axis=1)
                pred_class = results_analyser.probability_to_class(voting_leave2soft)
                results_df = results_analyser.calculate_metrics(results_df, actual, pred_class, label+"_leave2soft")

    return results_df

## Data 1

In [8]:
# Load dataset 1: texts and their class labels.
# "list" presumably requests tokenised (list-of-words) texts, which is what
# individual_scoring's pos_tag / join calls need — confirm in data_reader.
data1_X, data1_y_class = data_reader.read_data1("list")

In [9]:
# Normalised per-lexicon scores for dataset 1, one row per text, plus actual_class.
combined1_df = individual_scoring(data1_X, data1_y_class)
combined1_df

Unnamed: 0,senticnet,ntusd,sentiwordnet,afinn,vader,stocktwitlexi,sentidd,actual_class
0,0.8300,-0.383587,0.000000,0.000000,0.0000,0.070687,-0.5,0
1,0.6160,0.242586,0.000000,0.060000,0.5267,0.137161,-0.5,1
2,-0.0750,-0.021231,0.000000,0.040000,0.4497,0.094244,0.0,1
3,0.1825,0.031240,0.022727,0.000000,0.6310,0.104018,0.0,1
4,0.8100,0.129776,0.145833,0.040000,0.4404,0.083150,0.5,1
...,...,...,...,...,...,...,...,...
2018,0.9065,-0.052211,0.041667,0.000000,0.0000,0.128948,0.0,1
2019,-0.0895,-0.236600,0.125000,0.111111,0.7717,0.098817,0.0,0
2020,0.8090,0.127908,0.000000,0.000000,0.0000,0.097468,0.0,1
2021,0.5555,-0.015682,-0.062500,0.000000,0.0000,0.082139,0.0,1


In [10]:
# Reset the global counters that combine_voting_leave2soft increments.
# They must be defined before generate_results runs. The keys include
# 'actual_class', which is never a voting column and therefore stays at 0.
lowest_dict = {key: 0 for key in combined1_df.columns}
highest_dict = {key: 0 for key in combined1_df.columns}

In [11]:
# Evaluate all lexicon combinations on dataset 1.
# The printed counters accumulate over every combination that uses leave2soft
# voting (odd-sized combinations of five or more lexicons).
result1_df = generate_results(combined1_df)
print(lowest_dict)
print(highest_dict)
display(result1_df)

{'senticnet': 6619, 'ntusd': 6304, 'sentiwordnet': 8541, 'afinn': 7789, 'vader': 5433, 'stocktwitlexi': 1559, 'sentidd': 8261, 'actual_class': 0}
{'senticnet': 16871, 'ntusd': 6507, 'sentiwordnet': 1983, 'afinn': 1013, 'vader': 7395, 'stocktwitlexi': 7576, 'sentidd': 3161, 'actual_class': 0}


Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.618389,senticnet,0.715968,0.691542,0.742182
1,0.709343,ntusd,0.782383,0.759885,0.806255
2,0.566485,sentiwordnet,0.609006,0.732833,0.520976
3,0.695502,stocktwitlexi,0.805923,0.686527,0.975591
4,0.506179,afinn,0.504218,0.721591,0.387490
...,...,...,...,...,...
201,0.670292,senticnet_sentiwordnet_stocktwitlexi_afinn_vad...,0.760158,0.719048,0.806255
202,0.722689,ntusd_sentiwordnet_stocktwitlexi_afinn_vader_s...,0.794430,0.764457,0.826850
203,0.680178,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.768183,0.724324,0.817696
204,0.611962,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.664099,0.756335,0.591915


## Data 2

In [12]:
# Load dataset 2: texts and their class labels.
# "list" presumably requests tokenised (list-of-words) texts, which is what
# individual_scoring's pos_tag / join calls need — confirm in data_reader.
data2_X, data2_y_class = data_reader.read_data2("list")

In [13]:
# Normalised per-lexicon scores for dataset 2, one row per text, plus actual_class.
combined2_df = individual_scoring(data2_X, data2_y_class)
combined2_df

Unnamed: 0,senticnet,ntusd,sentiwordnet,afinn,vader,stocktwitlexi,sentidd,actual_class
0,0.302600,-0.084072,0.015625,0.038095,0.5859,0.062336,0.5,1
1,-0.126786,0.131360,0.017045,-0.005263,0.2023,0.088886,-0.5,0
2,0.137750,0.026968,-0.013158,-0.024242,-0.7351,0.088475,0.0,1
3,0.857500,0.286495,0.020833,0.018182,0.3818,0.089604,0.0,1
4,0.854500,0.309721,0.015625,0.040000,0.6249,0.116167,0.5,1
...,...,...,...,...,...,...,...,...
870,0.802500,0.152355,-0.050000,0.020000,0.2732,0.088337,0.5,1
871,0.661000,0.088727,0.022727,0.030769,0.5719,0.082826,0.0,1
872,0.850500,0.170936,-0.057692,0.000000,0.0000,0.040192,-0.5,1
873,0.833000,-0.152019,0.041667,0.080000,0.2023,0.049296,0.0,1


In [14]:
# Reset the global counters that combine_voting_leave2soft increments, so the
# tallies printed for dataset 2 do not include counts from an earlier run.
# The keys include 'actual_class', which is never a voting column and stays at 0.
lowest_dict = {key: 0 for key in combined2_df.columns}
highest_dict = {key: 0 for key in combined2_df.columns}

In [15]:
# Evaluate all lexicon combinations on dataset 2.
# The printed counters accumulate over every combination that uses leave2soft
# voting (odd-sized combinations of five or more lexicons).
result2_df = generate_results(combined2_df)
print(lowest_dict)
print(highest_dict)
display(result2_df)

{'senticnet': 1753, 'ntusd': 2551, 'sentiwordnet': 3328, 'afinn': 3449, 'vader': 2736, 'stocktwitlexi': 490, 'sentidd': 4943, 'actual_class': 0}
{'senticnet': 7454, 'ntusd': 1836, 'sentiwordnet': 561, 'afinn': 223, 'vader': 4644, 'stocktwitlexi': 2725, 'sentidd': 1807, 'actual_class': 0}


Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.668571,senticnet,0.760726,0.672993,0.874763
1,0.662857,ntusd,0.739629,0.691419,0.795066
2,0.635429,sentiwordnet,0.691787,0.704724,0.679317
3,0.620571,stocktwitlexi,0.755882,0.617047,0.975332
4,0.717714,afinn,0.740818,0.828638,0.669829
...,...,...,...,...,...
201,0.798857,senticnet_sentiwordnet_stocktwitlexi_afinn_vad...,0.847487,0.779904,0.927894
202,0.817143,ntusd_sentiwordnet_stocktwitlexi_afinn_vader_s...,0.855072,0.818024,0.895636
203,0.800000,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.849268,0.777603,0.935484
204,0.766857,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.813528,0.784832,0.844402


## Data 3

In [16]:
# Load dataset 3: texts and their class labels.
# "list" presumably requests tokenised (list-of-words) texts, which is what
# individual_scoring's pos_tag / join calls need — confirm in data_reader.
data3_X, data3_y_class = data_reader.read_data3("list")

In [17]:
# Normalised per-lexicon scores for dataset 3, one row per text, plus actual_class.
combined3_df = individual_scoring(data3_X, data3_y_class)
combined3_df

Unnamed: 0,senticnet,ntusd,sentiwordnet,afinn,vader,stocktwitlexi,sentidd,actual_class
0,0.931000,-0.061594,0.312500,0.000000,0.0000,0.095693,0.0,1
1,0.303667,-0.191655,0.015625,0.000000,0.2023,0.087639,0.5,0
2,0.020500,-0.300220,0.150000,0.066667,0.0772,-0.183496,0.5,0
3,0.000000,-0.244225,0.013889,-0.080000,-0.6115,0.040343,0.0,0
4,0.358000,0.159665,-0.033333,0.009524,0.0000,0.045528,0.0,0
...,...,...,...,...,...,...,...,...
1156,0.446667,-0.015374,-0.093750,-0.027273,-0.0783,0.095814,-0.5,0
1157,0.024500,-0.425183,-0.062500,0.000000,0.0000,0.088961,0.0,0
1158,0.472500,-0.472243,0.000000,-0.100000,-0.3182,-0.134641,0.0,0
1159,0.879000,-0.260604,0.000000,0.000000,0.0000,0.090518,0.0,0


In [18]:
# Reset the global counters that combine_voting_leave2soft increments, so the
# tallies printed for dataset 3 do not include counts from an earlier run.
# The keys include 'actual_class', which is never a voting column and stays at 0.
lowest_dict = {key: 0 for key in combined3_df.columns}
highest_dict = {key: 0 for key in combined3_df.columns}

In [19]:
# Evaluate all lexicon combinations on dataset 3.
# The printed counters accumulate over every combination that uses leave2soft
# voting (odd-sized combinations of five or more lexicons).
result3_df = generate_results(combined3_df)
print(lowest_dict)
print(highest_dict)
display(result3_df)

{'senticnet': 3888, 'ntusd': 4979, 'sentiwordnet': 4024, 'afinn': 3621, 'vader': 3848, 'stocktwitlexi': 813, 'sentidd': 4369, 'actual_class': 0}
{'senticnet': 9109, 'ntusd': 2946, 'sentiwordnet': 1530, 'afinn': 567, 'vader': 4201, 'stocktwitlexi': 5205, 'sentidd': 1984, 'actual_class': 0}


Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.584841,senticnet,0.668956,0.625964,0.718289
1,0.761413,ntusd,0.790943,0.809892,0.772861
2,0.558140,sentiwordnet,0.585956,0.647059,0.535398
3,0.638243,stocktwitlexi,0.755530,0.624038,0.957227
4,0.538329,afinn,0.508257,0.672330,0.408555
...,...,...,...,...,...
201,0.640827,senticnet_sentiwordnet_stocktwitlexi_afinn_vad...,0.713008,0.668387,0.764012
202,0.703704,ntusd_sentiwordnet_stocktwitlexi_afinn_vader_s...,0.747059,0.744868,0.749263
203,0.666667,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.729937,0.692715,0.771386
204,0.608958,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.622924,0.712928,0.553097


## Save results

In [24]:
# Persist the per-dataset metric tables as CSV files (row index omitted).
# NOTE(review): this assumes the ./results/ directory already exists — confirm,
# or create it before running this cell on a fresh checkout.
RESULTS_FILE_DIR = "./results/"

result1_df.to_csv(RESULTS_FILE_DIR+"result1_df.csv", index=False)
result2_df.to_csv(RESULTS_FILE_DIR+"result2_df.csv", index=False)
result3_df.to_csv(RESULTS_FILE_DIR+"result3_df.csv", index=False)

In [25]:
# Top 20 experiments on dataset 1 by F1 score, best last.
# sort_values on the frame replaces feeding index labels to positional .iloc,
# which is only correct while the index is a default 0..n-1 range.
result1_df.sort_values('F1_score').tail(20)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
78,0.705882,sentiwordnet_stocktwitlexi_afinn_soft,0.80107,0.713095,0.913806
153,0.710826,senticnet_ntusd_stocktwitlexi_afinn_vader_leav...,0.801223,0.722426,0.899314
159,0.720217,senticnet_ntusd_stocktwitlexi_vader_sentidd_le...,0.80265,0.739242,0.877956
60,0.737024,ntusd_sentiwordnet_afinn_soft,0.802817,0.780822,0.826087
31,0.708848,senticnet_ntusd_stocktwitlexi_hard,0.803993,0.713105,0.921434
59,0.724172,ntusd_sentiwordnet_stocktwitlexi_hard,0.804485,0.744005,0.875667
186,0.732575,ntusd_sentiwordnet_stocktwitlexi_vader_sentidd...,0.805605,0.761549,0.855072
3,0.695502,stocktwitlexi,0.805923,0.686527,0.975591
22,0.697973,stocktwitlexi_afinn_soft,0.806829,0.688985,0.973303
68,0.73307,ntusd_stocktwitlexi_vader_soft,0.809725,0.752456,0.87643


In [26]:
# Top 20 experiments on dataset 2 by F1 score, best last.
# sort_values on the frame replaces feeding index labels to positional .iloc,
# which is only correct while the index is a default 0..n-1 range.
result2_df.sort_values('F1_score').tail(20)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
201,0.798857,senticnet_sentiwordnet_stocktwitlexi_afinn_vad...,0.847487,0.779904,0.927894
198,0.797714,senticnet_ntusd_sentiwordnet_stocktwitlexi_vad...,0.847807,0.775157,0.935484
107,0.802286,senticnet_ntusd_vader_sentidd_soft,0.847845,0.790164,0.914611
160,0.802286,senticnet_ntusd_afinn_vader_sentidd_soft,0.847845,0.790164,0.914611
200,0.798857,senticnet_ntusd_stocktwitlexi_afinn_vader_sent...,0.848537,0.776378,0.935484
203,0.8,senticnet_ntusd_sentiwordnet_stocktwitlexi_afi...,0.849268,0.777603,0.935484
199,0.808,senticnet_ntusd_sentiwordnet_afinn_vader_senti...,0.852373,0.793781,0.920304
148,0.808,senticnet_ntusd_sentiwordnet_vader_sentidd_soft,0.852632,0.792822,0.922201
202,0.817143,ntusd_sentiwordnet_stocktwitlexi_afinn_vader_s...,0.855072,0.818024,0.895636
184,0.817143,ntusd_sentiwordnet_stocktwitlexi_vader_sentidd...,0.855335,0.816926,0.897533


In [27]:
# Top 20 experiments on dataset 3 by F1 score, best last.
# sort_values on the frame replaces feeding index labels to positional .iloc,
# which is only correct while the index is a default 0..n-1 range.
result3_df.sort_values('F1_score').tail(20)

Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
138,0.705426,senticnet_ntusd_sentiwordnet_stocktwitlexi_vad...,0.774108,0.700957,0.864307
119,0.726098,ntusd_sentiwordnet_stocktwitlexi_vader_soft,0.774468,0.745902,0.80531
156,0.706288,senticnet_ntusd_stocktwitlexi_afinn_sentidd_le...,0.77462,0.701796,0.864307
183,0.716624,ntusd_sentiwordnet_stocktwitlexi_afinn_sentidd...,0.775733,0.721166,0.839233
59,0.713178,ntusd_sentiwordnet_stocktwitlexi_hard,0.779616,0.707083,0.868732
69,0.728682,ntusd_stocktwitlexi_vader_hard,0.782308,0.736021,0.834808
180,0.724376,ntusd_sentiwordnet_stocktwitlexi_afinn_vader_l...,0.782313,0.72601,0.848083
31,0.704565,senticnet_ntusd_stocktwitlexi_hard,0.782498,0.686318,0.910029
141,0.716624,senticnet_ntusd_sentiwordnet_stocktwitlexi_sen...,0.78341,0.707491,0.877581
15,0.751077,ntusd_afinn_soft,0.784167,0.794251,0.774336
