In [1]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
import data_reader
import results_analyser
from tqdm import tqdm

stemmer = PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [2]:
def senti_dd_polarity(text, sentidd_dict, lm_dict):
    """Score the polarity of ``text`` by combining the LM and Senti-DD lexicons.

    The base score is the LM score: (#positive - #negative) divided by the
    number of tokens found in ``lm_dict`` with one of those two labels, or 0
    when no token matches. The Senti-DD context score is computed the same way
    over matching token pairs, but only its sign is used: the base score is
    shifted by +1 when it is positive and by -1 when it is negative.

    Args:
        text: Raw text; it is tokenized with ``word_tokenize``.
        sentidd_dict: Mapping looked up with ``(lemmatized_token, stemmed_token)``
            tuples; values equal to "positive" / "negative" are counted, any
            other value is ignored.
        lm_dict: Mapping looked up with raw tokens; values equal to
            "positive" / "negative" are counted, any other value is ignored.

    Returns:
        A number in [-2, 2]; 0 means neither lexicon produced a signal (or the
        LM matches cancelled out and no context signal was found).
    """
    def label_weight(label):
        # Only these two labels carry polarity; missing entries (None) and any
        # other label contribute nothing.
        if label == "positive":
            return 1
        if label == "negative":
            return -1
        return 0

    def lm_score(tokens, lm_dict):
        count = 0
        score = 0
        for token in tokens:
            # .get() skips unknown tokens without a blanket `except:` that
            # would also hide real errors and swallow KeyboardInterrupt.
            weight = label_weight(lm_dict.get(token))
            if weight:
                score += weight
                count += 1

        return score / count if count > 0 else score

    def senti_dd_score(tokens, sentidd_dict):
        count = 0
        score = 0
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # Every stemmed token is paired with every lemmatized token, so a token
        # can pair with itself and repeated tokens contribute repeatedly.
        for stemmed_token in stemmed_tokens:
            for lemmatized_token in lemmatized_tokens:
                weight = label_weight(
                    sentidd_dict.get((lemmatized_token, stemmed_token))
                )
                if weight:
                    score += weight
                    count += 1
        return score / count if count > 0 else score

    # Tokenize once and share the tokens between both scorers.
    tokens = word_tokenize(text)

    score = lm_score(tokens, lm_dict)
    context_sentiment_score = senti_dd_score(tokens, sentidd_dict)
    if context_sentiment_score > 0:
        score += 1
    elif context_sentiment_score < 0:
        score -= 1

    return score


def sentidd_processor(X, sentidd_dict, lm_dict):
    """Return one Senti-DD polarity score per item of ``X``, in order.

    Each item of ``X`` is joined with single spaces into one string before
    being scored by ``senti_dd_polarity``; progress is reported through
    ``tqdm``.

    Args:
        X: Iterable whose items are iterables of strings (e.g. token lists).
        sentidd_dict: Senti-DD lexicon forwarded to ``senti_dd_polarity``.
        lm_dict: LM lexicon forwarded to ``senti_dd_polarity``.

    Returns:
        A list of scores, one per item of ``X``.
    """
    return [
        senti_dd_polarity(" ".join(tokens), sentidd_dict, lm_dict)
        for tokens in tqdm(X)
    ]

def generate_results(X, y_class, experiment, sentidd_lexicon=None, lm_lexicon=None):
    """Evaluate Senti-DD predictions on one dataset and return its metrics.

    Items whose polarity score is exactly 0 (neutral) are dropped, together
    with their labels, before metrics are computed.

    Args:
        X: Items to score; forwarded to ``sentidd_processor``.
        y_class: Gold labels aligned with ``X``.
        experiment: Label for this run, forwarded to
            ``results_analyser.calculate_metrics``.
        sentidd_lexicon: Senti-DD lexicon to use. When omitted, the notebook-level
            ``sentidd_dict`` is read at call time, so the result then depends on
            which lexicon cell was executed last.
        lm_lexicon: LM lexicon to use. When omitted, the notebook-level
            ``lm_dict`` is read at call time.

    Returns:
        Whatever ``results_analyser.calculate_metrics`` returns when given an
        empty DataFrame to fill (presumably a metrics DataFrame for this
        experiment; verify against ``results_analyser``).
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # their behavior; pass the lexicons explicitly to avoid hidden state.
    if sentidd_lexicon is None:
        sentidd_lexicon = sentidd_dict
    if lm_lexicon is None:
        lm_lexicon = lm_dict

    scores = np.array(sentidd_processor(X, sentidd_lexicon, lm_lexicon))

    # remove neutral: compute the mask once and apply it to labels and scores
    # so the two stay aligned.
    is_polar = scores != 0
    polar_labels = np.array(y_class)[is_polar].tolist()
    polar_scores = scores[is_polar].tolist()

    pred_class = results_analyser.probability_to_class(polar_scores)
    return results_analyser.calculate_metrics(
        pd.DataFrame(), polar_labels, pred_class, experiment
    )

In [3]:
# Directory that holds the lexicon CSV files read by this notebook.
FILE_DIR = "./custom_lexicons/sentidd/"
LM_FILEPATH = f"{FILE_DIR}LM_Word_List.csv"

# LM word list as a word -> label lookup (a later duplicate word overrides an
# earlier one).
lm_df = pd.read_csv(LM_FILEPATH)
lm_dict = dict(zip(lm_df["word"], lm_df["label"]))

## Senti-DD with the `sentidd_data1.csv` lexicon (evaluated on data2 and data3)

In [4]:
# Senti-DD lexicon from the data1 file, keyed by (entity, directional_word)
# and mapping to the sentiment column.
SENTI_DD_FILEPATH = f"{FILE_DIR}sentidd_data1.csv"
sentidd = pd.read_csv(SENTI_DD_FILEPATH)
sentidd_dict = {
    (entity, directional_word): sentiment
    for entity, directional_word, sentiment in zip(
        sentidd["entity"], sentidd["directional_word"], sentidd["sentiment"]
    )
}

In [5]:
# Load both evaluation sets, then stack one metrics row per dataset.
data2_X, data2_y_class = data_reader.read_data2("list")
data3_X, data3_y_class = data_reader.read_data3("list")

pd.concat(
    [
        generate_results(features, labels, experiment_name)
        for features, labels, experiment_name in (
            (data2_X, data2_y_class, "data2"),
            (data3_X, data3_y_class, "data3"),
        )
    ]
)

100%|███████████████████████████████████████████████████████████████████████████████| 875/875 [00:01<00:00, 464.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1161/1161 [00:00<00:00, 1622.52it/s]


Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.851441,data2,0.858947,0.906667,0.816
0,0.59126,data3,0.622328,0.696809,0.562232


## Senti-DD with the `sentidd_data4.csv` lexicon (evaluated on data2 and data3)

In [6]:
# Replace the active Senti-DD lexicon with the one from the data4 file, keyed
# by (entity, directional_word) and mapping to the sentiment column.
SENTI_DD_FILEPATH = f"{FILE_DIR}sentidd_data4.csv"
sentidd = pd.read_csv(SENTI_DD_FILEPATH)
sentidd_dict = {
    (entity, directional_word): sentiment
    for entity, directional_word, sentiment in zip(
        sentidd["entity"], sentidd["directional_word"], sentidd["sentiment"]
    )
}

In [7]:
# Reload both evaluation sets and score them with the lexicon currently held
# in `sentidd_dict`; one metrics row per dataset.
data2_X, data2_y_class = data_reader.read_data2("list")
data3_X, data3_y_class = data_reader.read_data3("list")

pd.concat(
    [
        generate_results(features, labels, experiment_name)
        for features, labels, experiment_name in (
            (data2_X, data2_y_class, "data2"),
            (data3_X, data3_y_class, "data3"),
        )
    ]
)

100%|██████████████████████████████████████████████████████████████████████████████| 875/875 [00:00<00:00, 1396.42it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1161/1161 [00:00<00:00, 1532.50it/s]


Unnamed: 0,Accuracy,Experiment,F1_score,Precision,Recall
0,0.769767,data2,0.765957,0.89011,0.672199
0,0.601852,data3,0.619469,0.686275,0.564516
