# Valence analysis

In [1]:
import pandas as pd
import nltk

from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import json
import pickle
from keybert import KeyBERT
import numpy as np


from datetime import datetime
from datetime import timezone
from itertools import chain
import itertools

# Shared WordNetLemmatizer instance; lemmatizer() reads this module-level name.
wordnet_lemmatizer = WordNetLemmatizer()
from nrclex import NRCLex

import matplotlib.pyplot as plt

from pyplutchik import plutchik

In [4]:
# Load the per-topic frames produced by the topic-modeling step.
# NOTE(review): unpickling executes whatever the file contains; only load
# pickles that come from a trusted source.
with open("../models/topic_modeling_posts.pickle", 'rb') as input_file:
    topic_modeling_ec = pd.read_pickle(input_file)

# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../data/VAD_Lexicon_Valence.json', encoding="utf-8") as json_file:
    lexicon = json.load(json_file)

with open('../data/acronym_list.json', encoding="utf-8") as json_file:
    acronym_list = json.load(json_file)

with open('../data/contractions_dict.json', encoding="utf-8") as json_file:
    contractions_dict = json.load(json_file)

# Keys of topic_modeling_ec that valence_analysis() iterates over.
topics = ["minority", "politics", "guncontrol"]

# The CSV's first row is treated as a header and its single column is
# renamed to "stops".
stopwords = pd.read_csv("stop_words_eng.csv", header = 0, names = ["stops"])

stopwords_list = stopwords.stops.to_list()
# Extra tokens to ignore on top of the stop-word file.
stopwords_list += ["andnbsp", "wanna", "didn t", "didnt"]

# NOTE(review): not referenced anywhere in the visible code; outputs are
# currently written to the working directory. Confirm whether this is needed.
src_results = "/results/reddit_results/"

In [None]:
def extract_vocabulary(text_list):
    """Extract lemmatized KeyBERT keywords for every entry of *text_list*.

    Each entry is passed to ``KeyBERT.extract_keywords``; the first item of
    every returned keyword entry is run through ``lemmatizer``.

    Returns a list with one list of lemmatized keywords per input entry
    (an empty list when no keywords were returned for that entry).

    A new KeyBERT model is instantiated on every call.
    """
    extractor = KeyBERT()
    return [
        [lemmatizer(entry[0]) for entry in extractor.extract_keywords(document)]
        for document in text_list
    ]
    
def lemmatizer(text):
    """Lemmatize *text* and drop one-character tokens.

    The text is tokenized with ``nltk.word_tokenize``, every token is
    lemmatized with the module-level ``wordnet_lemmatizer``, and the lemmas
    are re-joined with single spaces. Pieces of length one (after splitting
    the re-joined string on whitespace) are then discarded.

    Returns the remaining pieces joined by single spaces.
    """
    lemmas = (wordnet_lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    rejoined = " ".join(lemmas)
    kept = [piece for piece in rejoined.split() if len(piece) > 1]
    return " ".join(kept)

def extract_valence(lexicon, text):
    """Return the average lexicon score of the scored words in *text*.

    *text* is split on whitespace. Words found in the module-level
    ``stopwords_list`` are skipped; a remaining word that is a key of the
    module-level ``acronym_list`` is replaced by its mapped value before the
    lookup. Every word present in *lexicon* contributes ``lexicon[word]``.

    Parameters:
        lexicon: mapping from word to numeric score.
        text: whitespace-separated string to score.

    Returns:
        The sum of the matched scores divided by the number of matched
        words, or 0 when no word of *text* is found in *lexicon*.
    """
    valence = 0
    n_word = 0
    # split() without an argument also handles tabs, newlines and repeated
    # spaces, which split(" ") would leave glued to or between the words.
    for word in text.split():
        if word in stopwords_list:
            continue
        if word in acronym_list:
            word = acronym_list[word]
        if word in lexicon:
            valence += lexicon[word]
            n_word += 1
    # Average whenever at least one word was scored. Guarding on the sign of
    # the sum would silently turn every negative average into 0.
    return valence / n_word if n_word else 0
    

In [None]:
def valence_analysis():
    """Compute a valence score per (EC_val, labels) group for every topic.

    For each name in the module-level ``topics`` list, the matching frame of
    ``topic_modeling_ec`` is processed as follows:

    1. dashes in the ``labels`` column are replaced by spaces (written back
       to the loaded frame) and rows with an empty ``clean_text`` are dropped;
    2. keywords are extracted for every remaining post with
       ``extract_vocabulary`` and stored, space-joined, in ``Vocabulary``;
    3. posts are grouped by ``EC_val`` and ``labels``; text and vocabulary
       are concatenated per group and the group size is stored in ``Volume``;
    4. ``Valence`` is computed from each group's vocabulary with
       ``extract_valence`` and the module-level ``lexicon``;
    5. ``EC_val`` is renamed to ``Cluster`` and the result is written to
       ``valence_post_<topic>.csv`` in the working directory.

    Returns None.

    NOTE(review): the previous version referenced an undefined ``clean_text``
    name and read ``df_plot["Vocabulary"]`` although the aggregation never
    produced that column. This reconstruction assumes keywords are meant to
    be extracted per post and then aggregated per group -- confirm.
    """
    for t in topics:
        df_post = topic_modeling_ec[t]
        print("Columns: ", df_post.columns)
        df_post['labels'] = df_post['labels'].str.replace('-', ' ')
        print("Extracting vocabulary")
        # Copy the filtered rows so that adding a column below does not write
        # into a slice of the loaded frame.
        df_post = df_post[df_post.clean_text != ''].copy()

        # Each text is wrapped in a one-element list before being handed to
        # extract_vocabulary, as in the original code.
        clean_text_list = [[el] for el in df_post.clean_text.values]
        vocabulary = extract_vocabulary(clean_text_list)

        # One space-separated keyword string per post, so the groups can be
        # concatenated with " ".join below.
        df_post['Vocabulary'] = [' '.join(map(str, keywords)) for keywords in vocabulary]

        df_plot = df_post.groupby(["EC_val", "labels"]).agg(
            clean_text=("clean_text", " ".join),
            Vocabulary=("Vocabulary", " ".join),
            Volume=("labels", "size"),
        ).reset_index()

        df_plot["Valence"] = df_plot["Vocabulary"].map(lambda x: extract_valence(lexicon, x))

        # columns= is required: a bare mapping is applied to the index labels
        # and would leave the "EC_val" column name untouched.
        df_plot = df_plot.rename(columns={"EC_val": "Cluster"})
        df_plot.to_csv(f"valence_post_{t}.csv", index=False)
            




In [5]:
# Bind the "minority" entry of topic_modeling_ec to df.
df = topic_modeling_ec["minority"]