In [2]:
import emoji
import nltk
import sqlite3
import pandas as pd
import re
import json
import ast
import string
from liwc import Liwc

In [4]:
#nlp = spacy.load("pt_core_news_sm")
# Fetch the NLTK resources before they are used so this cell also runs on a
# fresh environment; nltk reports "already up-to-date" when they are present.
nltk.download('stopwords')  # needed by nltk.corpus.stopwords below
nltk.download('punkt')      # needed by nltk.tokenize.word_tokenize
# Portuguese stop-word list used by remove_stopwords().
stopwords = nltk.corpus.stopwords.words('portuguese')
#classes = ['VERB', 'NOUN', 'ADJ']
# Path of the LIWC dictionary file handed to Liwc() in get_liwc_analysis().
LIWCLocation = 'LIWC2015.dic'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# function to return key for any value
def get_key(classes, my_dict):
    """Collect the keys of ``my_dict`` whose value equals one of ``classes``.

    Keys are grouped following the order of ``classes`` and, inside each
    group, the iteration order of ``my_dict``. A key is emitted once for every
    entry of ``classes`` it matches.

    Returns:
        The matching keys joined by single spaces ("" when nothing matches).
    """
    matched = [
        name
        for wanted in classes
        for name, label in my_dict.items()
        if wanted == label
    ]
    return " ".join(matched)

In [8]:
# def pos_tagger(text):
#     doc = nlp(text)
#     pos_tagging_dict = {}
#     for token in doc:
#         pos_tagging_dict[token.text] = token.pos_
#     return pos_tagging_dict

In [10]:
# def lemmatize(text):
#     doc = nlp(text)
#     lemma_list = []
#     for token in doc:
#         lemma_list.append(token.lemma_)
#     return " ".join(lemma_list)

In [12]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` (language "portuguese")."""
    return nltk.tokenize.word_tokenize(text, language="portuguese")

In [14]:
def remove_stopwords(text):
    """Tokenize ``text`` and drop every token found in the ``stopwords`` list.

    The comparison is exact (case-sensitive). The surviving tokens are
    re-joined with single spaces.
    """
    kept = []
    for word in tokenize(text):
        if word in stopwords:
            continue
        kept.append(word)
    return " ".join(kept)

In [16]:
def remove_emoji(text):
    """Drop every whitespace-separated word of ``text`` that contains an emoji.

    A word is discarded entirely when any of its characters is a key of the
    emoji lookup table; the remaining words are re-joined with single spaces
    (so runs of whitespace are collapsed as a side effect).

    The lookup table is ``emoji.EMOJI_DATA`` when the installed ``emoji``
    package provides it, otherwise the legacy ``emoji.UNICODE_EMOJI["pt"]``
    table, so the function works whether or not ``UNICODE_EMOJI`` exists.
    """
    emoji_table = getattr(emoji, "EMOJI_DATA", None)
    if emoji_table is None:
        # Older emoji releases: per-language table, as originally used here.
        emoji_table = emoji.UNICODE_EMOJI["pt"]

    # Only single characters are looked up, mirroring the original check.
    emoji_chars = {char for char in text if char in emoji_table}
    return ' '.join(word for word in text.split() if emoji_chars.isdisjoint(word))

In [18]:
def remove_tags(text):
    """Remove @-mentions from ``text``.

    A mention is an ``@`` followed by one or more ASCII letters, digits or
    underscores. The underscore is included so that handles such as
    ``@joao_silva`` are removed whole instead of leaving ``_silva`` behind.
    Surrounding whitespace is left untouched.
    """
    pattern = r"@[a-zA-Z0-9_]+"
    return re.sub(pattern, "", text)

In [20]:
def remove_hashtags(text):
    """Remove hashtags from ``text``.

    A hashtag is a ``#`` followed by one or more word characters. ``\\w`` is
    Unicode-aware for ``str`` patterns, so accented tags such as
    ``#eleições`` and tags containing underscores are removed whole rather
    than being cut at the first non-ASCII character. A lone ``#`` is kept.
    """
    pattern = r"#\w+"
    return re.sub(pattern, "", text)

In [22]:
def remove_quotes(text):
    """Delete every single-quote (') and double-quote (") character from ``text``."""
    quote_runs = re.compile(r"""['"]+""")
    return quote_runs.sub("", text)

In [24]:
def remove_urls(text):
    """Remove URLs (and @-prefixed tokens) from ``text``.

    A token is removed, up to the next whitespace, when it starts with:

    * ``http://`` or ``https://``
    * ``www.`` at the start of a word
    * ``@`` (kept from the original pattern, so mentions are stripped here too)

    ``www`` must begin a word and be followed by a dot; otherwise ordinary
    words containing "www" (e.g. "awwww") would be partially deleted.
    """
    return re.sub(r"(?:@|https?://|\bwww\.)\S+", "", text)

In [26]:
# Liwc instances keyed by dictionary path, so the file is loaded once per path.
_LIWC_PARSERS = {}


def _get_liwc_parser(dictionary_path):
    """Return the ``Liwc`` instance for ``dictionary_path``, creating it on first use."""
    if dictionary_path not in _LIWC_PARSERS:
        _LIWC_PARSERS[dictionary_path] = Liwc(dictionary_path)
    return _LIWC_PARSERS[dictionary_path]


def get_liwc_analysis(text):
    """Tokenize ``text`` and return its LIWC parse result as a plain ``dict``.

    The ``Liwc`` object built from ``LIWCLocation`` is reused across calls
    instead of being rebuilt for every text, which matters when this function
    is applied row by row to a DataFrame.
    NOTE(review): assumes ``Liwc.parse`` keeps no per-call state on the
    instance — confirm against the liwc package.
    """
    liwc = _get_liwc_parser(LIWCLocation)
    return dict(liwc.parse(tokenize(text)))

In [28]:
def check_subjectivity(liwc_analysis):
    """Return 1 when ``liwc_analysis`` contains any subjective LIWC category, else 0.

    The categories checked are comparisons, affect, positive emotions and
    negative emotions; only membership is tested, not the associated value.
    """
    subjective_markers = (
        'compare (Comparisons)',
        'affect (Affect)',
        'posemo (Positive Emotions)',
        'negemo (Negative Emotions)',
    )
    if any(marker in liwc_analysis for marker in subjective_markers):
        return 1
    return 0

In [30]:
# def get_sentiment_words_rate(text):
#     senti_dict = {"posemo": 0,
#                   "negemo": 0}
#     tokens = nltk.tokenize.word_tokenize(text, language="portuguese")
#     if len(tokens) > 0:
#         for token in tokens:
#             if token in liwc:
#                 if "posemo" in liwc[token]:
#                     senti_dict["posemo"] += 1
#                 elif "negemo" in liwc[token]:
#                     senti_dict["negemo"] += 1
#         senti_dict["posemo"] = round(senti_dict["posemo"] / len(tokens), 2)
#         senti_dict["negemo"] = round(senti_dict["negemo"] / len(tokens), 2)
#     return senti_dict

In [32]:
def normalize(text):
    """Pass ``text`` through a freshly created ``Normaliser`` and return the result.

    NOTE(review): ``Normaliser`` is neither imported nor defined in the
    visible cells — confirm where it comes from before calling this.
    """
    normalised = Normaliser().normalise(text)
    # Disabled clean-up of placeholder words, kept from the original cell:
    #normalised = normalised.replace('username', '')
    #normalised = normalised.replace('hashtag', '')
    #normalised = normalised.replace('number', '')
    #normalised = normalised.replace('url', '')
    return normalised

In [34]:
def preprocess(df):
    """Add a cleaned ``preprocessed_text`` column to ``df`` and return ``df``.

    Steps, in order: blank out missing texts, strip URLs, @-mentions,
    hashtags, emoji-bearing words and quote characters, then run the texthero
    pipeline (fillna, lowercase, remove brackets/digits/punctuation/extra
    whitespace).

    ``df`` is modified in place; it must have a ``text`` column.
    NOTE(review): ``hero`` and ``preprocessing`` are not imported in the
    visible cells — confirm they are brought into scope elsewhere.
    """
    # The regex cleaners raise on NaN/None, so missing texts are blanked
    # before them rather than only inside the texthero pipeline below.
    cleaned = df.text.fillna("")
    for step in (remove_urls, remove_tags, remove_hashtags,
                 remove_emoji, remove_quotes):
        cleaned = cleaned.apply(step)
    df['preprocessed_text'] = cleaned

    custom_pipeline = [preprocessing.fillna,
                       preprocessing.lowercase,
                       preprocessing.remove_brackets,
                       preprocessing.remove_digits,
                       preprocessing.remove_punctuation,
                       preprocessing.remove_whitespace]
    df['preprocessed_text'] = df.preprocessed_text.pipe(hero.clean, custom_pipeline)
    return df

In [65]:
def create_columns_from_liwc(df):
    """Expand the ``liwc_analysis`` column of ``df`` into one column per category.

    Each ``liwc_analysis`` value may be a mapping or the string form of one
    (e.g. after a CSV round trip); strings are parsed with
    ``ast.literal_eval``, other values are used as they are. This lets the
    function run directly on the output of ``get_liwc_analysis`` and be
    re-run on an already converted frame.

    For every category a column of the same name is added holding the value
    from the mapping, or the string ``'0'`` when the category is absent.
    NOTE(review): the ``'0'`` default is a string while present values come
    straight from the mapping — confirm downstream code expects that mix.

    ``df`` is modified in place and returned.
    """
    classes = ['compare (Comparisons)',
               'affect (Affect)',
               'posemo (Positive Emotions)',
               'negemo (Negative Emotions)']

    def _as_mapping(value):
        # Only text needs parsing; literal_eval rejects non-string objects.
        return ast.literal_eval(value) if isinstance(value, str) else value

    df['liwc_analysis'] = df['liwc_analysis'].apply(_as_mapping)
    for col in classes:
        df[col] = df.liwc_analysis.apply(
            lambda analysis, name=col: analysis.get(name, '0'))
    return df

In [38]:
def main(db_path="tweets.db", limit=2000):
    """Sample tweets from SQLite, preprocess them and flag subjective ones.

    Args:
        db_path: SQLite database file containing a ``tweets`` table with
            ``id_str`` and ``text`` columns.
        limit: Maximum number of tweets to sample.

    Returns:
        A DataFrame indexed by ``id_str`` with the original ``text`` plus the
        ``preprocessed_text``, ``liwc_analysis`` and ``is_subjective`` columns.

    Rows whose text contains "RT @" are excluded. Rows are picked with
    ``order by random()``, so each call may return a different sample.
    """
    sql = ("select id_str, text from tweets "
           "where text not like '%RT @%' order by random() limit ?")
    con = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(sql, con=con, index_col="id_str", params=(limit,))
    finally:
        # Release the database handle even if the query fails.
        con.close()

    df = preprocess(df)
    df['liwc_analysis'] = df.preprocessed_text.apply(get_liwc_analysis)
    df['is_subjective'] = df.liwc_analysis.apply(check_subjectivity)
    #df = filter_dataset_with_sentiment_words(df)
    return df