In [1]:
import spacy
import emoji
import nltk
import sqlite3
import pandas as pd
from enelvo.normaliser import Normaliser
import re
import json
import string
from texthero import preprocessing
import texthero as hero
from liwc import Liwc

In [2]:
# Notebook-wide resources and configuration.
nlp = spacy.load("pt_core_news_sm")

# Fetch the NLTK resources *before* they are first used, so a fresh
# environment does not fail with LookupError on the stopwords lookup below.
nltk.download('punkt')      # tokenizer models needed by word_tokenize
nltk.download('stopwords')  # corpus read on the next line
stopwords = nltk.corpus.stopwords.words('portuguese')
#classes = ['VERB', 'NOUN', 'ADJ']

# Path of the LIWC dictionary file handed to Liwc() further down.
LIWCLocation = 'LIWC2015.dic'

[nltk_data] Downloading package punkt to /home/diego/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_key(classes, my_dict):
    """Return the keys of ``my_dict`` whose value equals one of ``classes``.

    Keys are grouped by class, in the order the classes are given, and within
    a class they follow the dictionary's iteration order. The result is a
    single space-separated string (empty when nothing matches).
    """
    return " ".join(
        name
        for wanted in classes
        for name, label in my_dict.items()
        if wanted == label
    )

In [4]:
# def pos_tagger(text):
#     doc = nlp(text)
#     pos_tagging_dict = {}
#     for token in doc:
#         pos_tagging_dict[token.text] = token.pos_
#     return pos_tagging_dict

In [5]:
# def lemmatize(text):
#     doc = nlp(text)
#     lemma_list = []
#     for token in doc:
#         lemma_list.append(token.lemma_)
#     return " ".join(lemma_list)

In [6]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` (Portuguese)."""
    return nltk.tokenize.word_tokenize(text, language="portuguese")

In [7]:
def remove_stopwords(text):
    """Tokenize ``text`` and rebuild it without the tokens listed in ``stopwords``.

    The comparison is exact (case-sensitive); surviving tokens are joined
    with single spaces.
    """
    kept = []
    for word in tokenize(text):
        if word in stopwords:
            continue
        kept.append(word)
    return " ".join(kept)

In [8]:
def remove_emoji(text):
    """Strip emoji characters from ``text`` and collapse whitespace.

    The text is split on whitespace; tokens without emoji are kept untouched.
    From tokens that do contain emoji only the emoji characters are removed,
    so text glued to an emoji (e.g. ``"gol⚽"``) survives instead of being
    dropped together with it. Tokens left empty are discarded and the rest
    are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text, possibly containing emoji.

    Returns
    -------
    str
        ``text`` without emoji characters, whitespace-normalised.
    """
    # Older emoji releases expose per-language tables as UNICODE_EMOJI; newer
    # ones replaced it with EMOJI_DATA (keyed by emoji string).
    # NOTE(review): confirm the fallback against the installed emoji version.
    per_language = getattr(emoji, "UNICODE_EMOJI", None)
    emoji_table = per_language["pt"] if per_language is not None else emoji.EMOJI_DATA

    # Variation selector-16 and zero-width joiner glue emoji sequences
    # together; once the emoji are gone they would remain as invisible junk.
    invisible = {"\ufe0f", "\u200d"}

    cleaned_tokens = []
    for token in text.split():
        if not any(char in emoji_table for char in token):
            cleaned_tokens.append(token)
            continue
        stripped = "".join(
            char for char in token
            if char not in emoji_table and char not in invisible
        )
        if stripped:
            cleaned_tokens.append(stripped)
    return " ".join(cleaned_tokens)

In [9]:
def remove_tags(text):
    """Remove Twitter ``@mentions`` from ``text``.

    Handles may contain underscores, so ``_`` is part of the matched
    character class; otherwise ``@joao_silva`` would leave ``_silva`` behind.
    Surrounding whitespace is left as is.
    """
    pattern = r"@[a-zA-Z0-9_]+"
    return re.sub(pattern, "", text)

In [10]:
def remove_hashtags(text):
    """Remove ``#hashtags`` from ``text``.

    ``\\w`` is Unicode-aware for ``str`` patterns, so accented Portuguese
    hashtags such as ``#seleção`` are removed whole instead of leaving the
    part after the first non-ASCII letter. Underscores are covered as well.
    Surrounding whitespace is left as is.
    """
    pattern = r"#\w+"
    return re.sub(pattern, "", text)

In [11]:
def remove_quotes(text):
    """Delete every run of straight single or double quotes from ``text``."""
    quote_run = re.compile(r"""['"]+""")
    return quote_run.sub("", text)

In [12]:
def remove_urls(text):
    """Remove URLs (and ``@``-prefixed runs) from ``text``.

    A match starts at ``@``, ``http://``, ``https://`` or ``www.`` and extends
    to the next whitespace character. The ``www`` alternative requires the
    following dot so that ordinary words containing "www" (e.g. "awwww") are
    no longer truncated. The ``@`` alternative is kept for backward
    compatibility with callers that rely on mentions being removed here.
    """
    return re.sub(r"(?:@|https?://|www\.)\S+", "", text)

In [13]:
# Liwc instances keyed by dictionary path, so the dictionary file is parsed
# once instead of once per analysed text.
_liwc_cache = {}


def _get_liwc(path):
    """Return the cached ``Liwc`` for ``path``, building it on first use."""
    if path not in _liwc_cache:
        _liwc_cache[path] = Liwc(path)
    return _liwc_cache[path]


def get_liwc_analysis(text):
    """Tokenize ``text`` and return its LIWC analysis as a plain dict.

    The analyser is built from ``LIWCLocation`` and reused across calls,
    which matters when this function is applied to every row of a DataFrame.
    NOTE(review): assumes ``Liwc.parse`` keeps no state between calls -
    confirm against the liwc package.

    Returns
    -------
    dict
        Whatever ``Liwc.parse`` yields for the tokens, converted with
        ``dict()``; empty when nothing matches.
    """
    liwc = _get_liwc(LIWCLocation)
    tokens = tokenize(text)
    return dict(liwc.parse(tokens))

In [14]:
def check_subjectivity(liwc_analysis):
    """Flag a LIWC analysis as subjective.

    Returns 1 when at least one of the comparison / affect / positive-emotion /
    negative-emotion category labels is present in ``liwc_analysis``,
    otherwise 0.
    """
    subjective_categories = (
        'compare (Comparisons)',
        'affect (Affect)',
        'posemo (Positive Emotions)',
        'negemo (Negative Emotions)',
    )
    found = any(category in liwc_analysis for category in subjective_categories)
    return 1 if found else 0

In [15]:
# def get_sentiment_words_rate(text):
#     senti_dict = {"posemo": 0,
#                   "negemo": 0}
#     tokens = nltk.tokenize.word_tokenize(text, language="portuguese")
#     if len(tokens) > 0:
#         for token in tokens:
#             if token in liwc:
#                 if "posemo" in liwc[token]:
#                     senti_dict["posemo"] += 1
#                 elif "negemo" in liwc[token]:
#                     senti_dict["negemo"] += 1
#         senti_dict["posemo"] = round(senti_dict["posemo"] / len(tokens), 2)
#         senti_dict["negemo"] = round(senti_dict["negemo"] / len(tokens), 2)
#     return senti_dict

In [16]:
# Holds the single Normaliser instance once it has been built, so repeated
# calls (e.g. through Series.apply) do not rebuild it for every text.
_normaliser_cache = []


def normalize(text):
    """Return ``text`` normalised by enelvo's ``Normaliser``.

    The normaliser is created lazily on the first call and reused afterwards.
    NOTE(review): assumes one Normaliser can serve many ``normalise`` calls -
    confirm against the enelvo documentation.
    """
    if not _normaliser_cache:
        _normaliser_cache.append(Normaliser())
    text = _normaliser_cache[0].normalise(text)
    # Disabled clean-up of literal placeholder words, kept for reference:
    #text = text.replace('username', '')
    #text = text.replace('hashtag', '')
    #text = text.replace('number', '')
    #text = text.replace('url', '')
    return text

In [17]:
def preprocess(df):
    """Add a cleaned ``preprocessed_text`` column to ``df`` and return ``df``.

    Each value of ``df.text`` goes through, in order: ``remove_urls``,
    ``remove_tags``, ``remove_hashtags``, ``remove_emoji`` and
    ``remove_quotes``; the result is then passed through the texthero
    pipeline below (fillna, lowercase, brackets, digits, punctuation,
    whitespace).

    Missing texts are replaced by ``""`` *before* the regex-based cleaners
    run: those call ``re.sub`` and would raise on ``None``/``NaN``, and the
    pipeline's own ``fillna`` step comes too late to prevent that.

    Note that ``df`` is modified in place (the column is added to the frame
    that was passed in) as well as returned.
    """
    cleaners = (remove_urls, remove_tags, remove_hashtags, remove_emoji, remove_quotes)

    def _clean(text):
        # Apply every cleaner in sequence to a single text.
        for cleaner in cleaners:
            text = cleaner(text)
        return text

    df['preprocessed_text'] = df.text.fillna("").apply(_clean)

    custom_pipeline = [preprocessing.fillna,
                       preprocessing.lowercase,
                       preprocessing.remove_brackets,
                       preprocessing.remove_digits,
                       preprocessing.remove_punctuation,
                       preprocessing.remove_whitespace]
    df['preprocessed_text'] = df.preprocessed_text.pipe(hero.clean, custom_pipeline)
    return df

In [18]:
# def filter_dataset_with_sentiment_words(df):
#     df["sentiment_words_rate"] = df.preprocessed_text.apply(get_sentiment_words_rate)
#     df["posemo_rate"] = df.sentiment_words_rate.apply(lambda x: x["posemo"])
#     df["negemo_rate"] = df.sentiment_words_rate.apply(lambda x: x["negemo"])
#     df = df[(df.posemo_rate > 0) | (df.negemo_rate > 0)]
#     return df

In [19]:
def main():
    """Load a random sample of tweets, preprocess them and label subjectivity.

    Reads up to 2000 rows (``id_str``, ``text``) from the ``tweets`` table of
    ``tweets.db``, skipping texts that contain ``RT @``. Rows are drawn with
    ``order by random()``, so each run returns a different sample.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id_str`` with columns ``text``, ``preprocessed_text``,
        ``liwc_analysis`` and ``is_subjective``.
    """
    sql = "select id_str, text from tweets where text not like '%RT @%' order by random() limit 2000"
    con = sqlite3.connect("tweets.db")
    try:
        df = pd.read_sql(sql, con=con, index_col="id_str")
    finally:
        # Release the database handle even if the query fails.
        con.close()
    df = preprocess(df)
    df['liwc_analysis'] = df.preprocessed_text.apply(get_liwc_analysis)
    df['is_subjective'] = df.liwc_analysis.apply(check_subjectivity)
    #df = filter_dataset_with_sentiment_words(df)
    return df

In [20]:
# Run the whole pipeline: sample tweets, preprocess, LIWC analysis, subjectivity flag.
df = main()

In [27]:
# Inspect the last rows to compare raw text, cleaned text and LIWC output side by side.
df.tail(20)

Unnamed: 0_level_0,text,preprocessed_text,liwc_analysis,is_subjective
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1365114860355354624,VAMO,vamo,{},0
1363559739808956416,"@SelecaoTalk Even if flamengo win today, they ...",even if flamengo win today they might lose to ...,"{'adj (Adjectives)': 1, 'relativ (Relativity)'...",0
1363617157846863880,https://t.co/B0u9Y6fPAL,,{},0
1363544459850481672,@marinofilho @luizmarquesdias Eu quero que o F...,eu quero que o flamengo se foda meus amigo se ...,"{'function (Function Words)': 6, 'pronoun (Pro...",1
1363595061808795650,@Flamengo SEGUE O LÍDERRRRR https://t.co/yFP2h...,segue o líderrrrr,"{'function (Function Words)': 2, 'auxverb (Aux...",0
1363559646536097793,cabelin na régua e camisa do Flamengo,cabelin na régua e camisa do flamengo,"{'function (Function Words)': 3, 'article (Art...",0
1363603541122809857,@Pedro9oficial @Flamengo Brabissimo 🔴⚫,brabissimo,{},0
1365119513105227776,Flamengo ou cavalo paraguaio ? 🤣,flamengo ou cavalo paraguaio,"{'function (Function Words)': 1, 'conj (Conjun...",1
1364978235331731456,VOCE RECEBEU O DESPERTADOR DA SORTE \n\n⏰⏰⏰⏰⏰⏰...,voce recebeu o despertador da sorte repasse pr...,"{'function (Function Words)': 7, 'pronoun (Pro...",1
1363617280446373891,Te amo @Flamengo ❤️🖤❤️🖤,te amo,"{'function (Function Words)': 1, 'pronoun (Pro...",0


In [23]:
# Class balance of the subjectivity flag (1 = subjective, 0 = not) in this sample.
df.is_subjective.value_counts()

1    1171
0     829
Name: is_subjective, dtype: int64

In [24]:
# to_csv writes plain-text CSV, so the file gets a matching extension; with a
# ".xlsx" name spreadsheet tools try (and fail) to open it as an Excel workbook.
df.to_csv('./data/data.csv')