In [None]:
!pip install spacy
!pip install numpy
!pip install senticnet
!apt-get update
!apt-get install -y hunspell libhunspell-dev
!pip install hunspell==0.5.5

In [None]:
import spacy
spacy.cli.download("pt_core_news_sm")

In [None]:
import csv, re, os
import numpy as np
import pandas as pd
import spacy
import pt_core_news_sm
import csv
from senticnet.babelsenticnet import BabelSenticNet as SenticNet
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_validate
from spacy.matcher import PhraseMatcher
import hunspell

In [None]:
# Locate the project root as the parent of the directory holding this file.
# Inside a notebook kernel ``__file__`` is not defined, so fall back to the
# current working directory (i.e. the notebook is assumed to be started from
# its own folder, one level below the project root).
try:
    _base_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _base_dir = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(_base_dir, '..'))

# Every lexical resource lives under <root>/feature_extraction/res.
_res_dir = os.path.join(PROJECT_ROOT, 'feature_extraction', 'res')
words_dir = os.path.join(_res_dir, 'palavras.txt')
liwc_dir = os.path.join(_res_dir, 'LIWC')
lexico_path = os.path.join(_res_dir, 'SentiProdBR')
emoticon_dir = os.path.join(_res_dir, 'emoticon_sentiment_lexicon.tsv')
emoji_dir = os.path.join(_res_dir, 'emoji_ranking.csv')
girias_dir = os.path.join(_res_dir, 'girias.txt')
dic_path = os.path.join(_res_dir, 'Dic. Hunspell', 'pt_BR.dic.txt')
aff_path = os.path.join(_res_dir, 'Dic. Hunspell', 'pt_BR.aff.txt')
extract_path = os.path.join(PROJECT_ROOT, 'dataset')

In [None]:
# Ordered labels for the extracted features, grouped by family.
# NOTE(review): presumably the order must match the order in which the
# transformers are combined downstream — confirm against the pipeline cell.
feature_names = [
# Structural features
'qtCharacters',
'qtSentences',
'qtWords',
'qtCapitalizedWords',
'qtCapitalizedChars',
'propCapitalizedWords',
'propCapitalizedChars',
# Part-of-speech features
'POS_adjectives',
'POS_adp',
'POS_adv',
'POS_aux',
'POS_cconj',
'POS_det',
'POS_intj',
'POS_noun',
'POS_num',
'POS_part',
'POS_pron',
'POS_propn',
'POS_punct',
'POS_sconj',
'POS_sym',
'POS_verb',
'POS_comp',
'POS_sup',
'POS_x',
# Syntactic rule features
'SYNT_1',
'SYNT_2',
'SYNT_3',
'SYNT_4',
'SYNT_5',
# Lexicon features
'LEX_Subjective',
'LEX_PropSubjective',
'LEX_Positive',
'LEX_PropPositive',
'LEX_Negative',
'LEX_PropNegative',
'LEX_CountNegativeSentences',
'LEX_CountPositiveSentences',
'LEX_QuotationExclamation',
'LEX_PropQuotationExclamation',
# Concept-based features
'CONC_Pleasantness',
'CONC_AvgPleasantness',
'CONC_Attention',
'CONC_AvgAttention',
'CONC_Sensitivity',
'CONC_AvgSensitivity',
'CONC_Aptitude',
'CONC_AvgAptitude',
'CONC_Polarity',
'CONC_AvgPolarity',
# TWT_* and SBJ_* features (their extractors are not defined in this part of the file)
'TWT_CountElongated',
'TWT_CountExpressions',
'TWT_Negation',
'TWT_EmojiPolarityScore',
'TWT_EmoticonPolarityScore',
'SBJ_CountNE',
# 'SBJ_FutureTense' -- feature removed
'SBJ_CountCorrectWords'
]


In [None]:
class SpacyPreprocessor(BaseEstimator):
    """Pipeline step that parses raw texts with the ``pt_core_news_sm`` spaCy model."""

    def __init__(self):
        # The model is loaded eagerly so other components can reuse ``.nlp``.
        self.nlp = spacy.load('pt_core_news_sm')

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self`` to satisfy the estimator API."""
        return self

    def transform(self, sentences):
        """Parse every text in ``sentences`` and return the results as a list.

        Parsing is delegated to ``nlp.pipe`` using two worker processes.
        """
        parsed_stream = self.nlp.pipe(sentences, n_process=2)
        return list(parsed_stream)


# Structural Features

In [None]:
class CountCharacters(BaseEstimator):
    """Measures the length (``len``) of every input item."""

    def __init__(self):
        self.name = "CHARACTERS"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector holding ``len(item)`` for each item of ``sentences``."""
        lengths = [len(text) for text in sentences]
        return np.array(lengths).reshape(-1, 1)

In [None]:
class CountSentences(BaseEstimator):
    """Counts the sentences that the spaCy model finds in each raw document."""

    def __init__(self):
        self.name = "SENTENCES"
        self.nlp = spacy.load('pt_core_news_sm')

    def fit(self, x=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, documents):
        """Return a column vector with the number of ``.sents`` spans per document.

        Each document is parsed individually with ``self.nlp``.
        """
        sentence_totals = [
            sum(1 for _ in self.nlp(text).sents)
            for text in documents
        ]
        return np.array(sentence_totals).reshape(-1, 1)

In [None]:
class CountWords(BaseEstimator):
    """Counts word-like chunks in each raw text.

    A "word" is a maximal run of characters that are neither whitespace nor
    one of ``! ? , ( ) .``.
    """

    def __init__(self):
        self.name = "WORDS"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector with the number of word chunks per text."""
        word_totals = [
            len(re.findall(r'[^\s!\?,\(\)\.]+', text))
            for text in sentences
        ]
        return np.array(word_totals).reshape(-1, 1)

In [None]:
class CountWordsWithUpperLetter(BaseEstimator):
    """Counts the matches of ``[A-Z]\\w*`` in each raw text.

    Only ASCII capitals start a match, so accented capitals are not
    recognised as the beginning of a capitalized word.
    """

    def __init__(self):
        self.name = "COUNT OF CAPITALIZED WORDS"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector with the number of capitalized chunks per text."""
        capitalized_totals = [
            len(re.findall(r'[A-Z]\w*', text))
            for text in sentences
        ]
        return np.array(capitalized_totals).reshape(-1, 1)

In [None]:
class CountUpperLetters(BaseEstimator):
    """Counts the ASCII uppercase letters (``A``-``Z``) in each raw text."""

    def __init__(self):
        self.name = "COUNT OF CAPITALIZED CHARACTERS"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector with the number of uppercase letters per text."""
        uppercase_totals = []
        for text in sentences:
            uppercase_letters = re.findall(r'([A-Z])', text)
            uppercase_totals.append(len(uppercase_letters))
        return np.array(uppercase_totals).reshape(-1, 1)

In [None]:
class ProportionCapitalizedWords(BaseEstimator):
    """Ratio between capitalized chunks (``[A-Z]\\w*``) and word chunks per text."""

    def __init__(self):
        self.name = "Ratio of words with upper letters to lower letters"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector with ``capitalized / words`` for each text.

        Texts without any capitalized chunk yield ``0``.
        """
        ratios = []
        for text in sentences:
            n_words = len(re.findall(r'[^\s!\?,\(\)\.]+', text))
            n_capitalized = len(re.findall(r'[A-Z]\w*', text))
            if n_capitalized == 0:
                ratios.append(0)
            else:
                ratios.append(n_capitalized / n_words)
        return np.array(ratios).reshape(-1, 1)

In [None]:
class ProportionCapitalizedChars(BaseEstimator):
    """Share of ASCII uppercase letters relative to the full text length."""

    def __init__(self):
        self.name = "Ratio of the uppercase characters to the sentence length"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return a column vector with ``uppercase letters / len(text)`` per text.

        Texts without uppercase letters (including empty ones) yield ``0``.
        """
        ratios = []
        for text in sentences:
            n_upper = len(re.findall(r'([A-Z])', text))
            if n_upper == 0:
                ratios.append(0)
            else:
                ratios.append(n_upper / len(text))
        return np.array(ratios).reshape(-1, 1)

# POS Features

In [None]:
class POS_CountAdj(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``ADJ``."""

    def __init__(self):
        self.name = "ADJ"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``ADJ`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'ADJ') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountAdp(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``ADP``."""

    def __init__(self):
        self.name = "ADP"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``ADP`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'ADP') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountAdv(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``ADV``."""

    def __init__(self):
        self.name = "ADV"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``ADV`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'ADV') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountAux(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``AUX``."""

    def __init__(self):
        self.name = "AUX"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``AUX`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'AUX') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountCconj(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``CCONJ``."""

    def __init__(self):
        self.name = "CCONJ"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``CCONJ`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'CCONJ') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountDet(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``DET``."""

    def __init__(self):
        self.name = "DET"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``DET`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'DET') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountIntj(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``INTJ``."""

    def __init__(self):
        self.name = "INTJ"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``INTJ`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'INTJ') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountNoun(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``NOUN``."""

    def __init__(self):
        self.name = "NOUN"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``NOUN`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'NOUN') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountNum(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``NUM``."""

    def __init__(self):
        self.name = "NUM"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``NUM`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'NUM') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountPart(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``PART``."""

    def __init__(self):
        self.name = "PART"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``PART`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'PART') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountPron(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``PRON``."""

    def __init__(self):
        self.name = "PRON"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``PRON`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'PRON') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountPropn(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``PROPN``."""

    def __init__(self):
        self.name = "PROPN"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``PROPN`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'PROPN') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountPunct(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``PUNCT``."""

    def __init__(self):
        # Was "ADP" (copy-paste slip from POS_CountAdp); the label now matches
        # the tag this transformer actually counts.
        self.name = "PUNCT"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``PUNCT`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'PUNCT') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountSconj(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``SCONJ``."""

    def __init__(self):
        self.name = "SCONJ"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``SCONJ`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'SCONJ') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountSym(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``SYM``."""

    def __init__(self):
        self.name = "SYM"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``SYM`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'SYM') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountVerb(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``VERB``."""

    def __init__(self):
        self.name = "VERB"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``VERB`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'VERB') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

In [None]:
class POS_CountComparatives(BaseEstimator):
    """Counts Portuguese comparative constructions in each parsed document.

    Four surface patterns are recognised (token texts compared in lowercase):

    * equality:     ``tão`` + ADJ + (``quanto`` | ``como`` | ``quão``)
    * superiority:  ``mais`` + ADJ + (``que`` | ``do que``)
    * inferiority:  ``menos`` + ADJ + (``que`` | ``do que``)
    * synthetic:    (``melhor`` | ``pior`` | ``maior`` | ``menor``) + (``que`` | ``do que``)
    """

    def __init__(self):
        self.name = "Contar comparativos"
        self.sinteticos = ['melhor','pior','maior','menor']

    def fit(self, x=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def _introduces_comparison(self, sentence, start):
        """True when ``que`` or ``do que`` begins at position ``start``.

        All index accesses are bounds-checked, so a pattern that ends exactly
        at the last token of the document is still recognised.
        """
        size = len(sentence)
        if start >= size:
            return False
        first = sentence[start].text.lower()
        if first == "que":
            return True
        return (first == "do"
                and start + 1 < size
                and sentence[start + 1].text.lower() == "que")

    def _count_analytic(self, sentence, marker):
        """Count occurrences of ``marker`` + ADJ + (``que`` | ``do que``)."""
        total = 0
        for i in range(len(sentence) - 2):
            if (sentence[i].text.lower() == marker
                    and sentence[i + 1].pos_ == "ADJ"
                    and self._introduces_comparison(sentence, i + 2)):
                total += 1
        return total

    def comparativo_igualdade(self, sentence):
        """Count ``tão`` + ADJ + (``quanto`` | ``como`` | ``quão``)."""
        total = 0
        for i in range(len(sentence) - 2):
            if (sentence[i].text.lower() == "tão"
                    and sentence[i + 1].pos_ == "ADJ"
                    and sentence[i + 2].text.lower() in ("quanto", "como", "quão")):
                total += 1
        return total

    def comparativo_superioridade(self, sentence):
        """Count ``mais`` + ADJ + (``que`` | ``do que``)."""
        return self._count_analytic(sentence, "mais")

    def comparativo_inferioridade(self, sentence):
        """Count ``menos`` + ADJ + (``que`` | ``do que``)."""
        return self._count_analytic(sentence, "menos")

    def comparativo_sintetico(self, sentence):
        """Count a synthetic comparative followed by ``que`` or ``do que``."""
        total = 0
        for i in range(len(sentence) - 1):
            if (sentence[i].text.lower() in self.sinteticos
                    and self._introduces_comparison(sentence, i + 1)):
                total += 1
        return total

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the total comparatives per document.

        Each document must support ``len`` and integer indexing and yield
        tokens exposing ``text`` and ``pos_`` (e.g. parsed spaCy documents).
        """
        list_count = [
            self.comparativo_igualdade(doc)
            + self.comparativo_superioridade(doc)
            + self.comparativo_inferioridade(doc)
            + self.comparativo_sintetico(doc)
            for doc in sentences
        ]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class POS_CountSuperlatives(BaseEstimator):
    """Counts Portuguese superlative constructions in each parsed document.

    Recognised patterns (token texts compared in lowercase):

    * relative:  article [+ NOUN] + (``mais`` | ``menos``) + ADJ
    * absolute analytic:  intensifier + ADJ
    * absolute synthetic: a word ending in one of ``self.sufixos`` followed
      by ``o``/``a``/``os``/``as`` whose lemma is tagged ADJ when parsed alone
    """

    def __init__(self):
        self.name = "Contar superlativos"
        self.nlp = SpacyPreprocessor().nlp
        self.artigos = ['o','a','os','as']
        self.intensificadores = ['muito', 'extremamente', 'completamente', 'totalmente', 'bastante', 'incrivelmente', 'super', 'mega', 'hiper']
        self.sufixos = ['íssim', 'im', 'ílim', 'érrim']

    def _count_relative(self, sentence, marker):
        """Count ``article [NOUN] marker ADJ`` sequences, scanning left to right.

        After a match the scan resumes at the first token following it. The
        previous implementation advanced past the match and then still fell
        through to the next check and the final ``i += 1``, so a token right
        after a match was never tested as the start of the long pattern.
        """
        total, i, size = 0, 0, len(sentence)
        while i < size:
            if sentence[i].text.lower() in self.artigos:
                if (i + 3 < size
                        and sentence[i + 1].pos_ == "NOUN"
                        and sentence[i + 2].text.lower() == marker
                        and sentence[i + 3].pos_ == "ADJ"):
                    total += 1
                    i += 4
                    continue
                if (i + 2 < size
                        and sentence[i + 1].text.lower() == marker
                        and sentence[i + 2].pos_ == "ADJ"):
                    total += 1
                    i += 3
                    continue
            i += 1
        return total

    def superlativo_relativo_inferioridade(self, sentence):
        """Count ``article [NOUN] menos ADJ``."""
        return self._count_relative(sentence, "menos")

    def superlativo_relativo_superioridade(self, sentence):
        """Count ``article [NOUN] mais ADJ``."""
        return self._count_relative(sentence, "mais")

    def superlativo_absoluto_analitico(self, sentence):
        """Count an intensifier immediately followed by an ADJ token."""
        return sum(
            1
            for i in range(len(sentence) - 1)
            if sentence[i].text.lower() in self.intensificadores
            and sentence[i + 1].pos_ == "ADJ"
        )

    def superlativo_absoluto_sintetico(self, sentence):
        """Count tokens carrying a superlative suffix whose lemma parses as ADJ.

        The cheap suffix test runs first so the spaCy model is only invoked
        for candidate tokens instead of for every token of the document.
        """
        endings = tuple(
            sufixo + final
            for sufixo in self.sufixos
            for final in ('o', 'a', 'os', 'as')
        )
        total = 0
        for token in sentence:
            if not token.text.lower().endswith(endings):
                continue
            base = token.lemma_.lower() if token.lemma_ else token.text.lower()
            # Re-parse the lemma in isolation to obtain its own POS tag.
            token_base = next(self.nlp.pipe([base]))[0]
            if token_base.pos_ == "ADJ":
                total += 1
        return total

    def fit(self, x=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the total superlatives per document."""
        list_count = [
            self.superlativo_relativo_inferioridade(doc)
            + self.superlativo_relativo_superioridade(doc)
            + self.superlativo_absoluto_analitico(doc)
            + self.superlativo_absoluto_sintetico(doc)
            for doc in sentences
        ]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class POS_CountX(BaseEstimator):
    """Counts, per parsed document, the tokens whose ``pos_`` is ``X``."""

    def __init__(self):
        self.name = "X"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the ``X`` token count of each document.

        Each item of ``sentences`` must be iterable and yield objects exposing
        ``pos_`` (e.g. parsed spaCy documents).
        """
        counts = [sum(1 for token in doc if token.pos_ == 'X') for doc in sentences]
        if not counts:
            return np.zeros((len(sentences), 1))
        return np.array(counts).reshape(-1, 1)

# Syntactic Rules Features

In [None]:
# ADJ -> NOUN -> *
class SYNT_Rule1(BaseEstimator):
    """Binary flag: does the document contain an ADJ immediately followed by a NOUN?"""

    def __init__(self):
        self.name = "RULE 1"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def __pattern__(self, sentence):
        """Return ``1`` if any adjacent pair of tokens is (ADJ, NOUN), else ``0``."""
        found = any(
            sentence[i].pos_ == "ADJ" and sentence[i + 1].pos_ == "NOUN"
            for i in range(len(sentence) - 1)
        )
        return 1 if found else 0

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the 0/1 flag of each document."""
        flags = [self.__pattern__(doc) for doc in sentences]
        if not flags:
            return np.zeros((len(sentences), 1))
        return np.array(flags).reshape(-1, 1)

In [None]:
# ADV + [<SUP> | <COMP>] -> ADJ -> !NOUN
class SYNT_Rule2(BaseEstimator):
    """Binary flag for the sequence ADV, ADJ, then any token that is not a NOUN."""

    def __init__(self):
        self.name = "RULE 2"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def __pattern__(self, sentence):
        """Return ``1`` if some token triple is (ADV, ADJ, not NOUN), else ``0``.

        The former ``<SUP>``/``<COMP>`` tag alternatives were each and-ed with
        ``pos_ == "ADV"``, so they could never match anything the plain ADV
        test did not; they are dropped without changing the result.
        """
        for i in range(len(sentence) - 2):
            if (sentence[i].pos_ == "ADV"
                    and sentence[i + 1].pos_ == "ADJ"
                    and sentence[i + 2].pos_ != "NOUN"):
                return 1
        return 0

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the 0/1 flag of each document."""
        flags = [self.__pattern__(doc) for doc in sentences]
        if not flags:
            return np.zeros((len(sentences), 1))
        return np.array(flags).reshape(-1, 1)

In [None]:
# ADJ -> ADJ -> !NOUN
class SYNT_Rule3(BaseEstimator):
    """Binary flag for the sequence ADJ, ADJ, then any token that is not a NOUN."""

    def __init__(self):
        self.name = "RULE 3"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def __pattern__(self, sentence):
        """Return ``1`` if some token triple is (ADJ, ADJ, not NOUN), else ``0``."""
        found = any(
            sentence[i].pos_ == "ADJ"
            and sentence[i + 1].pos_ == "ADJ"
            and sentence[i + 2].pos_ != "NOUN"
            for i in range(len(sentence) - 2)
        )
        return 1 if found else 0

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the 0/1 flag of each document."""
        flags = [self.__pattern__(doc) for doc in sentences]
        if not flags:
            return np.zeros((len(sentences), 1))
        return np.array(flags).reshape(-1, 1)

In [None]:
# NOUN -> ADJ -> !NOUN
class SYNT_Rule4(BaseEstimator):
    """Binary flag for the sequence NOUN, ADJ, then any token that is not a NOUN."""

    def __init__(self):
        self.name = "RULE 4"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def __pattern__(self, sentence):
        """Return ``1`` if some token triple is (NOUN, ADJ, not NOUN), else ``0``."""
        found = any(
            sentence[i].pos_ == "NOUN"
            and sentence[i + 1].pos_ == "ADJ"
            and sentence[i + 2].pos_ != "NOUN"
            for i in range(len(sentence) - 2)
        )
        return 1 if found else 0

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the 0/1 flag of each document."""
        flags = [self.__pattern__(doc) for doc in sentences]
        if not flags:
            return np.zeros((len(sentences), 1))
        return np.array(flags).reshape(-1, 1)

In [None]:
# ADV + [<SUP> | <COMP>] -> VERB + [ PCP | GER | PS | IMPF]  -> *
class SYNT_Rule5(BaseEstimator):
    """Binary flag: does the document contain an ADV immediately followed by a VERB?"""

    def __init__(self):
        self.name = "RULE 5"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def __pattern__(self, sentence):
        """Return ``1`` if any adjacent pair of tokens is (ADV, VERB), else ``0``.

        The former ``<PCP>``/``<GER>``/``<PS>``/``<IMPF>`` alternatives tested
        ``sentence[i]`` (already known to be ADV) for ``VERB`` and so were
        unreachable; the ``<SUP>``/``<COMP>`` alternatives were subsumed by
        the plain ADV test. Removing them leaves the result unchanged.
        """
        for i in range(len(sentence) - 1):
            if sentence[i].pos_ == "ADV" and sentence[i + 1].pos_ == "VERB":
                return 1
        return 0

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the 0/1 flag of each document."""
        flags = [self.__pattern__(doc) for doc in sentences]
        if not flags:
            return np.zeros((len(sentences), 1))
        return np.array(flags).reshape(-1, 1)

# Lexicon Features


In [None]:
class LEX_Subjective(BaseEstimator):
    """Counts (or gives the proportion of) tokens found in the LIWC subjective word list.

    The lexicon is a newline-separated text file. An entry ending in ``*`` is
    a prefix pattern; any other entry must equal the lowercased token text.

    Parameters
    ----------
    proportion : bool
        When truthy, the count is divided by the number of tokens in the document.
    """

    # Cache marker: the list object the lookup structures were built from.
    _indexed_lexicon = None
    _exact_terms = frozenset()
    _prefixes = ()

    def __init__(self, proportion):
        # Parenthesised so the suffix is appended in both cases (previously the
        # conditional swallowed it and the name was just "PROP").
        self.name = ("PROP" if proportion else "NUM") + " SUBJECTIVE WORDS"
        # ``liwc_dir`` is the LIWC folder defined in the configuration cell
        # (the formerly referenced ``liwc_path`` does not exist).
        self.file_name = os.path.join(liwc_dir, "LIWC_Portuguese_subjective.txt")
        self.lexicon = []
        self.proportion = proportion

    def __load_lexicon__(self):
        """Read the lexicon file into ``self.lexicon`` (one entry per line)."""
        # NOTE(review): the file is opened with the platform default encoding —
        # confirm it matches the encoding of the LIWC file.
        with open(self.file_name, "r") as f:
            self.lexicon = f.read().split('\n')

    def _lookup(self):
        """Return ``(exact_terms, prefixes)``, rebuilt only when the lexicon changes."""
        if self._indexed_lexicon is not self.lexicon:
            entries = self.lexicon
            self._exact_terms = set(entries)
            self._prefixes = tuple(entry[:-1] for entry in entries if entry.endswith("*"))
            self._indexed_lexicon = entries
        return self._exact_terms, self._prefixes

    def __value__(self, sentence):
        """Return the subjective-token count of ``sentence`` (or its proportion)."""
        exact_terms, prefixes = self._lookup()
        count = 0
        for term in sentence:
            lowered = term.text.lower()
            # ``str.startswith`` accepts a tuple and is False for an empty one.
            if lowered in exact_terms or lowered.startswith(prefixes):
                count += 1
        if self.proportion:
            return count / len(sentence) if len(sentence) > 0 else 0
        return count

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Reload the lexicon and return an ``(n_docs, 1)`` array of values."""
        self.__load_lexicon__()
        list_count = [self.__value__(doc) for doc in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)


In [None]:
class LEX_Positive(BaseEstimator):
    """Counts (or gives the proportion of) tokens listed as positive in a category lexicon.

    The lexicon is the CSV ``<lexico_path>/<categoria>.csv`` and must provide
    the columns ``term`` and ``class``.

    Parameters
    ----------
    proportion : bool
        When truthy, the count is divided by the number of tokens in the document.
    categoria : str
        Base name (without ``.csv``) of the lexicon file to load.
    """

    # Cache marker: the DataFrame the term set was built from.
    _indexed_lexicon = None
    _polar_terms = frozenset()

    def __init__(self, proportion, categoria):
        # Parenthesised so the suffix is appended in both cases.
        self.name = ("PROP" if proportion else "NUM") + " POSITIVE WORDS"
        self.categoria = categoria
        self.lexicon = None
        self.proportion = proportion

    def __load_lexicon__(self):
        """Read the category CSV into ``self.lexicon``."""
        # ``lexico_path`` has no trailing separator, so plain concatenation
        # produced ".../SentiProdBR<categoria>.csv"; join the parts instead.
        self.file_name = os.path.join(lexico_path, f"{self.categoria}.csv")
        self.lexicon = pd.read_csv(self.file_name)

    def _terms(self):
        """Return the set of positive terms, rebuilt only when the lexicon changes."""
        if self.lexicon is None:
            self.__load_lexicon__()
        if self._indexed_lexicon is not self.lexicon:
            frame = self.lexicon
            self._polar_terms = set(frame.loc[frame['class'] == 'positive', 'term'])
            self._indexed_lexicon = frame
        return self._polar_terms

    def __value__(self, doc):
        """Return the positive-token count of ``doc`` (or its proportion).

        Matching uses the token text as-is, i.e. it is case-sensitive.
        """
        positive_terms = self._terms()
        count = sum(1 for term in doc if term.text in positive_terms)
        if self.proportion:
            return count / len(doc) if len(doc) > 0 else 0
        return count

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Reload the lexicon and return an ``(n_docs, 1)`` array of values."""
        self.__load_lexicon__()
        list_count = [self.__value__(doc) for doc in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class LEX_Negative(BaseEstimator):
    """Counts (or gives the proportion of) tokens listed as negative in a category lexicon.

    The lexicon is the CSV ``<lexico_path>/<categoria>.csv`` and must provide
    the columns ``term`` and ``class``.

    Parameters
    ----------
    proportion : bool
        When truthy, the count is divided by the number of tokens in the document.
    categoria : str
        Base name (without ``.csv``) of the lexicon file to load.
    """

    # Cache marker: the DataFrame the term set was built from.
    _indexed_lexicon = None
    _polar_terms = frozenset()

    def __init__(self, proportion, categoria):
        # Parenthesised so the suffix is appended in both cases; the missing
        # space before "NEGATIVE" is restored as well.
        self.name = ("PROP" if proportion else "NUM") + " NEGATIVE WORDS"
        self.categoria = categoria
        self.lexicon = None
        self.proportion = proportion

    def __load_lexicon__(self):
        """Read the category CSV into ``self.lexicon``."""
        # ``lexico_path`` has no trailing separator, so plain concatenation
        # produced ".../SentiProdBR<categoria>.csv"; join the parts instead.
        self.file_name = os.path.join(lexico_path, f"{self.categoria}.csv")
        self.lexicon = pd.read_csv(self.file_name)

    def _terms(self):
        """Return the set of negative terms, rebuilt only when the lexicon changes."""
        if self.lexicon is None:
            self.__load_lexicon__()
        if self._indexed_lexicon is not self.lexicon:
            frame = self.lexicon
            self._polar_terms = set(frame.loc[frame['class'] == 'negative', 'term'])
            self._indexed_lexicon = frame
        return self._polar_terms

    def __value__(self, doc):
        """Return the negative-token count of ``doc`` (or its proportion).

        Matching uses the token text as-is, i.e. it is case-sensitive.
        """
        negative_terms = self._terms()
        count = sum(1 for term in doc if term.text in negative_terms)
        if self.proportion:
            return count / len(doc) if len(doc) > 0 else 0
        return count

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Reload the lexicon and return an ``(n_docs, 1)`` array of values."""
        self.__load_lexicon__()
        list_count = [self.__value__(doc) for doc in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class LEX_CountSentencesByPolarity(BaseEstimator):
    """Counts, per raw document, the sentences dominated by one polarity.

    A sentence counts as positive when it holds more positive than negative
    lexicon tokens, and as negative in the opposite case; ties count for
    neither.

    Parameters
    ----------
    polarity : str
        ``'positive'`` or ``'negative'``; any other value yields zero counts.
    categoria : str
        Lexicon category forwarded to ``LEX_Positive`` / ``LEX_Negative``.
    """

    def __init__(self, polarity, categoria):
        # The literal "+" signs were leftovers from string concatenation.
        self.name = f"NUMBER OF {polarity} SENTENCES"
        self.categoria = categoria
        self.polarity = polarity
        self.nlp = spacy.load('pt_core_news_sm')

    def fit(self, x=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, documents):
        """Return an ``(n_docs, 1)`` array with the matching-sentence count per document."""
        # Build the two counters and read their lexicons once per call rather
        # than re-reading both CSV files for every single sentence.
        positive_counter = LEX_Positive(proportion = False, categoria = self.categoria)
        negative_counter = LEX_Negative(proportion = False, categoria = self.categoria)
        positive_counter.__load_lexicon__()
        negative_counter.__load_lexicon__()

        list_count = []
        for document in documents:
            count = 0
            for sentence in self.nlp(document).sents:
                positive = positive_counter.__value__(sentence)
                negative = negative_counter.__value__(sentence)
                if self.polarity == 'positive' and positive > negative:
                    count += 1
                if self.polarity == 'negative' and negative > positive:
                    count += 1
            list_count.append(count)
        # The fallback used ``len(sentences)``, a name that is undefined when
        # ``documents`` is empty; size it from the input instead.
        if not list_count:
            return np.zeros((len(documents), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class LEX_QuotationExclamation(BaseEstimator):
    """Counts (or gives the proportion of) ``!`` and ``?`` tokens in each parsed document.

    Parameters
    ----------
    proportion : bool
        When truthy, the count is divided by the number of tokens in the document.
    """

    def __init__(self, proportion):
        # Parenthesised so the suffix is appended in both cases; the missing
        # space before "QUOTATION" is restored as well.
        self.name = ("PROP" if proportion else "NUM") + " QUOTATION AND EXCLAMATION MARKS"
        self.proportion = proportion

    def __value__(self, sentence):
        """Return the number (or share) of tokens whose text is ``!`` or ``?``.

        Falsy or whitespace-only documents yield ``0``.
        """
        if not sentence or sentence.text.isspace():
            return 0
        marks = sum(1 for term in sentence if term.text in ["!", "?"])
        if self.proportion:
            return marks / len(sentence)
        return marks

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the value of each document."""
        list_count = [self.__value__(doc) for doc in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

# Concept-based Features



In [None]:
class CONC_Pleasantness(BaseEstimator):
    """Sums (or averages) the SenticNet ``pleasantness`` score of each token.

    Parameters
    ----------
    absolute : bool
        When truthy, each token score is replaced by its absolute value.
    average : bool
        When truthy, the sum is divided by the number of tokens in the document.
    """

    def __init__(self, absolute, average):
        # Parenthesised so the suffix is appended in both cases (previously the
        # conditional swallowed it and the name was just "AVERAGE").
        self.name = ("AVERAGE" if average else "SUM") + " OF PLEASANTNESS SCORES" + (" (ABS)" if absolute else "")
        self.sn = SenticNet('pt')
        self.absolute = absolute
        self.average = average

    def __value__(self, sentence):
        """Return the summed or averaged pleasantness of ``sentence``.

        Tokens whose lookup raises ``KeyError`` contribute nothing. An empty
        document averages to ``0`` instead of raising ``ZeroDivisionError``.
        """
        total = 0
        for word in sentence:
            try:
                score = self.sn.sentics(word.text.lower())['pleasantness']
            except KeyError:
                continue
            total += abs(score) if self.absolute else score
        if self.average:
            return total / len(sentence) if len(sentence) > 0 else 0
        return total

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the value of each document."""
        list_count = [self.__value__(sentence) for sentence in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class CONC_Attention(BaseEstimator):
    """Sums (or averages) the SenticNet ``attention`` score of each token.

    Parameters
    ----------
    absolute : bool
        When truthy, each token score is replaced by its absolute value.
    average : bool
        When truthy, the sum is divided by the number of tokens in the document.
    """

    def __init__(self, absolute, average):
        # Parenthesised so the suffix is appended in both cases (previously the
        # conditional swallowed it and the name was just "AVERAGE").
        self.name = ("AVERAGE" if average else "SUM") + " OF ATTENTION SCORES" + (" (ABS)" if absolute else "")
        self.sn = SenticNet('pt')
        self.absolute = absolute
        self.average = average

    def __value__(self, sentence):
        """Return the summed or averaged attention of ``sentence``.

        Tokens whose lookup raises ``KeyError`` contribute nothing. An empty
        document averages to ``0`` instead of raising ``ZeroDivisionError``.
        """
        total = 0
        for word in sentence:
            try:
                score = self.sn.sentics(word.text.lower())['attention']
            except KeyError:
                continue
            total += abs(score) if self.absolute else score
        if self.average:
            return total / len(sentence) if len(sentence) > 0 else 0
        return total

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the value of each document."""
        list_count = [self.__value__(sentence) for sentence in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class CONC_Sensitivity(BaseEstimator):
    """Sums (or averages) the SenticNet ``sensitivity`` score of each token.

    Parameters
    ----------
    absolute : bool
        When truthy, each token score is replaced by its absolute value.
    average : bool
        When truthy, the sum is divided by the number of tokens in the document.
    """

    def __init__(self, absolute, average):
        # Parenthesised so the suffix is appended in both cases (previously the
        # conditional swallowed it and the name was just "AVERAGE").
        self.name = ("AVERAGE" if average else "SUM") + " OF SENSITIVITY SCORES" + (" (ABS)" if absolute else "")
        self.sn = SenticNet('pt')
        self.absolute = absolute
        self.average = average

    def __value__(self, sentence):
        """Return the summed or averaged sensitivity of ``sentence``.

        Tokens whose lookup raises ``KeyError`` contribute nothing. An empty
        document averages to ``0`` instead of raising ``ZeroDivisionError``.
        """
        total = 0
        for word in sentence:
            try:
                score = self.sn.sentics(word.text.lower())['sensitivity']
            except KeyError:
                continue
            total += abs(score) if self.absolute else score
        if self.average:
            return total / len(sentence) if len(sentence) > 0 else 0
        return total

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return an ``(n_docs, 1)`` array with the value of each document."""
        list_count = [self.__value__(sentence) for sentence in sentences]
        if not list_count:
            return np.zeros((len(sentences), 1))
        return np.array(list_count).reshape(-1, 1)

In [None]:
class CONC_Aptitude(BaseEstimator):
    """Sum or average of the 'aptitude' score over the tokens of each sentence.

    Args:
        absolute: when truthy, each token's score is taken in absolute value.
        average: when truthy, the total is divided by the number of tokens.
    """

    def __init__(self, absolute, average):
        # The conditional is parenthesised so the suffix is appended in both
        # modes. Without the parentheses the averaged variant was named just
        # "AVERAGE".
        self.name = ("AVERAGE" if average else "SUM") + " OF APTITUDE SCORES" + (" (ABS)" if absolute else "")
        self.sn = SenticNet('pt')
        self.absolute = absolute
        self.average = average

    def __value__(self, sentence):
        """Aggregate the 'aptitude' score of every token in ``sentence``.

        ``sentence`` must be a sized iterable of tokens exposing ``.text``
        (presumably a spaCy ``Doc`` -- confirm against the caller).
        An empty sentence yields 0.
        """
        total = 0
        for word in sentence:
            try:
                score = self.sn.sentics(word.text.lower())['aptitude']
            except KeyError:
                # The lookup failed for this token: it contributes nothing.
                continue
            total += abs(score) if self.absolute else score

        if not self.average:
            return total

        token_count = len(sentence)
        # Nothing to average over in an empty sentence; avoid dividing by zero.
        return total / token_count if token_count else 0.0

    def fit(self, X=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return one score per sentence as a column array (``reshape(-1, 1)``)."""
        list_count = [self.__value__(sentence) for sentence in sentences]
        return np.array(list_count).reshape(-1, 1) if list_count else np.zeros((len(sentences), 1))

In [None]:
class CONC_Polarity(BaseEstimator):
    """Sum or average of the polarity value over the tokens of each sentence.

    Args:
        absolute: when truthy, each token's value is taken in absolute value.
        average: when truthy, the total is divided by the number of tokens.
    """

    def __init__(self, absolute, average):
        # The conditional is parenthesised so the suffix is appended in both
        # modes. Without the parentheses the averaged variant was named just
        # "AVERAGE".
        self.name = ("AVERAGE" if average else "SUM") + " OF POLARITY SCORES" + (" (ABS)" if absolute else "")
        self.sn = SenticNet('pt')
        self.absolute = absolute
        self.average = average

    def __value__(self, sentence):
        """Aggregate ``polarity_value`` of every token in ``sentence``.

        ``sentence`` must be a sized iterable of tokens exposing ``.text``
        (presumably a spaCy ``Doc`` -- confirm against the caller).
        An empty sentence yields 0.
        """
        total = 0
        for word in sentence:
            try:
                score = self.sn.polarity_value(word.text.lower())
            except KeyError:
                # The lookup failed for this token: it contributes nothing.
                continue
            total += abs(score) if self.absolute else score

        if not self.average:
            return total

        token_count = len(sentence)
        # Nothing to average over in an empty sentence; avoid dividing by zero.
        return total / token_count if token_count else 0.0

    def fit(self, X=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return one score per sentence as a column array (``reshape(-1, 1)``)."""
        list_count = [self.__value__(sentence) for sentence in sentences]
        return np.array(list_count).reshape(-1, 1) if list_count else np.zeros((len(sentences), 1))

# Twitter

In [None]:
class TWT_CountElongated(BaseEstimator):
    """Counts runs of one letter repeated three or more times in a row (e.g. "looool")."""

    # One ASCII letter followed by at least two more copies of itself.
    # The range ends at `Z`: the previous `A-z` range also accepted the ASCII
    # punctuation located between 'Z' and 'a' ([ \ ] ^ _ `), so "___" or "^^^"
    # were counted as elongated words.
    # NOTE(review): accented letters (e.g. "ééé") are not matched -- confirm
    # whether that is intended for Portuguese text.
    _ELONGATED = re.compile(r"([a-zA-Z])\1{2,}")

    def __init__(self):
        self.name = "COUNT OF ELONGATED WORDS"

    def fit(self, X=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return the number of elongated runs per document as a column array.

        Each item may be a plain string or an iterable of tokens exposing
        ``.text``; token sequences are joined with single spaces before matching.
        """
        list_count = []
        for doc in sentences:
            if not isinstance(doc, str):
                doc = ' '.join([token.text for token in doc])
            list_count.append(len(self._ELONGATED.findall(doc)))
        return np.array(list_count).reshape(-1,1) if list_count else np.zeros((len(sentences), 1))

In [None]:
class TWT_CountExpressions(BaseEstimator):
    """Counts occurrences of slang expressions (gírias) from a lexicon file.

    The lexicon is read from ``girias_dir`` (one expression per line) and
    matched with a spaCy ``PhraseMatcher`` against the lower-cased text.
    """

    def __init__(self):
        self.name = "COUNT EXPRESSIONS (GÍRIAS)"
        self.file_name = girias_dir
        self.lexicon = []
        self.nlp = SpacyPreprocessor().nlp
        self.matcher = PhraseMatcher(self.nlp.vocab)
        # Set once the patterns are registered so that repeated transform()
        # calls do not re-read the file nor add the same patterns again.
        self._lexicon_loaded = False

    def __load_lexicon__(self):
        """Read the lexicon file and register its expressions in the matcher."""
        with open(self.file_name, "r") as f:
            raw_terms = f.read().split('\n')
        # Blank lines (e.g. the one produced by a trailing newline) are dropped.
        self.lexicon = [term.lower() for term in raw_terms if term.strip()]
        # Materialise the generator returned by nlp.pipe: a concrete list of
        # Doc patterns is what gets handed to the matcher.
        patterns = list(self.nlp.pipe(self.lexicon))
        if patterns:
            self.matcher.add("Girias", patterns)
        self._lexicon_loaded = True

    def __value__(self, sentence):
        """Return the number of lexicon matches found in the string ``sentence``."""
        doc = self.nlp(sentence)
        return len(self.matcher(doc))

    def fit(self, x=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return the match count per document as a column array.

        Each item must expose ``.text``; it is lower-cased and parsed again
        with ``self.nlp`` before matching.
        """
        if not self._lexicon_loaded:
            self.__load_lexicon__()
        list_count = [self.__value__(doc.text.lower()) for doc in sentences]
        return np.array(list_count).reshape(-1, 1)

In [None]:
class TWT_Negation(BaseEstimator):
    """Flags documents containing at least one negation word."""

    # Negation forms, including informal abbreviations.
    _NEGATION_WORDS = frozenset(['não', 'nao', 'nem', 'nunca', 'nn', 'n', 'ñ'])

    def __init__(self):
        self.name = "PRESENCE OF NEGATION WORDS"

    def fit(self, X=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return 1 for documents with a negation word, 0 otherwise, as a column array.

        Tokens are lower-cased before the comparison so capitalised forms
        ("Não", "NUNCA") are detected as well.
        """
        list_count = [
            int(any(token.text.lower() in self._NEGATION_WORDS for token in doc))
            for doc in sentences
        ]
        return np.array(list_count).reshape(-1, 1)

In [None]:
class TWT_EmojiPolarityScore(BaseEstimator):
    """Sums a per-emoji score over the tokens of each document.

    Scores come from the CSV at ``emoji_dir``, indexed by its 'char' column;
    ``scoring_type`` selects which column is summed.
    """

    def __init__(self, scoring_type='polarity'):
        self.scoring_type = scoring_type
        self.name = "SUM OF " + scoring_type.upper() + " EMOJIS"
        self.file_name = emoji_dir
        self.emoji_data = None

    def __get_emojis__(self, sentence):
        # Placeholder kept for interface compatibility; it does nothing.
        pass

    def __load_lexicon__(self):
        """(Re)load the emoji table as a ``{char: {column: value}}`` dict."""
        # pandas opens the path itself, so no separate file handle is needed.
        self.emoji_data = pd.read_csv(self.file_name, index_col='char').to_dict('index')

    def __value__(self, sentence):
        """Return the summed score of the tokens of ``sentence`` found in the table."""
        score = 0
        for term in sentence:
            if term.text in self.emoji_data:
                score += self.emoji_data[term.text][self.scoring_type]
        return score

    def fit(self, x=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return one summed score per document as a column array."""
        # Load the table on first use only instead of on every call.
        if self.emoji_data is None:
            self.__load_lexicon__()
        list_count = [self.__value__(sentence) for sentence in sentences]
        return np.array(list_count).reshape(-1, 1)

In [None]:
class TWT_EmoticonPolarityScore(BaseEstimator):
    """Sums the 'sentiment' score of the emoticons found in each document.

    Scores come from the tab-separated file at ``emoticon_dir``, indexed by
    its 'emoticon' column.
    """

    def __init__(self):
        self.name = "SUM OF EMOTICON SCORE"
        self.file_name = emoticon_dir
        self.emoticon_data = None

    def __get_emojis__(self, sentence):
        # Placeholder kept for interface compatibility; it does nothing.
        pass

    def __load_lexicon__(self):
        """(Re)load the emoticon table as a ``{emoticon: {column: value}}`` dict."""
        # pandas opens the path itself, so no separate file handle is needed.
        self.emoticon_data = pd.read_csv(self.file_name, sep='\t', index_col='emoticon').to_dict('index')

    def __value__(self, sentence):
        """Return the summed 'sentiment' of the tokens of ``sentence`` found in the table."""
        score = 0
        for term in sentence:
            if term.text in self.emoticon_data:
                score += self.emoticon_data[term.text]['sentiment']
        return score

    def fit(self, x=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return one summed score per document as a column array."""
        # Load the table on first use only instead of on every call.
        if self.emoticon_data is None:
            self.__load_lexicon__()
        list_count = [self.__value__(sentence) for sentence in sentences]
        return np.array(list_count).reshape(-1, 1)

# Miscellaneous

In [None]:
class SBJ_CountNE(BaseEstimator):
    """Counts the entries of ``doc.ents`` for every document."""

    def __init__(self):
        self.name = "COUNT OF NAMED ENTITIES"

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences):
        """Return the number of entities per document as a single column."""
        entity_counts = [len(parsed.ents) for parsed in sentences]
        if entity_counts:
            return np.array(entity_counts).reshape(-1,1)
        return np.zeros((len(sentences), 1))

In [None]:
# Feature removed from the pipeline. The class below is wrapped in a string
# literal, so it is never defined; it is kept for reference only.
'''
class SBJ_FutureTense(BaseEstimator):
    def __init__(self):
        self.name = "PRESENCE OF FUTURE TENSE VERBS IN THE SENTENCE"

    def fit(self, X=None, y=None):
        return self

    def __is_future(self, tag):
        return 'Tense=Fut' in tag.split('|')

    def transform(self, sentences):
        list_count = []
        for doc in sentences:
            presence = 0
            for token in doc:
                if (token.pos_ == 'VERB' or token.pos_ == 'AUX') and self.__is_future(token.tag_):
                    presence = 1
                    break
            list_count.append(presence)
        return np.array(list_count).reshape(-1, 1) if list_count else np.zeros((len(sentences), 1))
'''

'\nclass SBJ_FutureTense(BaseEstimator):\n    def __init__(self):\n        self.name = "PRESENCE OF FUTURE TENSE VERBS IN THE SENTENCE"\n\n    def fit(self, X=None, y=None):\n        return self\n\n    def __is_future(self, tag):\n        return \'Tense=Fut\' in tag.split(\'|\')\n\n    def transform(self, sentences):\n        list_count = []\n        for doc in sentences:\n            presence = 0\n            for token in doc:\n                if (token.pos_ == \'VERB\' or token.pos_ == \'AUX\') and self.__is_future(token.tag_):\n                    presence = 1\n                    break\n            list_count.append(presence)\n        return np.array(list_count).reshape(-1, 1) if list_count else np.zeros((len(sentences), 1))\n'

In [None]:
class SBJ_CountCorrectWords(BaseEstimator):
    """Counts the alphabetic tokens that Hunspell accepts as correctly spelled.

    The dictionary is built from the module-level ``dic_path`` / ``aff_path``.
    """

    def __init__(self):
        self.name = "COUNT CORRECT WORDS"
        self.hunspell = hunspell.HunSpell(dic_path, aff_path)

    def __value__(self, doc):
        """Return how many tokens of ``doc`` are alphabetic and pass the spell check."""
        count_correct = 0
        for token in doc:
            text = token.text
            # Cheap filter first: punctuation, numbers and emoji never count,
            # so they are not sent to Hunspell at all.
            # NOTE(review): this presumably also avoids encoding errors on
            # symbols the dictionary cannot represent -- confirm.
            if text.isalpha() and self.hunspell.spell(text):
                count_correct += 1
        return count_correct

    def fit(self, x=None, y=None):
        """No-op: returns ``self``."""
        return self

    def transform(self, sentences):
        """Return the count of correct words per document as a column array."""
        list_count = [self.__value__(doc) for doc in sentences]
        return np.array(list_count).reshape(-1, 1) if list_count else np.zeros((len(sentences), 1))


# Extract

In [None]:
# Full feature-extraction pipeline.
# The first seven transformers receive the pipeline input directly; everything
# under 'tree' runs on the output of SpacyPreprocessor.
# The `categoria=None` placeholders are overwritten with `set_params` before
# each category is processed.
pipeline = Pipeline(steps=[
      ('features', FeatureUnion(
          transformer_list=[
              ('qtCharacters', CountCharacters()),
              ('qtSentences', CountSentences()),
              ('qtWords', CountWords()),
              ('qtCapitalizedWords',CountWordsWithUpperLetter()),
              ('qtCapitalizedChars',CountUpperLetters()),
              ('propCapitalizedWords',ProportionCapitalizedWords()),
              ('propCapitalizedChars',ProportionCapitalizedChars()),
              ('tree', Pipeline([
                ('spacy', SpacyPreprocessor()),
                ('tree_features', FeatureUnion(
                    transformer_list=[
                      ('POS_adjectives', POS_CountAdj()),
                      ('POS_adp', POS_CountAdp()),
                      ('POS_adv', POS_CountAdv()),
                      ('POS_aux', POS_CountAux()),
                      ('POS_cconj', POS_CountCconj()),
                      ('POS_det', POS_CountDet()),
                      ('POS_intj', POS_CountIntj()),
                      ('POS_noun', POS_CountNoun()),
                      ('POS_num', POS_CountNum()),
                      ('POS_part', POS_CountPart()),
                      ('POS_pron', POS_CountPron()),
                      ('POS_propn', POS_CountPropn()),
                      ('POS_punct', POS_CountPunct()),
                      ('POS_sconj', POS_CountSconj()),
                      ('POS_sym', POS_CountSym()),
                      ('POS_verb', POS_CountVerb()),
                      ('POS_comp',POS_CountComparatives()),
                      ('POS_sup',POS_CountSuperlatives()),
                      ('POS_x', POS_CountX()),
                      ('SYNT_1', SYNT_Rule1()),
                      ('SYNT_2', SYNT_Rule2()),
                      ('SYNT_3', SYNT_Rule3()),
                      ('SYNT_4', SYNT_Rule4()),
                      ('SYNT_5', SYNT_Rule5()),
                      ('LEX_Subjective', LEX_Subjective(proportion = False)),
                      ('LEX_PropSubjective', LEX_Subjective(proportion = True)),
                      ('LEX_Positive', LEX_Positive(proportion = False, categoria = None)),
                      ('LEX_PropPositive', LEX_Positive(proportion = True, categoria = None)),
                      ('LEX_Negative', LEX_Negative(proportion = False, categoria = None)),
                      ('LEX_PropNegative', LEX_Negative(proportion = True, categoria = None)),
                      ('LEX_CountNegativeSentences',LEX_CountSentencesByPolarity(polarity = 'negative', categoria = None)),
                      ('LEX_CountPositiveSentences',LEX_CountSentencesByPolarity(polarity = 'positive', categoria = None)),
                      ('LEX_QuotationExclamation', LEX_QuotationExclamation(proportion = False)),
                      ('LEX_PropQuotationExclamation', LEX_QuotationExclamation(proportion = True)),
                      ('CONC_Pleasantness', CONC_Pleasantness(absolute=True, average = False)),
                      ('CONC_AvgPleasantness', CONC_Pleasantness(absolute=True, average = True)),
                      ('CONC_Attention', CONC_Attention(absolute=True, average = False)),
                      ('CONC_AvgAttention', CONC_Attention(absolute=True, average = True)),
                      ('CONC_Sensitivity', CONC_Sensitivity(absolute=True, average = False)),
                      ('CONC_AvgSensitivity', CONC_Sensitivity(absolute=True, average = True)),
                      # Sum first, average second, like every other CONC pair
                      # (the two `average` flags used to be swapped here).
                      ('CONC_Aptitude', CONC_Aptitude(absolute=True, average = False)),
                      ('CONC_AvgAptitude', CONC_Aptitude(absolute=True, average = True)),
                      ('CONC_Polarity', CONC_Polarity(absolute=True, average = False)),
                      ('CONC_AvgPolarity', CONC_Polarity(absolute=True, average = True)),
                      ('TWT_CountElongated', TWT_CountElongated()),
                      ('TWT_CountExpressions',TWT_CountExpressions()),
                      ('TWT_Negation',TWT_Negation()),
                      ('TWT_EmojiPolarityScore', TWT_EmojiPolarityScore()),
                      ('TWT_EmoticonPolarityScore', TWT_EmoticonPolarityScore()),
                      ('SBJ_CountNE', SBJ_CountNE()),
                     # ('SBJ_FutureTense', SBJ_FutureTense()), feature removed
                      ('SBJ_CountCorrectWords',SBJ_CountCorrectWords()),
                ]))
            ]))
          ],
      ))
  ])

In [None]:
def dataf(categoria):
  """Load every JSON file of one category into a single DataFrame.

  Files are read from ``<extract_path>/amazon_original_dataset/<categoria>/``
  in sorted name order, so the row order does not depend on the order in
  which the file system lists the directory.

  Args:
    categoria: name of the category sub-directory.

  Returns:
    The concatenation of all ``.json`` files, with a fresh integer index.

  Raises:
    ValueError: if the directory contains no ``.json`` file.
    FileNotFoundError: if the category directory does not exist.
  """
  category_dir = os.path.join(extract_path, 'amazon_original_dataset', categoria)
  json_names = sorted(name for name in os.listdir(category_dir) if name.endswith('.json'))
  if not json_names:
    # pd.concat would fail with a generic "No objects to concatenate";
    # name the offending directory instead.
    raise ValueError(f"No .json files found in {category_dir}")
  dataframes = [pd.read_json(os.path.join(category_dir, name)) for name in json_names]
  return pd.concat(dataframes, ignore_index=True)

In [None]:
categories = ['auto', 'baby','celular','food','games','laptops','livros','moda','pets','toys']

# Pipeline steps whose `categoria` parameter must follow the category
# currently being processed.
category_dependent_steps = [
    'LEX_Positive',
    'LEX_PropPositive',
    'LEX_Negative',
    'LEX_PropNegative',
    'LEX_CountNegativeSentences',
    'LEX_CountPositiveSentences',
]

# For each category: load its reviews, extract the features and write
# <extract_path>/features_extracted/<categoria>/<categoria>.csv with the
# original text, the rating and one column per feature.
for categoria in categories:
  data = dataf(categoria)
  data['categoria'] = categoria
  print(f"categoria: {categoria} - {data.shape}")
  pipeline.set_params(**{
      f"features__tree__tree_features__{step}__categoria": categoria
      for step in category_dependent_steps
  })
  features = pipeline.transform(data['text']).tolist()
  num_samples = len(features)
  # An empty category yields no rows; do not index into features[0] then.
  num_features = len(features[0]) if features else 0
  print(f"Dimensões das features: ({num_samples}, {num_features})")
  if features and num_features < len(feature_names):
    # Fewer values than column names: rows could not be filled in.
    raise ValueError(
        f"pipeline produced {num_features} features but feature_names has {len(feature_names)} entries"
    )
  if num_features > len(feature_names):
    # Same outcome as before (extra columns are not written), but no longer silent.
    print(f"WARNING: {num_features - len(feature_names)} feature column(s) have no name and are not written")
  output_dir = os.path.join(extract_path, 'features_extracted', categoria)
  os.makedirs(output_dir, exist_ok=True)
  # newline='' is what the csv module expects for files it writes to; the
  # explicit encoding keeps accented text and emoji writable regardless of
  # the platform default.
  with open(os.path.join(output_dir, f"{categoria}.csv"), 'w', newline='', encoding='utf-8') as arquivo:
    columns = ['text', 'rating'] + feature_names
    write = csv.DictWriter(arquivo, fieldnames=columns, delimiter=',', lineterminator='\n')
    write.writeheader()
    for sentence, rating, feature_row in zip(data['text'], data['rating'], features):
      row = {'text': sentence, 'rating': rating}
      # zip stops at the shorter sequence, i.e. at the last named feature.
      row.update(zip(feature_names, feature_row))
      write.writerow(row)