In [23]:
import pickle
from typing import Generator, List, Optional

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English

In [2]:
# Load the labelled reviews. The path is relative, matching the "../data/interim"
# location this notebook writes its outputs to, so the notebook is not tied to one
# user's machine. Assumes the notebook runs from a directory that is a sibling of data/.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open(r"../data/interim/review_classes.pkl", "rb") as input_file:
    review_classes = pickle.load(input_file)

In [4]:
# Pull the 'POS' and 'NEG' entries out of the loaded mapping.
positive_reviews, negative_reviews = review_classes['POS'], review_classes['NEG']

In [12]:
def sentences_to_words(sentences: List[str]) -> Generator:
    """Lazily turn each input text into a list of word tokens.

    Every item is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess``
    (https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess).

    :param sentences: texts to tokenize.
    :return: generator yielding one token list per input item, in input order.
    """
    for text in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is already
        # discarded by simple_preprocess's own tokenization.
        tokens = simple_preprocess(str(text), deacc=True)
        yield tokens

In [13]:
def remove_stopwords(documents: List[List[str]]) -> List[List[str]]:
    """Drop English stopwords from every document.

    Each document is stringified and re-tokenized with gensim's
    ``simple_preprocess`` before filtering. This round trip is kept from the
    original implementation so existing callers see the same tokens.

    The NLTK stopword list is fetched once per call and held in a set. The
    previous version called ``stopwords.words('english')`` once per token and
    tested membership against the returned list.

    :param documents: documents to filter.
    :return: one list of non-stopword tokens per document, in input order.
    """
    stop_words = None
    filtered = []
    for doc in documents:
        if stop_words is None:
            # Loaded on first use so an empty input never touches the NLTK corpus.
            stop_words = frozenset(stopwords.words('english'))
        filtered.append([word for word in simple_preprocess(str(doc)) if word not in stop_words])
    return filtered

In [14]:
def learn_bigrams(documents: List[List[str]], min_count: int = 5, threshold: float = 10) -> List[List[str]]:
    """Fit a gensim bigram model on ``documents`` and apply it to the same documents.

    See https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases

    :param documents: tokenized documents, one list of words per document.
    :param min_count: forwarded to ``Phrases`` as ``min_count`` (default 5, the value
        that was previously hard-coded).
    :param threshold: forwarded to ``Phrases`` as ``threshold`` (default 10, the value
        that was previously hard-coded).
    :return: one transformed document per input document, in input order.
    """
    # Materialize once: the corpus is traversed twice (training, then applying), so a
    # one-shot iterable such as a generator would already be exhausted after training
    # and the result would silently be empty.
    documents = list(documents)

    # Learn the bigrams from the corpus.
    bigram = Phrases(documents, min_count=min_count, threshold=threshold)

    # Reduce the trained model to its minimal, apply-only form.
    bigram_mod = Phraser(bigram)

    # Apply the bigram model to every document.
    return [bigram_mod[doc] for doc in documents]


In [17]:
def lemmatization(nlp: English, texts: List[List[str]], allowed_postags: Optional[List[str]] = None) -> List[List[str]]:
    """Lemmatize tokenized texts, keeping only tokens with an allowed part of speech.

    Each word list is joined with single spaces and passed to ``nlp``; the
    ``lemma_`` of every resulting token whose ``pos_`` is in ``allowed_postags``
    is kept. See https://spacy.io/api/annotation for the tag set.

    :param nlp: spaCy pipeline, called once per text; its tokens must expose
        ``lemma_`` and ``pos_``.
    :param texts: tokenized texts, one list of words per text.
    :param allowed_postags: POS tags to keep; ``None`` means
        ``['NOUN', 'ADJ', 'VERB', 'ADV']``.
    :return: one list of lemmas per input text, in input order.
    """
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    texts_out = []
    for sent in texts:
        # nlp receives a plain string and tokenizes it itself, so its tokens need
        # not line up one-to-one with the words in `sent`.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [20]:
# Load the 'en_core_web_lg' spaCy pipeline with its parser and NER components
# disabled; lemmatization() only reads token.lemma_ and token.pos_.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['parser', 'ner'],
)

In [21]:
def tokenize(documents: List[str]) -> List[List[str]]:
    """Run the full preprocessing pipeline over raw documents.

    Stages, in order: word tokenization, stopword removal, bigram detection
    (fitted on the documents passed in), and lemmatization with the
    module-level ``nlp`` pipeline.

    :param documents: raw texts to process.
    :return: the output of the final lemmatization stage.
    """
    tokens = list(sentences_to_words(documents))
    without_stopwords = remove_stopwords(tokens)
    with_bigrams = learn_bigrams(without_stopwords)
    return lemmatization(nlp, with_bigrams)

In [26]:
# Run the full preprocessing pipeline on the positive reviews. tokenize() fits its
# bigram model on the documents it is given, so phrases here are learned from this
# class only.
positive_words = tokenize(positive_reviews)

In [27]:
# Run the full preprocessing pipeline on the negative reviews. tokenize() fits its
# bigram model on the documents it is given, so phrases here are learned from this
# class only.
negative_words = tokenize(negative_reviews)

In [28]:
# Persist both tokenized corpora, positive first, one pickle file per class.
for destination, tokenized_reviews in (
    (r"../data/interim/positive_words.pkl", positive_words),
    (r"../data/interim/negative_words.pkl", negative_words),
):
    with open(destination, "wb") as output_file:
        pickle.dump(tokenized_reviews, output_file)