In [1]:
from typing import Generator, List
import pickle

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English

In [2]:
# Load the pickled review data from the project's data/interim folder and bind
# it to `review_classes` for the cells below.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this on a pickle you produced yourself.
# NOTE(review): the absolute, user-specific path ties the notebook to one
# machine - consider a configurable data directory.
with open(r"C:/Users/chiruco/Desktop/python/ProyPython/SentimentalAnalysis/SentimentalAnalysis/data/interim/review_classes.pkl", "rb") as input_file:
    review_classes = pickle.load(input_file)

In [4]:
# Display the loaded object; the output shows a dict whose 'POS' key holds a
# list of raw review texts.
# NOTE(review): this renders the whole collection - a size summary or a short
# slice would keep the saved output small.
review_classes

{'POS': ["This is one of the best hotels I've ever stayed at with incredible staff catering to your every need. I felt like family whilst I was there!   I stayed for 3 weeks during the pandemic and was so impressed with all the measures the hotel have put in place to keep everyone safe and well. They give you a box of masks, gloves and a bottle of hand sanitiser upon arrival, all the staff wear masks and gloves and the rooms and everything else are cleaned to the highest standard.  Plus the prices they've been offering during this time are incredible value for money! Definitely would recommend and hope to be back once travel reopens.   Thank you so much for looking after me so well until I could return home!",
  'Everything about this hotel was awesome. The staff made me feel welcomed from the minute i arrived until the day i departed. The view from the dining area with the mountains in the background was spectacular. The grounds were manicured to perfection with flowering plants in ev

In [5]:
# Pull the two review collections out of the loaded dict by their
# 'POS' / 'NEG' keys.
positive_reviews = review_classes['POS']
negative_reviews = review_classes['NEG']

In [10]:
def sentences_to_words(sentences: List[str]) -> Generator[List[str], None, None]:
    """Lazily tokenize each input text into a list of word tokens.

    Every element of ``sentences`` is coerced with ``str()`` and passed to
    gensim's ``simple_preprocess`` with ``deacc=True``. This is a generator:
    nothing is tokenized until the result is iterated, and it can only be
    consumed once (wrap it in ``list(...)`` to reuse the tokens).

    Args:
        sentences: The texts to tokenize.

    Yields:
        One list of tokens per input text, in input order.
    """
    for sentence in sentences:
        # https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
        # deacc=True strips accent marks from the tokens; punctuation is
        # already discarded by the tokenizer itself.
        yield simple_preprocess(str(sentence), deacc=True)

In [7]:
def remove_stopwords(documents: List[List[str]]) -> List[List[str]]:
    """Drop English stop words from every document.

    Args:
        documents: Tokenized documents (one list of tokens per document).

    Returns:
        A new list with one token list per input document, in the same
        order, without the tokens found in NLTK's English stop-word list.
    """
    # Query the stop-word corpus once and keep it as a set, instead of calling
    # stopwords.words('english') and scanning the resulting list per token.
    english_stopwords = frozenset(stopwords.words('english'))

    # Each document is re-tokenized from its str() form with simple_preprocess
    # before filtering, so the tokens that are compared against the stop-word
    # list are exactly the ones simple_preprocess produces.
    return [
        [word for word in simple_preprocess(str(doc)) if word not in english_stopwords]
        for doc in documents
    ]

In [11]:
def learn_bigrams(documents: List[List[str]], min_count: int = 5, threshold: float = 10) -> List[List[str]]:
    """Fit a bigram model on ``documents`` and apply it to those same documents.

    Token pairs that the fitted model scores as a phrase are returned as a
    single joined token (this notebook's output contains e.g.
    ``shared_bathroom``); every other token is passed through.

    Args:
        documents: Tokenized documents (one list of tokens per document).
        min_count: Forwarded to ``Phrases(min_count=...)``. Defaults to 5.
        threshold: Forwarded to ``Phrases(threshold=...)``. Defaults to 10.

    Returns:
        One token list per input document, in input order, with detected
        bigrams merged.
    """
    # The corpus is traversed twice (once to fit, once to transform), so a
    # one-shot iterable must be materialized first or the second pass is empty.
    documents = list(documents)

    # Learn the bigram statistics.
    # https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases
    bigram = Phrases(documents, min_count=min_count, threshold=threshold)

    # Reduce the trained model to its minimal, apply-only form.
    bigram_mod = Phraser(bigram)

    # Apply the bigram model to every document.
    return [bigram_mod[doc] for doc in documents]


In [15]:
def lemmatization(nlp: English, texts: List[List[str]], allowed_postags: List = None) -> List[List[str]]:
    """Lemmatize tokenized texts, keeping only tokens with selected POS tags.

    Each token list is joined with single spaces and run through ``nlp``;
    the result is reduced to the ``lemma_`` of every token whose ``pos_`` is
    in ``allowed_postags``.

    See https://spacy.io/api/annotation for the annotation scheme.

    Args:
        nlp: Pipeline that is called with a text and returns an iterable of
            tokens exposing ``lemma_`` and ``pos_``.
        texts: Tokenized documents (one list of tokens per document).
        allowed_postags: POS tags to keep. When ``None``, defaults to
            ``['NOUN', 'ADJ', 'VERB', 'ADV']``.

    Returns:
        One list of lemmas per input document, in input order. A document
        with no token of an allowed tag yields an empty list.
    """
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

In [16]:
# Load the spaCy model once. `tokenize` reads this module-level `nlp` when it
# calls `lemmatization`, so this cell must run before any `tokenize` call.
# The 'parser' and 'ner' components are disabled at load time.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [17]:
def tokenize(documents: List[str]) -> List[List[str]]:
    """Run the full preprocessing pipeline over raw texts.

    Stages, in order: word tokenization (``sentences_to_words``), stop-word
    removal (``remove_stopwords``), bigram detection (``learn_bigrams``) and
    lemmatization (``lemmatization``, using the module-level ``nlp``).

    Args:
        documents: Raw texts, one string per document.

    Returns:
        One list of processed tokens per input document, in input order.
    """
    # sentences_to_words is a generator; materialize it for the later stages.
    tokenized = list(sentences_to_words(documents))
    without_stopwords = remove_stopwords(tokenized)
    with_bigrams = learn_bigrams(without_stopwords)
    return lemmatization(nlp, with_bigrams)

In [18]:
# Run the full preprocessing pipeline over the positive reviews.
positive_words = tokenize(positive_reviews)

In [19]:
# Run the full preprocessing pipeline over the negative reviews. `tokenize`
# calls `learn_bigrams` on whatever it is given, so the bigram model used here
# is fitted on the negative reviews only.
negative_words = tokenize(negative_reviews)

In [20]:
# Persist both processed corpora as pickles in the data/interim folder,
# positive reviews first, then negative ones.
for words_to_save, output_path in (
    (positive_words, r"C:/Users/chiruco/Desktop/python/ProyPython/SentimentalAnalysis/SentimentalAnalysis/data/interim/positive_words.pkl"),
    (negative_words, r"C:/Users/chiruco/Desktop/python/ProyPython/SentimentalAnalysis/SentimentalAnalysis/data/interim/negative_words.pkl"),
):
    with open(output_path, "wb") as output_file:
        pickle.dump(words_to_save, output_file)

In [22]:
# Inspect the processed negative reviews (one list of tokens per review).
# NOTE(review): this displays the full result; the first few documents would
# be enough for a sanity check and keep the saved output small.
negative_words

[['town',
  'especially',
  'be',
  'go',
  'antiguena',
  'good',
  'sized',
  'private',
  'twin',
  'room',
  'shared_bathroom',
  'great',
  'price',
  'nice',
  'ish',
  'garden',
  'swing',
  'basic',
  'bathroom',
  'tatty',
  'hot',
  'shower',
  'always',
  'keep',
  'water',
  'flow',
  'slow',
  'super',
  'basic',
  'pretty',
  'dirty',
  'kitchenette',
  'load',
  'cat',
  'dog',
  'cheap',
  'basic',
  's'],
 ['go',
  'hotel',
  'directly',
  'book',
  'room',
  'receptionist',
  'first',
  'really',
  'high',
  'price',
  'lower',
  'ask',
  'much',
  'cost',
  'qetale',
  'comission',
  'rate',
  'dollar',
  'know',
  'randomly',
  'gove',
  'non',
  'exist',
  'rate',
  'lower',
  'real',
  'rate',
  'agree',
  'price',
  'include',
  'breakfast',
  'follow',
  'morning',
  'get',
  'breakfast',
  'receive',
  'plain',
  'piece',
  'toast',
  'fruit',
  'theother',
  'table',
  'receive',
  'also',
  'egg',
  'ask',
  'also',
  'egg',
  'reply',
  'come',
  'breakfast'