In [5]:
import json
import csv
import nltk
import pandas as pd
from collections import defaultdict
import copy

# NLTK dependencies (uncomment and run once to download the required resources)
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('opinion_lexicon')
# nltk.download('sentiwordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

# Assignment 2
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

# Assignment 3
from nltk.corpus import opinion_lexicon
from nltk.corpus import sentiwordnet as swn


# Assignment 1

### Task 1.1 
**Loading all the hotel reviews from the Yelp hotel reviews file.**

### Task 1.2 (optional) 

**Loading line by line the reviews from the Yelp beauty/spa resorts and restaurants reviews files.**

### Task 1.3 (optional) 

**Loading line by line reviews on other domains (e.g., movies, books, phones, digital music, CDs and videogames) from McAuley’s Amazon dataset.**

We tackle all of these tasks at the same time, since a sufficiently general function solves all of them directly. The function `load_json_line_by_line()` reads a JSON file line by line and returns the list of reviews it contains.

We additionally created a test function that checks the loading of all the described datasets. We have selected the Amazon cell phones and accessories dataset because it is big enough without being huge and we also have the required aspects for it (see tasks 2.1 to 2.3).

In [8]:
# Location of the Yelp hotel reviews file (loaded with load_json_line_by_line).
hotels_path = 'inputs/yelp_dataset/yelp_hotels.json'


def load_json_line_by_line(path):
    '''
        Reads a UTF-8 text file that holds one JSON document per
        line and returns the list of parsed documents.

        A trailing comma after a document is tolerated, and blank
        lines or lines holding only an array bracket are skipped.

        Raises json.JSONDecodeError if a remaining line is not
        valid JSON.
    '''
    reviews = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            # Strip surrounding whitespace instead of blindly dropping the
            # last character: the final line may have no trailing newline.
            line = line.strip()

            # If the line ends with a comma, remove it.
            if line.endswith(','):
                line = line[:-1].rstrip()

            # Nothing to parse on blank lines or bare array brackets.
            if line in ('', '[', ']'):
                continue

            reviews.append(json.loads(line))
    return reviews


def test_load_json_line_by_line():
    paths = [
        'inputs/yelp_dataset/yelp_hotels.json',
        'inputs/yelp_dataset/yelp_beauty_spas.json',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Cell_Phones_and_Accessories_5.json'
    ]

    for path in paths:
        print('Reading file {}\n'.format(path))
        reviews = load_json_line_by_line(path)
        print('{} reviews loaded\n'.format(len(reviews)))
        print('Example review: {}\n'.format(reviews[0]))


# Run the loading check: prints the review count and first review of each file.
test_load_json_line_by_line()

Reading file inputs/yelp_dataset/yelp_hotels.json

5034 reviews loaded

Example review: {'reviewerID': 'qLCpuCWCyPb4G2vN-WZz-Q', 'asin': '8ZwO9VuLDWJOXmtAdc7LXQ', 'summary': 'summary', 'reviewText': "Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car!", 'overall': 4.0}

Reading file inputs/yelp_dataset/yelp_beauty_spas.json

5579 reviews loaded

Example review: {'reviewerID': 'Xm8HXE1JHqscXe5BKf0GFQ', 'asin': 'WGNIYMeXPyoWav1APUq7jA', 'summary': 'summary', 'reviewText': "Good tattoo shop. Clean space, multiple artists to choose from and books of their work are available f

# Assignment 2

### Task 2.1

**Loading (and printing on screen) the vocabulary of the `aspects_hotels.csv` file, and directly using it to identify aspect references in the reviews. In particular, the aspects terms could be mapped by exact matching with nouns appearing in the reviews.**

We will compute a dictionary that maps every vocabulary word to the aspect it refers to. It will usually be called `word_aspect_dict`. This makes finding the aspect related to a given word a direct dictionary lookup.

The function `build_simple_vocab` creates this dictionary given a path to the file with the initial vocabulary.

In [9]:
# CSV file with the hotel aspect vocabulary (read by load_vocab as aspect, word rows).
aspect_hotels_path = 'inputs/aspects/aspects_hotels.csv'


def load_vocab(path):
    '''
        Reads the header-less two-column CSV at `path` (aspect, word)
        and returns a pandas Series indexed by aspect whose values
        are the lists of words listed for that aspect.
    '''
    rows = pd.read_csv(path, names=['aspect', 'word'])
    words_by_aspect = rows.groupby('aspect')['word']
    return words_by_aspect.apply(list)


def create_word_to_aspects_dict(aspect_words_dict):
    '''
        Inverts an 'aspect to words' mapping into a 'word to aspect'
        default dictionary whose default value is the empty string.
        When a word is listed under several aspects, the aspect
        visited last is the one kept.
    '''
    word_aspect_pairs = (
        (word, aspect)
        for aspect, words in aspect_words_dict.items()
        for word in words
    )
    return defaultdict(str, word_aspect_pairs)


def build_simple_vocab(path):
    '''
        Loads the aspect vocabulary stored at `path` and returns
        it as a 'word to aspect' dictionary.
    '''
    return create_word_to_aspects_dict(load_vocab(path))


# Display the word-to-aspect dictionary built from the hotel vocabulary.
build_simple_vocab(aspect_hotels_path)

defaultdict(str,
            {'amenity': 'amenities',
             'amenities': 'amenities',
             'services': 'amenities',
             'atmosphere': 'atmosphere',
             'atmospheres': 'atmosphere',
             'ambiance': 'atmosphere',
             'ambiances': 'atmosphere',
             'light': 'atmosphere',
             'lighting': 'atmosphere',
             'lights': 'atmosphere',
             'music': 'atmosphere',
             'bar': 'bar',
             'bars': 'bar',
             'bartender': 'bar',
             'bartenders': 'bar',
             'bathroom': 'bathrooms',
             'bathrooms': 'bathrooms',
             'bath': 'bathrooms',
             'baths': 'bathrooms',
             'bathtub': 'bathrooms',
             'bathtubs': 'bathrooms',
             'shampoo': 'bathrooms',
             'shampoos': 'bathrooms',
             'shower': 'bathrooms',
             'showers': 'bathrooms',
             'towel': 'bathrooms',
             'towels': 'bathrooms

In the following cells we compute the aspects referenced by each review and display the result for the first few reviews.

In [18]:
def get_text_vocabulary(text):
    '''
        Tokenizes `text` with NLTK and returns its purely
        alphanumeric tokens in lowercase, in order of appearance.
    '''
    words = []
    for token in nltk.word_tokenize(text):
        # Tokens with any non-alphanumeric character
        # (punctuation, hyphenated words, ...) are discarded
        if token.isalnum():
            words.append(token.lower())
    return words


def find_aspects_in_reviews(reviews, word_aspect_dict):
    '''
        Given a list of reviews, returns a list with one set per
        review holding the aspects that the review references.

        Each review is a dictionary whose 'reviewText' entry is
        tokenized with get_text_vocabulary(). `word_aspect_dict`
        maps words to aspects; words that are missing or mapped to
        the empty string reference no aspect.

        The vocabulary is never modified, and both plain dicts and
        default dictionaries are accepted.
    '''
    # Initialize references list
    refered_aspects = []

    for review in reviews:
        # Obtain review vocabulary
        r_vocab = get_text_vocabulary(review['reviewText'])

        # Collect the aspect of every known word. `.get()` is used
        # instead of indexing because indexing a defaultdict would
        # insert every review word into the vocabulary.
        aspects = set()
        for word in r_vocab:
            aspect = word_aspect_dict.get(word, '')
            if aspect != '':
                aspects.add(aspect)
        refered_aspects.append(aspects)

    return refered_aspects

In [19]:
# Load the hotel reviews, build the simple vocabulary and compute the
# aspects referenced by every review.
hotel_reviews = load_json_line_by_line(hotels_path)
word_aspect_dict = build_simple_vocab(aspect_hotels_path)
refered_aspects = find_aspects_in_reviews(hotel_reviews, word_aspect_dict)

# Show the first three reviews next to the aspects found in them.
for review, aspects in zip(hotel_reviews[:3], refered_aspects[:3]):
    print('\tReview: {} \n\tAspects: {} \n'.format(
        review['reviewText'], aspects))

	Review: Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car! 
	Aspects: {'building', 'pool', 'shopping', 'bar', 'transportation'} 

	Review: I feel the Days Inn Tempe is best described as "a place where you can purchase the right to sleep for awhile." I booked my 10-night stay on Travelocity for a non-smoking room, yet when I entered the room I almost choked. It was disgusting. I've never had a smoking hotel room before and I will make sure I don't again. They said they couldn't move us to a different room.My local lady friend brought over a bottle of wine but forgot a co

### Task 2.2 (optional)

**Generating or extending the lists of terms of each aspect with synonyms extracted from WordNet.**

For this second task we expand the vocabulary using synonyms extracted from WordNet. The function `build_vocab()` is analogous to the previous `build_simple_vocab()` but takes these synonyms into account.

In [20]:
def get_word_synonims(word):
    '''
        Returns the lemma names of every WordNet synset of `word`,
        in the order WordNet yields them. No de-duplication is
        performed, so a name can appear more than once.
    '''
    return [
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    ]


def add_synonims_to_vocab(word_aspect_dict):
    '''
        Given a vocabulary, extends it by adding the synonims
        of each word.
    '''
    result = copy.deepcopy(word_aspect_dict)
    for word, aspect in word_aspect_dict.items():
        for synonym in get_word_synonims(word):
            result[synonym] = aspect
    return result


def build_vocab(path, add_synonyms=True):
    '''
        Builds the 'word to aspect' vocabulary stored at `path`.

        When `add_synonyms` is True (the default) the vocabulary is
        extended with the synonims of each word; otherwise the plain
        vocabulary is returned.
    '''
    aspect_words_dict = load_vocab(path)
    word_aspect_dict = create_word_to_aspects_dict(aspect_words_dict)
    if not add_synonyms:
        return word_aspect_dict
    return add_synonims_to_vocab(word_aspect_dict)

### Task 2.3 (optional)

**Managing vocabularies for additional Yelp or Amazon domains. See assignments 1.2 and 1.3**

Extending our previous functions to the new datasets is trivial: we simply need to load the correct aspects for each set of reviews. The following test function does the following for the Yelp hotels, Yelp restaurants and Amazon phones datasets:

- Load the reviews and build both the simple and complex vocabularies.
- Print the aspects found in the first few reviews with each vocabulary.
- Print the number of words in both the simple and extended vocabulary for comparison.

In [23]:
def test_build_vocabulary(n_displayed_reviews=1):
    test_case_names = [
        'Yelp hotels',
        'Yelp restaurants',
        'Amazon cell phones and accesories'
    ]
    reviews_paths = [
        'inputs/yelp_dataset/yelp_hotels.json',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Cell_Phones_and_Accessories_5.json'
    ]
    aspects_paths = [
        'inputs/aspects/aspects_hotels.csv',
        'inputs/aspects/aspects_restaurants.csv',
        'inputs/aspects/aspects_phones.csv'
    ]

    for name, reviews_path, aspects_path in \
            zip(test_case_names, reviews_paths, aspects_paths):
        print('----- {} dataset -----\n'.format(name))

        simple_vocab, complex_vocab = test_expand_vocab(
            reviews_path, aspects_path, n_displayed_reviews=n_displayed_reviews)

        print('Words in simple vocab: {}'.format(len(simple_vocab.keys())))
        print('Words in complex vocab: {}\n'.format(len(complex_vocab.keys())))


# Run the vocabulary comparison, forwarding a display limit of two reviews.
test_build_vocabulary(n_displayed_reviews=2)

----- Yelp hotels dataset -----

	Review: Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car! 
	Aspects: {'building', 'pool', 'shopping', 'bar', 'transportation'} 
	Extended Vocab Aspects: {'building', 'pool', 'location', 'shopping', 'service', 'checking', 'bar', 'transportation', 'bedrooms'} 

	Review: I feel the Days Inn Tempe is best described as "a place where you can purchase the right to sleep for awhile." I booked my 10-night stay on Travelocity for a non-smoking room, yet when I entered the room I almost choked. It was disgusting. I've never had a smoking hotel ro

In [33]:
# NOTE(review): `nltk` is already imported in the first cell, where this same
# download is listed (commented out); consider keeping it only there.
import nltk
nltk.download('tagsets')


[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Ocete\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

# Assignment 3

In [39]:
def load_polarities():
    '''
        Builds a 'word to polarity' default dictionary from Liu's
        opinion lexicon: 1.0 for positive words and -1.0 for
        negative words, with 0.0 as the default for any other word.
        A word present in both lists ends up with -1.0.
    '''
    polarity_by_word = defaultdict(float)

    # Negative words are written last, so they win on overlap
    scored_word_lists = (
        (opinion_lexicon.positive(), 1.0),
        (opinion_lexicon.negative(), -1.0),
    )
    for words, score in scored_word_lists:
        for word in set(words):
            polarity_by_word[word] = score

    return polarity_by_word
    
# Build the polarity lexicon once and display it.
# NOTE(review): the bare expression below renders the whole dictionary as cell output.
all_polarities = load_polarities()
all_polarities

defaultdict(float,
            {'thrilled': 1.0,
             'backbone': 1.0,
             'eloquent': 1.0,
             'imaginative': 1.0,
             'smilingly': 1.0,
             'posh': 1.0,
             'enticingly': 1.0,
             'hale': 1.0,
             'pampered': 1.0,
             'promising': 1.0,
             'steadfastness': 1.0,
             'amenity': 1.0,
             'doubtless': 1.0,
             'faithful': 1.0,
             'positively': 1.0,
             'illumine': 1.0,
             'suavely': 1.0,
             'dirt-cheap': 1.0,
             'enjoy': 1.0,
             'cool': 1.0,
             'supportive': 1.0,
             'examplary': 1.0,
             'fortuitous': 1.0,
             'inspire': 1.0,
             'diligent': 1.0,
             'finest': 1.0,
             'rejuvenating': 1.0,
             'darling': 1.0,
             'revolutionized': 1.0,
             'respect': 1.0,
             'gladly': 1.0,
             'glowing': 1.0,
             '

In [38]:
# SentiWordNet: list the sentiment synsets found for a couple of words
sad = swn.senti_synsets('sad', 'a')
print(list(sad))
print(list(swn.senti_synsets('happy')))
# Positive and negative scores of one specific synset
polarity = swn.senti_synset('happy.a.01')
print('pos', polarity.pos_score(), 'neg', polarity.neg_score())

[SentiSynset('sad.a.01'), SentiSynset('sad.s.02'), SentiSynset('deplorable.s.01')]
[SentiSynset('happy.a.01'), SentiSynset('felicitous.s.02'), SentiSynset('glad.s.02'), SentiSynset('happy.s.04')]
pos 0.875 neg 0.0


# Assignment 4

In [29]:
def pos_tagging(text):
    '''
        Splits `text` into sentences, tokenizes each sentence and
        returns, per sentence, the output of nltk.pos_tag() on
        its tokens.
    '''
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    ]

In [34]:
# Tag a sample sentence, then print NLTK's description of the 'VBZ' tag.
print(pos_tagging("I think you are very cute"))
nltk.help.upenn_tagset('VBZ')

[[('I', 'PRP'), ('think', 'VBP'), ('you', 'PRP'), ('are', 'VBP'), ('very', 'RB'), ('cute', 'JJ')]]
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...


### TODOS AND IDEAS

- Do task 2.4
- In task 2.2, add hyponyms and hypernyms in addition to synonyms?
- Use spaCy to handle negations
- Use the overall to determine a class for each opinion and train a **MACHINE LEARNING** algorithm.

In [None]:
# spaCy experiment (see the TODO list): print several token attributes
# for a sample sentence.
# NOTE(review): presumably requires the `en_core_web_sm` model to be installed — confirm.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per token: text, lemma, coarse POS, fine tag, dependency label,
# shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install spacy