In [16]:
import json
import csv
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')

from nltk.corpus import wordnet as wn
import pandas as pd
from nltk import word_tokenize

from collections import defaultdict
import copy

# Assignment 1

In [2]:
hotels_path = 'inputs/yelp_dataset/yelp_hotels.json'

def load_json_line_by_line(path):
    reviews = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            if len(line) > 2:
                # If line ends with a coma, remove it.
                line = line[:-2] if line[-2] == ',' else line[:-1]
                reviews.append(json.loads(line))
    return reviews

def test_load_json_line_by_line():
    paths = [
        'inputs/yelp_dataset/yelp_hotels.json',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Musical_Instruments_5.json'
    ]
    
    for path in paths:
        print('Reading file {}\n'.format(path))
        reviews = load_json_line_by_line(path)
        print('{} reviews loaded\n'.format(len(reviews)))
        print('Example review: {}\n'.format(reviews[0]))

# reviews = load_json_line_by_line(hotels_path)
test_load_json_line_by_line()

Reading file inputs/yelp_dataset/yelp_hotels.json

5034 reviews loaded

Example review: {'reviewerID': 'qLCpuCWCyPb4G2vN-WZz-Q', 'asin': '8ZwO9VuLDWJOXmtAdc7LXQ', 'summary': 'summary', 'reviewText': "Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car!", 'overall': 4.0}

Reading file inputs/yelp_dataset/yelp_restaurants.json

158430 reviews loaded

Example review: {'reviewerID': 'rLtl8ZkDX5vH5nAx9C3q5Q', 'asin': '9yKzy9PApeiPPOUJEtnvkg', 'summary': 'summary', 'reviewText': 'My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect whic

# Assignment 2

### Task 2.1

**Loading (and printing on screen) the vocabulary of the `aspects_hotels.csv` file, and directly using it to identify aspect references in the reviews. In particular, the aspects terms could be mapped by exact matching with nouns appearing in the reviews.**

We will use two data structures in parallel to optimize different operations:

- A dictionary that matches a certain aspect to every word related to it. It will usually be called `aspect_words_dict`.

- A dictionary that matches a certain word to the aspect it refers to. It will usually be called `word_aspect_dict`.

The function `build_simple_vocab` creates both dictionaries given a path to the file with the initial vocabulary.

In [4]:
aspect_hotels_path = 'inputs/aspects/aspects_hotels.csv'

def load_vocab(path):
    vocab = pd.read_csv(path,names = ['aspect', 'word'])
    return vocab.groupby('aspect')['word'].apply(list)

def create_word_to_aspects_dict(aspect_words_dict):
    '''
        This function transforms an 'aspect to words'
        dictionary to a 'word to aspect' default dictionary
    '''
    word_aspect_dict = defaultdict(str)
    for aspect, words in aspect_words_dict.items():
        for word in words:
            word_aspect_dict[word] = aspect
    return word_aspect_dict
    
def build_simple_vocab(path):
    aspect_words_dict = load_vocab(path)
    word_aspect_dict = create_word_to_aspects_dict(aspect_words_dict)
    return aspect_words_dict, word_aspect_dict

aspect_words_dict, word_aspect_dict = build_simple_vocab(aspect_hotels_path)
print(aspect_words_dict, '\n')
print(word_aspect_dict)

aspect
amenities                            [amenity, amenities, services]
atmosphere        [atmosphere, atmospheres, ambiance, ambiances,...
bar                              [bar, bars, bartender, bartenders]
bathrooms         [bathroom, bathrooms, bath, baths, bathtub, ba...
bedrooms          [bedroom, bedrooms, bed, beds, pillow, pillows...
booking           [booking, book, reservation, reservations, res...
breakfast         [breakfast, breakfasts, morning, mornings, toa...
building          [building, buildings, architecture, architectu...
checking          [checkin, check-in, check in, check ins, check...
cleanliness       [cleanliness, clean, cleaned, cleaning, dirt, ...
coffee                    [coffee, coffees, cafe, cafes, tea, teas]
cuisine           [cuisine, cuisines, dishe, dishes, food, foods...
dinner            [dinner, dinners, evening meal, evening menu, ...
drinks                    [drink, drinks, beer, beers, wine, wines]
events            [event, events, activit

In the following cells we compute the aspects referenced by each review and display the result for the first few reviews.

In [5]:
def get_text_vocabulary(text):
    '''
        Returns a list of the words from
        the given text.
    '''
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Obtain the words 
    return [w.lower() for w in tokens if w.isalnum()]

def find_aspects_in_reviews(reviews, word_aspect_dict):
    '''
        Given a list of reviews, returns a list with the
        aspects that each review talks references.
    '''
    # Initialize references list
    refered_aspects = []
    
    for review in reviews:
        # Obtain review vocabulary
        r_vocab = get_text_vocabulary(review['reviewText'])

        # Search for words related to aspects and append
        # to vector if word appears
        refered_aspects.append([
            word_aspect_dict[word] for word in r_vocab if word_aspect_dict[word] != ''
        ])

    return refered_aspects

In [6]:
hotel_reviews = load_json_line_by_line(hotels_path)
refered_aspects = find_aspects_in_reviews(hotel_reviews, word_aspect_dict)

for review, aspects in zip(hotel_reviews[:3], refered_aspects[:3]):
    print('\tReview: {} \n\tAspects: {} \n'.format(review['reviewText'], aspects))

	Review: Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car! 
	Aspects: ['transportation', 'shopping', 'pool', 'building', 'bar', 'building', 'transportation'] 

	Review: I feel the Days Inn Tempe is best described as "a place where you can purchase the right to sleep for awhile." I booked my 10-night stay on Travelocity for a non-smoking room, yet when I entered the room I almost choked. It was disgusting. I've never had a smoking hotel room before and I will make sure I don't again. They said they couldn't move us to a different room.My local lady friend brought over a 

### Task 2.2

**Generating or extending the lists of terms of each aspect with synonyms extracted from WordNet.**

For this second task we expand the vocabulty using synonims extracted from Wordnet. The function `build_vocab()` is analogous to the previous `build_simple_vocab()` but takes this synonims into account.

In [27]:
def get_word_synonims(word):
    syns = []
    for syn in wn.synsets(word):
        for l in syn.lemmas():
            syns.append(l.name())
    return syns

def add_synonims_to_vocab(aspect_words_dict):
    '''
        Given a vocabulary, extends it by adding the synonims
        of each word.
    '''
    result = {}
    for aspect, words in aspect_words_dict.items():
        result[aspect] = words
        olds_words = copy.deepcopy(words)
        for word in olds_words:
            result[aspect] += get_word_synonims(word)
    return result

def build_vocab(path):
    '''
        Builds a vocabulary taking synonims into account.
    '''
    aspect_words_dict = load_vocab(path)
    aspect_words_dict = add_synonims_to_vocab(aspect_words_dict)
    word_aspect_dict = create_word_to_aspects_dict(aspect_words_dict)
    return aspect_words_dict, word_aspect_dict

In [28]:
def test_build_vocabulary(n_show = 3):
    paths = [
        'inputs/aspects/aspects_hotels.csv',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Musical_Instruments_5.json'
    ]
    
    for path in paths:
        print('Obtaining vocab from file file {}\n'.format(path))
        
        _, simple_vocab = build_simple_vocab(path)
        _, complex_vocab = build_vocab(path)
        
        print("Simple vocab. length: {}".format(len(simple_vocab.keys())))
        print("With-synonyms vocab. length: {}".format(len(complex_vocab.keys())))

test_build_vocabulary()

Obtaining vocab from file file inputs/aspects/aspects_hotels.csv

Simple vocab. length: 281
With-synonyms vocab. length: 1336
Obtaining vocab from file file inputs/yelp_dataset/yelp_restaurants.json



AttributeError: 'float' object has no attribute 'lower'

### Aquidejemilectura

- Terminar el test roto
- Utilizar el siguiente codigo (metido en una funcion) a partir del nuevo vocab (esta arriba antes usado mirar ejemplo!):

```
for review, aspects in zip(hotel_reviews[:3], refered_aspects[:3]):
    print('\tReview: {} \n\tAspects: {} \n'.format(review['reviewText'], aspects))
```

### TODOS

- Hacer el 2.4
- En el 2.2, meter hiponimos e hipernonimos a parte de sinonimos?