In [3]:
import json
import csv
import nltk
import pandas as pd
from collections import defaultdict
import copy

#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')
#nltk.download('opinion_lexicon')

# Assignment 2
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

# Assignment 3
from nltk.corpus import opinion_lexicon

# Assignment 1

### Task 1.1 
**Loading all the hotel reviews from the Yelp hotel reviews file.**

### Task 1.2 (optional) 

**Loading line by line the reviews from the Yelp beauty/spa resorts and restaurants reviews files.**

### Task 1.3 (optional) 

**Loading line by line reviews on other domains (e.g., movies, books, phones, digital music, CDs and videogames) from McAuley’s Amazon dataset.**

We tackle all of these tasks at the same time, since a sufficiently general function solves all of them directly. The function `load_json_line_by_line()` reads a JSON file line by line and returns the resulting list of reviews.

We additionally created a test function that checks the loading of all the described datasets. For the Amazon data we selected the cell phones and accessories dataset because it is big enough without being huge, and we also have the required aspects for it (see tasks 2.1 to 2.3).

In [4]:
hotels_path = 'inputs/yelp_dataset/yelp_hotels.json'

def load_json_line_by_line(path):
    '''
        Reads a file that stores one JSON value per line and
        returns the list of decoded values.

        Every line is stripped of surrounding whitespace and of one
        trailing comma before being decoded, so both JSON-lines files
        and JSON arrays written one element per line are accepted.
        Blank lines and lines holding only '[' or ']' are skipped.

        Raises json.JSONDecodeError if a remaining line is not valid JSON.
    '''
    reviews = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            # strip() instead of slicing off the last character: the last
            # line of a file is not guaranteed to end with a newline, and
            # whitespace may follow the trailing comma.
            record = line.strip()
            # If the line ends with a comma, remove it.
            if record.endswith(','):
                record = record[:-1].rstrip()
            if record in ('', '[', ']'):
                continue
            reviews.append(json.loads(record))
    return reviews

def test_load_json_line_by_line(paths=None):
    '''
        Loads each file with load_json_line_by_line() and prints the
        number of reviews read together with the first review.

        paths: optional iterable of file paths to load. When omitted,
        the Yelp hotels, beauty/spas and restaurants files and the
        Amazon cell phones and accessories file are used.
    '''
    if paths is None:
        paths = [
            'inputs/yelp_dataset/yelp_hotels.json',
            'inputs/yelp_dataset/yelp_beauty_spas.json',
            'inputs/yelp_dataset/yelp_restaurants.json',
            'inputs/amazon/Cell_Phones_and_Accessories_5.json'
        ]

    for path in paths:
        print('Reading file {}\n'.format(path))
        reviews = load_json_line_by_line(path)
        print('{} reviews loaded\n'.format(len(reviews)))
        # A file without reviews yields an empty list; reviews[0]
        # would raise IndexError in that case.
        if reviews:
            print('Example review: {}\n'.format(reviews[0]))
        else:
            print('No example review available (no reviews loaded)\n')

test_load_json_line_by_line()

Reading file inputs/yelp_dataset/yelp_hotels.json



FileNotFoundError: [Errno 2] No such file or directory: 'inputs/yelp_dataset/yelp_hotels.json'

# Assignment 2

### Task 2.1

**Loading (and printing on screen) the vocabulary of the `aspects_hotels.csv` file, and directly using it to identify aspect references in the reviews. In particular, the aspects terms could be mapped by exact matching with nouns appearing in the reviews.**

We will compute a dictionary that maps every word to the aspect it is related to. It will usually be called `word_aspect_dict`. This makes it fast to look up which aspect a given word refers to.

The function `build_simple_vocab` creates this dictionary given a path to the file with the initial vocabulary.

In [None]:
aspect_hotels_path = 'inputs/aspects/aspects_hotels.csv'

def load_vocab(path):
    '''
        Reads a two-column CSV file (aspect, word) and returns a
        pandas Series indexed by aspect whose values are the lists
        of words read for that aspect.
    '''
    columns = ['aspect', 'word']
    table = pd.read_csv(path, names=columns)
    words_by_aspect = table.groupby('aspect')['word']
    return words_by_aspect.apply(list)

def create_word_to_aspects_dict(aspect_words_dict):
    '''
        Inverts an 'aspect to words' mapping into a 'word to aspect'
        defaultdict(str), so looking up an unknown word yields ''.
        When a word is listed under several aspects, the aspect
        iterated last is the one kept.
    '''
    pairs = (
        (word, aspect)
        for aspect, words in aspect_words_dict.items()
        for word in words
    )
    return defaultdict(str, pairs)
    
def build_simple_vocab(path):
    '''
        Builds the 'word to aspect' vocabulary from the aspects
        file found at the given path, without any expansion.
    '''
    return create_word_to_aspects_dict(load_vocab(path))

build_simple_vocab(aspect_hotels_path)

In the following cells we compute the aspects referenced by each review and display the result for the first few reviews.

In [None]:
def get_text_vocabulary(text):
    '''
        Tokenizes the given text and returns its alphanumeric
        tokens, lowercased, in order of appearance (repeated
        words are kept).
    '''
    words = []
    for token in nltk.word_tokenize(text):
        # Skip punctuation and any token holding non-alphanumeric characters
        if token.isalnum():
            words.append(token.lower())
    return words

def find_aspects_in_reviews(reviews, word_aspect_dict):
    '''
        Given a list of reviews (mappings with a 'reviewText' entry),
        returns a list with one set per review holding the aspects
        that the review references.

        word_aspect_dict maps each word to its aspect; words that are
        missing or mapped to '' are ignored. The mapping is only read:
        lookups go through .get(), so a defaultdict vocabulary does not
        gain an empty entry for every non-aspect word of the reviews.
    '''
    # Initialize references list
    refered_aspects = []

    for review in reviews:
        # Obtain review vocabulary
        r_vocab = get_text_vocabulary(review['reviewText'])

        # Collect the aspect of every word; '' marks "no aspect"
        aspects = {word_aspect_dict.get(word, '') for word in r_vocab}
        aspects.discard('')
        refered_aspects.append(aspects)

    return refered_aspects

In [None]:
# Load the hotel reviews and the simple vocabulary, then tag every review
hotel_reviews = load_json_line_by_line(hotels_path)
word_aspect_dict = build_simple_vocab(aspect_hotels_path)
refered_aspects = find_aspects_in_reviews(hotel_reviews, word_aspect_dict)

# Show the first three reviews next to their aspects; zip() stops at
# the shorter input, so slicing the reviews is enough.
for review, aspects in zip(hotel_reviews[:3], refered_aspects):
    print('\tReview: {} \n\tAspects: {} \n'.format(
        review['reviewText'], aspects))

### Task 2.2 (optional)

**Generating or extending the lists of terms of each aspect with synonyms extracted from WordNet.**

For this second task we expand the vocabulary using synonyms extracted from WordNet. The function `build_vocab()` is analogous to the previous `build_simple_vocab()` but takes these synonyms into account.

In [None]:
def get_word_synonims(word):
    '''
        Returns the lemma names of every WordNet synset of the
        given word, in lookup order. Names may appear more than once.
    '''
    return [
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    ]

def add_synonims_to_vocab(word_aspect_dict):
    '''
        Given a 'word to aspect' vocabulary, returns an extended deep
        copy in which the synonyms of each word are mapped to that
        word's aspect. The given vocabulary is not modified.

        Words already present in the given vocabulary with a non-empty
        aspect keep it: a synonym that is itself such a word is never
        reassigned, so the result does not depend on iteration order
        for those words. Entries whose aspect is empty are not expanded.
    '''
    result = copy.deepcopy(word_aspect_dict)
    for word, aspect in word_aspect_dict.items():
        # An empty aspect means "no aspect"; spreading it to the
        # synonyms could erase valid entries.
        if not aspect:
            continue
        for synonym in get_word_synonims(word):
            # .get() keeps a defaultdict vocabulary from growing on lookup
            if word_aspect_dict.get(synonym, '') == '':
                result[synonym] = aspect
    return result

def build_vocab(path):
    '''
        Builds the 'word to aspect' vocabulary from the aspects file
        at the given path and extends it with synonyms.
    '''
    base_vocab = create_word_to_aspects_dict(load_vocab(path))
    return add_synonims_to_vocab(base_vocab)

In [None]:
def test_expand_vocab(reviews_path, aspects_path,
                      n_displayed_reviews=3):
    '''
        Prints, for the first n_displayed_reviews reviews of the given
        file, the aspects found with the simple vocabulary and with
        the extended one. Returns both vocabularies as a
        (simple, extended) pair.
    '''
    shown_reviews = load_json_line_by_line(reviews_path)[:n_displayed_reviews]

    simple_vocab = build_simple_vocab(aspects_path)
    complex_vocab = build_vocab(aspects_path)

    simple_hits = find_aspects_in_reviews(shown_reviews, simple_vocab)
    extended_hits = find_aspects_in_reviews(shown_reviews, complex_vocab)

    template = '\tReview: {} \n\tAspects: {} \n\tExtended Vocab Aspects: {} \n'
    for position, review in enumerate(shown_reviews):
        print(template.format(review['reviewText'],
                              simple_hits[position],
                              extended_hits[position]))

    return simple_vocab, complex_vocab

_ = test_expand_vocab(hotels_path, aspect_hotels_path)

### Task 2.3 (optional)

**Managing vocabularies for additional Yelp or Amazon domains. See assignments 1.2 and 1.3**

Extending our previous functions to the new datasets is trivial. We simply need to load the correct aspects for each dataset. The following test function does the following for the Yelp hotels, Yelp restaurants and Amazon phones datasets:

- Load the reviews and build both the simple and complex vocabularies.
- Print the aspects found in the first few reviews with each vocabulary.
- Print the number of words in both the simple and extended vocabulary for comparison.

In [None]:
def test_build_vocabulary(n_displayed_reviews=1):
    '''
        For each dataset (name, reviews file, aspects file), prints the
        aspects found in the first n_displayed_reviews reviews with both
        vocabularies, followed by the number of words in each vocabulary.
    '''
    test_cases = [
        ('Yelp hotels',
         'inputs/yelp_dataset/yelp_hotels.json',
         'inputs/aspects/aspects_hotels.csv'),
        ('Yelp restaurants',
         'inputs/yelp_dataset/yelp_restaurants.json',
         'inputs/aspects/aspects_restaurants.csv'),
        ('Amazon cell phones and accesories',
         'inputs/amazon/Cell_Phones_and_Accessories_5.json',
         'inputs/aspects/aspects_phones.csv'),
    ]

    for name, reviews_path, aspects_path in test_cases:
        print('----- {} dataset -----\n'.format(name))

        vocabs = test_expand_vocab(
            reviews_path, aspects_path, n_displayed_reviews=n_displayed_reviews)
        simple_vocab, complex_vocab = vocabs

        print('Words in simple vocab: {}'.format(len(simple_vocab)))
        print('Words in complex vocab: {}\n'.format(len(complex_vocab)))

test_build_vocabulary(n_displayed_reviews=2)

### TODOS

- Hacer el 2.4
- En el 2.2, meter hiponimos e hipernonimos a parte de sinonimos?
- Utilizar spaCy para negaciones

# Assignment 3

In [5]:
# Positive and negative words of the opinion lexicon, stored as sets
pos_list = set(opinion_lexicon.positive())
neg_list = set(opinion_lexicon.negative())

In [6]:
# Word -> polarity label; unknown words default to ''. The negative
# lexicon is applied last, so it wins for words found in both sets.
all_polarity = defaultdict(str)
for label, lexicon in (('positive', pos_list), ('negative', neg_list)):
    for word in lexicon:
        all_polarity[word] = label

In [12]:
vocab_polarity = defaultdict(str)


TypeError: 'dict_keys' object is not subscriptable