In [1]:
import json
import csv
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')
from nltk.corpus import wordnet as wn
import pandas as pd
from nltk import word_tokenize

# Assignment 1

In [2]:
# Relative path of the Yelp hotel reviews file read by the loader below.
hotels_path = 'inputs/yelp_dataset/yelp_hotels.json'

def load_json_line_by_line(path):
    """Load a file that stores one JSON value per line into a list.

    A line may end with a trailing comma (as when the records are written as
    the elements of a JSON array, one per line). Lines that hold fewer than
    two characters once whitespace and the trailing comma are removed (blank
    lines, a lone ``[`` or ``]``) are skipped.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        list: The decoded values, in file order.

    Raises:
        json.JSONDecodeError: If a non-skipped line is not valid JSON.
    """
    reviews = []
    with open(path, encoding='utf-8') as file:
        for line in file:
            # Strip whitespace instead of slicing off a fixed number of
            # characters: the last line of a file may not end with a newline,
            # and a blind ``line[:-1]`` would then cut off its closing brace.
            record = line.strip()
            # If the line ends with a comma, remove it.
            if record.endswith(','):
                record = record[:-1].rstrip()
            # Nothing this short can be a review record (e.g. "[" or "]").
            if len(record) < 2:
                continue
            reviews.append(json.loads(record))
    return reviews

def test_load_json_line_by_line():
    """Run the loader over each dataset file and print a short summary.

    For every file this prints its path, the number of records loaded and
    the first record.
    """
    dataset_files = (
        'inputs/yelp_dataset/yelp_hotels.json',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Musical_Instruments_5.json',
    )

    for dataset_file in dataset_files:
        print('Reading file {}\n'.format(dataset_file))
        loaded = load_json_line_by_line(dataset_file)
        total = len(loaded)
        print('{} reviews loaded\n'.format(total))
        first = loaded[0]
        print('Example review: {}\n'.format(first))

# Exercise the loader on all three dataset files; the single-file load below
# is left disabled.
#reviews = load_json_line_by_line(hotels_path)
test_load_json_line_by_line()

Reading file inputs/yelp_dataset/yelp_hotels.json

5034 reviews loaded

Example review: {'reviewerID': 'qLCpuCWCyPb4G2vN-WZz-Q', 'asin': '8ZwO9VuLDWJOXmtAdc7LXQ', 'summary': 'summary', 'reviewText': "Great hotel in Central Phoenix for a stay-cation, but not necessarily a place to stay out of town and without a car. Not much around the area, and unless you're familiar with downtown, I would rather have a guest stay in Old Town Scottsdale, etc. BUT if you do stay here, it's awesome. Great boutique rooms. Awesome pool that's happening in the summer. A GREAT rooftop patio bar, and a very very busy lobby with Gallo Blanco attached. A great place to stay, but have a car!", 'overall': 4.0}

Reading file inputs/yelp_dataset/yelp_restaurants.json

158430 reviews loaded

Example review: {'reviewerID': 'rLtl8ZkDX5vH5nAx9C3q5Q', 'asin': '9yKzy9PApeiPPOUJEtnvkg', 'summary': 'summary', 'reviewText': 'My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect whic

# Assignment 2

In [3]:
# Relative path of the CSV that lists the vocabulary words of each hotel aspect.
aspect_hotels_path = 'inputs/aspects/aspects_hotels.csv'

def load_vocab(path):
    """Read an aspect vocabulary CSV and group its words by aspect.

    The file is read without a header row; its two columns are named
    ``aspect`` and ``word``.

    Args:
        path: Path of the CSV file.

    Returns:
        pandas.Series: Indexed by aspect, each value being the list of words
        of that aspect in file order.
    """
    column_names = ["aspect", "word"]
    table = pd.read_csv(path, names=column_names)
    words_by_aspect = table.groupby('aspect')['word']
    return words_by_aspect.apply(list)

# Load the hotel aspect vocabulary and show it.
aspect_hotels = load_vocab(aspect_hotels_path)
print(aspect_hotels)

aspect
amenities                            [amenity, amenities, services]
atmosphere        [atmosphere, atmospheres, ambiance, ambiances,...
bar                              [bar, bars, bartender, bartenders]
bathrooms         [bathroom, bathrooms, bath, baths, bathtub, ba...
bedrooms          [bedroom, bedrooms, bed, beds, pillow, pillows...
booking           [booking, book, reservation, reservations, res...
breakfast         [breakfast, breakfasts, morning, mornings, toa...
building          [building, buildings, architecture, architectu...
checking          [checkin, check-in, check in, check ins, check...
cleanliness       [cleanliness, clean, cleaned, cleaning, dirt, ...
coffee                    [coffee, coffees, cafe, cafes, tea, teas]
cuisine           [cuisine, cuisines, dishe, dishes, food, foods...
dinner            [dinner, dinners, evening meal, evening menu, ...
drinks                    [drink, drinks, beer, beers, wine, wines]
events            [event, events, activit

In [24]:
def get_text_vocabulary(text):
    """Return the sorted list of distinct lower-cased words found in ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; only tokens made up
    entirely of alphanumeric characters are kept.

    Args:
        text: The string to analyse.

    Returns:
        list: Unique lower-cased tokens in ascending order.
    """
    unique_words = {
        token.lower()
        for token in nltk.word_tokenize(text)
        if token.isalnum()
    }
    return sorted(unique_words)
    

def find_references(reviews, vocab):
    """Find, for every review, the aspects whose vocabulary it mentions.

    An aspect is referenced by a review when at least one of the aspect's
    words appears in the review's vocabulary as returned by
    ``get_text_vocabulary``.

    Args:
        reviews: Iterable of mappings, each with a ``'reviewText'`` entry.
        vocab: Mapping-like object (anything offering ``keys()`` and
            ``vocab[key]``) from aspect name to an iterable of words.

    Returns:
        list: One list per review, in input order, holding the referenced
        aspect names in the order ``vocab.keys()`` yields them. Each aspect
        appears at most once per review.

    Raises:
        KeyError: If a review has no ``'reviewText'`` entry.
    """
    references = []
    for review in reviews:
        # Build a set once per review so every membership test below is a
        # hash lookup instead of a scan through the sorted word list.
        review_words = set(get_text_vocabulary(review['reviewText']))

        # NOTE(review): get_text_vocabulary keeps only purely alphanumeric
        # tokens, so vocabulary entries containing spaces or hyphens
        # presumably can never match here - confirm whether that is intended.
        rev_refs = [
            key
            for key in vocab.keys()
            # any() stops at the first matching word, like the original break.
            if any(word in review_words for word in vocab[key])
        ]
        references.append(rev_refs)

    return references
    

In [22]:
# Load the hotel reviews, find the aspects each one references, and show the
# result for the first 20 reviews.
hotel_reviews = load_json_line_by_line(hotels_path)
refs = find_references(hotel_reviews,aspect_hotels)
print(refs[0:20])

[['bar', 'building', 'pool', 'shopping', 'transportation'], ['atmosphere', 'bathrooms', 'bedrooms', 'breakfast', 'drinks', 'pool', 'price', 'shopping'], ['breakfast', 'checking', 'facilities', 'location', 'shopping', 'spa', 'staff'], ['bathrooms', 'breakfast', 'building', 'cleanliness', 'cuisine', 'drinks', 'location', 'pool', 'spa'], ['bathrooms', 'bedrooms', 'checking', 'cuisine', 'drinks', 'events', 'location', 'lunch', 'pool', 'restaurant', 'service', 'shopping', 'staff'], ['bar', 'booking', 'cuisine', 'events', 'facilities', 'service', 'staff', 'temperature'], ['bathrooms', 'bedrooms', 'building', 'gym', 'pool', 'service'], ['bathrooms', 'bedrooms', 'building', 'cuisine', 'events', 'pool', 'price', 'restaurant', 'shopping', 'spa'], ['bedrooms', 'internet', 'staff'], ['bedrooms', 'building', 'cleanliness', 'gym', 'pool'], ['restaurant'], ['bar', 'pool', 'staff'], ['service'], ['bar', 'bathrooms', 'building', 'cuisine', 'drinks', 'events', 'location', 'pool', 'restaurant', 'service'

In [6]:
def count_words(vocab):
    """Return the total number of words across all entries of ``vocab``.

    Args:
        vocab: Mapping-like object (offering ``keys()`` and ``vocab[key]``)
            whose values are sized collections of words.

    Returns:
        int: Sum of the lengths of every entry; 0 when ``vocab`` is empty.
    """
    return sum(len(vocab[aspect]) for aspect in vocab.keys())

def get_word_synonims(word):
    """Collect the WordNet lemma names of every synset of ``word``.

    Args:
        word: The word to look up in WordNet.

    Returns:
        list: Lemma names in synset order. Nothing is de-duplicated, so a
        name shared by several synsets appears more than once. Empty when
        WordNet has no synset for ``word``.
    """
    return [
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    ]

def expand_vocab(vocab):
    """Enlarge each aspect's word list with the WordNet synonyms of its words.

    Args:
        vocab: Mapping-like object (offering ``keys()`` and ``vocab[key]``)
            from aspect name to an iterable of words.

    Returns:
        dict: Aspect name -> list holding the original words plus every name
        returned by ``get_word_synonims`` for them, without duplicates. The
        list comes from a set, so its order is not defined.
    """
    expanded = {}
    for aspect in vocab.keys():
        seed_words = vocab[aspect]
        # Start from the original words, then add the synonyms of each one.
        pool = set(seed_words)
        for seed in seed_words:
            pool.update(get_word_synonims(seed))
        expanded[aspect] = list(pool)

    return expanded
            
        
# Compare the vocabulary size before and after adding WordNet synonyms.
print("Pre-expansion length: {}".format(count_words(aspect_hotels)))
expanded = expand_vocab(aspect_hotels)
print("Post-expansion length: {}".format(count_words(expanded)))
    

Pre-expansion length: 282
Post-expansion length: 1407


In [25]:
def test_expand_vocabulary_other_datasets(n_show=3):
    """Print the text vocabulary of the first reviews of each dataset file.

    Despite its name, this only calls ``get_text_vocabulary``; no vocabulary
    expansion happens here.

    Args:
        n_show: Number of leading reviews per file whose vocabulary is
            printed.
    """
    dataset_files = (
        'inputs/yelp_dataset/yelp_hotels.json',
        'inputs/yelp_dataset/yelp_restaurants.json',
        'inputs/amazon/Musical_Instruments_5.json',
    )

    for dataset_file in dataset_files:
        print('Obtaining vocab from file file {}\n'.format(dataset_file))

        loaded = load_json_line_by_line(dataset_file)
        for position in range(n_show):
            review_text = loaded[position]['reviewText']
            print(get_text_vocabulary(review_text))

# Show the vocabulary of the first three reviews of every dataset.
test_expand_vocabulary_other_datasets()

Obtaining vocab from file file inputs/yelp_dataset/yelp_hotels.json

['a', 'and', 'area', 'around', 'attached', 'awesome', 'bar', 'blanco', 'boutique', 'busy', 'but', 'car', 'central', 'do', 'downtown', 'etc', 'familiar', 'for', 'gallo', 'great', 'guest', 'happening', 'have', 'here', 'hotel', 'i', 'if', 'in', 'it', 'lobby', 'much', 'necessarily', 'not', 'of', 'old', 'out', 'patio', 'phoenix', 'place', 'pool', 'rather', 'rooftop', 'rooms', 'scottsdale', 'stay', 'summer', 'that', 'the', 'to', 'town', 'unless', 'very', 'with', 'without', 'would', 'you']
['10', '90', 'a', 'about', 'addition', 'again', 'almost', 'and', 'are', 'as', 'awarding', 'awhile', 'because', 'beer', 'before', 'best', 'big', 'booked', 'bottle', 'brought', 'but', 'by', 'call', 'can', 'cheap', 'choked', 'class', 'coors', 'corkscrew', 'cost', 'could', 'cow', 'days', 'deal', 'degree', 'described', 'desk', 'different', 'disgusting', 'do', 'dollar', 'drink', 'eat', 'entered', 'feel', 'felt', 'few', 'for', 'forgot', 'friend',