In [None]:
ls yelp_data/

In [1]:
import os
import codecs

# folder (relative to the notebook) that the Yelp dataset JSON files are read from
data_directory = os.path.join('yelp_data')

In [None]:
businesses_filepath = os.path.join(
    data_directory, 'yelp_academic_dataset_business.json')

# peek at the first line of the businesses file to see what one record looks like
with codecs.open(businesses_filepath, encoding='utf_8') as businesses_file:
    first_business_record = businesses_file.readline()

print(first_business_record)

In [None]:
%%time
review_json_filepath = os.path.join(
    data_directory, 'yelp_academic_dataset_review.json')

# peek at the first line of the reviews file to see what one record looks like
with codecs.open(review_json_filepath, encoding='utf_8') as reviews_file:
    first_review_record = reviews_file.readline()

print(first_review_record)

In [None]:

# NOTE: exploratory scratch code, intentionally left commented out.
# When enabled it prints the categories of every business whose
# categories do not contain 'Restaurants'.

# import json

# restaurant_ids = set()

# # open the businesses file
# with codecs.open(businesses_filepath, encoding='utf_8') as f:
    
#     # iterate through each line (json record) in the file
#     for business_json in f:
        
#         # convert the json record to a Python dict
#         business = json.loads(business_json)
        
#         # if this business is not a restaurant, skip to the next one
#         if  business['categories'] is None:
#             continue
#         elif 'Restaurants' not in business['categories']:
#             print(business['categories'])
    

In [None]:
%%time
import json

# Stream the businesses file (one JSON record per line) and collect the
# business_id of every record whose categories contain 'Restaurants'.
# Records without categories are ignored. The result is built directly as a
# frozenset because it is never modified afterwards.
with codecs.open(businesses_filepath, encoding='utf_8') as businesses_file:
    parsed_businesses = (json.loads(line) for line in businesses_file)
    restaurant_ids = frozenset(
        record[u'business_id']
        for record in parsed_businesses
        if record['categories'] is not None
        and 'Restaurants' in record['categories']
    )

# report how many unique restaurant ids were found
print( '{:,}'.format(len(restaurant_ids)), u'restaurants in the dataset.')

In [4]:
# folder that every derived (intermediate) file of this notebook is written to
intermediate_directory = os.path.join( 'yelp_data')

# plain-text file holding the text of the restaurant reviews, one review per line
review_txt_filepath = os.path.join(intermediate_directory,
                                   'review_text_all.txt')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    
    review_count = 0

    # create & open a new file in write mode
    with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

        # open the existing review json file
        with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print( '''Text from {:,} restaurant reviews
              written to the new txt file.'''.format(review_count))
    
else:
    
    # Count the lines of the already-written file. Starting from 0 keeps
    # review_count defined when the file is empty and makes it a true count
    # (not a last index) in both branches.
    review_count = 0
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for line in review_txt_file:
            review_count += 1
        
    print( 'Text from {:,} restaurant reviews in the txt file.'.format(review_count))

## Spacy

In [3]:
%%time
import spacy
import pandas as pd
import itertools as it

# load the 'en_core_web_lg' spaCy pipeline; every parse in this notebook goes through `nlp`
nlp = spacy.load('en_core_web_lg')

CPU times: user 15.4 s, sys: 5.06 s, total: 20.5 s
Wall time: 15.4 s


### Let's grab a sample review to play with.



In [5]:
# read only the 9th line (index 8) of the review text file
with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
    ninth_line = list(it.islice(review_txt_file, 8, 9))

# turn the escaped line breaks back into real newlines
sample_review = ninth_line[0].replace('\\n', '\n')

print(sample_review)

Food is pretty good, not gonna lie. BUT you have to make sacrifices if you choose to eat there. It literally takes an hour to an hour and a half to deliver food. Seriously. EVERY SINGLE TIME. Doesnt matter if we order at 8am, 10am or 1pm. Never fails, they take F-O-R-E-V-E-R. If you dont get what you ordered or you are upset by them delivering your breakfast around LUNCH time, be ready to have the owner talk down to you and be a total bitch to you for i dont know, just wanting what you pay for?! 

Its over priced. But its decently tasteful food. Takes forever. Owners a witch. And i'm pretty sure that they continuing forget to pack my extra ranch just to piss me off. 

End Rant. 

PS- I've never gone in there to eat because i frankly, i'd rather tip the nice delivery driver then the ignorant imbeciles that work in the dining area. 

PPS- My hot chocolate today was cold. They should call it Cold Chocolate. Or start caring if their hot chocolate is hot. One of the two would be great!



### Hand the review text to spaCy, and be prepared to wait...



In [None]:
%%time
# run the loaded spaCy pipeline over the sample review
parsed_review = nlp(sample_review)

In [None]:
# show the parsed result
print (parsed_review)

In [None]:
# list the sentences spaCy detected, numbered from 1
for sentence_number, sentence in enumerate(parsed_review.sents, start=1):
    print('Sentence {}:\n{}'.format(sentence_number, sentence))

### What about named entity detection?



In [None]:
# list the named entities spaCy detected with their labels, numbered from 1
for entity_number, entity in enumerate(parsed_review.ents, start=1):
    print('Entity {}: {} - {}'.format(entity_number, entity, entity.label_))

### What about part of speech tagging?



In [None]:
# per-token surface text and part-of-speech tag
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

# side-by-side table, one row per token
pd.DataFrame({'token_text': token_text, 'POS': token_pos})

## Normalization 
- stemming  
- lemmatization (normalized word form)
- shape analysis

In [None]:
# per-token lemma and shape strings
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

# side-by-side table, one row per token (token_text comes from the POS cell)
pd.DataFrame({
    'token_text': token_text,
    'token_lemma': token_lemma,
    'token_shape': token_shape,
})

## Token-level entity analysis

In [None]:
# Use the underscore-suffixed attribute: token.ent_type_ is the readable
# entity label, whereas token.ent_type is only its integer ID.
token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob = [token.ent_iob_ for token in parsed_review]

# one row per token: text, entity label, inside/outside/begin marker
pd.DataFrame(list(zip(token_text,token_entity_type,token_entity_iob)),columns=['token_text','entity_type','inside_outside_beguin'])

## variety of token-level attributes

- stopwords
- punctuation
- whitespace
- represents a number
- is the token in spaCy's default vocabulary?

In [None]:
# one tuple of raw attributes per token
token_att = [(token.orth_,
              token.prob,
              token.is_stop,
              token.is_punct,
              token.is_space,
              token.like_num,
              token.is_oov) for token in parsed_review]

attribute_columns = ['text',
                     'log_prob',
                     'stop',
                     'piunctuation?',
                     'whitespace',
                     'num?',
                     'out_of_vocab']

# the boolean flag columns are everything from 'stop' through 'out_of_vocab'
flag_columns = attribute_columns[2:]

raw_attributes = pd.DataFrame(token_att, columns=attribute_columns)

# Replace each flag column with a new 'Yes' / ' ' string column. Building new
# columns avoids the deprecated DataFrame.applymap and avoids writing strings
# into bool-typed columns through .loc.
df = raw_attributes.assign(**{
    column: raw_attributes[column].map(lambda flag: 'Yes' if flag else ' ')
    for column in flag_columns
})



In [None]:
# display the token attribute table built in the previous cell
df

## Phrase Modeling

In [8]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [9]:
def punct_space(token):
    """
    Tell whether a token should be dropped because it is
    nothing but punctuation or nothing but whitespace.

    `token` only needs `is_punct` and `is_space` attributes.
    Returns `token.is_punct` when that is truthy, otherwise `token.is_space`.
    """
    is_punctuation = token.is_punct
    return is_punctuation or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text

    filename: path of a UTF-8 text file with one review per line, in which
        the line breaks inside a review are stored as the two-character
        sequence backslash + n.

    Yields one string per line of the file (the trailing newline is kept),
    with every escaped line break turned back into a real newline.
    """
    
    # newline='\n' makes the reader end a record at '\n' only. Iterating a
    # codecs.open() stream splits with str.splitlines(), which also breaks on
    # '\r', '\x85', '\u2028' and similar characters inside a review and would
    # cut that review into several pieces.
    with open(filename, encoding='utf_8', newline='\n') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences

    filename: path of the review text file, read through line_review().

    Yields one string per sentence found by spaCy: the lemmas of the
    sentence's tokens joined by single spaces, with tokens for which
    punct_space() is truthy left out.
    """
    
    # n_threads is deliberately not passed to nlp.pipe: spaCy 2.1+ ignores it
    # (deprecated) and spaCy 3 rejects it as an unknown keyword argument.
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [10]:
# output file for the lemmatized sentences
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

Let's use the lemmatized_sentence_corpus generator to loop over the original review text, segmenting the reviews into individual sentences and normalizing the text. We'll write this data back out to a new file (unigram_sentences_all), with one normalized sentence per line. We'll use this data for learning our phrase models.

In [11]:
%%time

# slow step: parses every review with spaCy. Set the condition to False
# to skip it and reuse a file written by an earlier run.
if 1 == 1:

    # lazy generator: nothing is parsed until the loop below pulls from it
    normalized_sentences = lemmatized_sentence_corpus(review_txt_filepath)

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as unigram_file:
        # one normalized sentence per output line
        for normalized_sentence in normalized_sentences:
            unigram_file.write(normalized_sentence + '\n')

KeyboardInterrupt: 

If your data is organized like our unigram_sentences_all file now is — a large text file with one document/sentence per line — gensim's LineSentence class provides a convenient iterator for working with other gensim components. It streams the documents/sentences from disk, so that you never have to hold the entire corpus in RAM at once. This allows you to scale your modeling pipeline up to potentially very large corpora.

In [None]:
# iterable over the sentences stored in the unigram sentence file
unigram_sentences = LineSentence(unigram_sentences_filepath)

Let's take a look at a few sample sentences in our new, transformed file.



In [None]:
# print sentences 230-239, each followed by an empty line
for unigram_tokens in it.islice(unigram_sentences, 230, 240):
    print(u' '.join(unigram_tokens), end=u'\n\n')

Next, we'll learn a phrase model that will link individual words into two-word phrases. We'd expect words that together represent a specific concept, like "ice cream", to be linked together to form a new, single token: "ice_cream".

In [None]:
# file the first-order (bigram) phrase model is saved to and loaded from
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')


In [None]:
%%time

# Training is a bit time consuming, so the fitted model is cached on disk:
# reuse the saved model when the file exists, otherwise train it on the
# unigram sentences and save it. Delete the file to force retraining.
if os.path.exists(bigram_model_filepath):

    # load the finished model from disk
    bigram_model = Phrases.load(bigram_model_filepath)

else:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)

Now that we have a trained phrase model for word pairs, let's apply it to the review sentences data and explore the results.



In [None]:
# output file for the sentences after the bigram phrase model has been applied
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [None]:
%%time

# slow step, disabled by default: set the condition to True
# to (re)write the bigram sentence file yourself.
if 0 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as bigram_file:

        # apply the bigram model to each unigram sentence, one result per line
        for unigram_tokens in unigram_sentences:
            bigram_file.write(u' '.join(bigram_model[unigram_tokens]) + '\n')

In [None]:
# iterable over the sentences stored in the bigram sentence file
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
# print sentences 230-239, each followed by an empty line
for bigram_tokens in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_tokens), end=u'\n\n')

Looks like the phrase modeling worked! We now see two-word phrases, such as "ice_cream" and "apple_pie", linked together in the text as a single token. Next, we'll train a second-order phrase model. We'll apply the second-order phrase model on top of the already-transformed data, so that incomplete word combinations like "vanilla_ice cream" will become fully joined to "vanilla_ice_cream". No disrespect intended to Vanilla Ice, of course.

In [None]:
# file the second-order (trigram) phrase model is saved to and loaded from
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [None]:
%%time

# Training is a bit time consuming, so the fitted model is cached on disk:
# reuse the saved model when the file exists, otherwise train it on the
# bigram sentences and save it. Delete the file to force retraining.
if os.path.exists(trigram_model_filepath):

    # load the finished model from disk
    trigram_model = Phrases.load(trigram_model_filepath)

else:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)

We'll apply our trained second-order phrase model to our first-order transformed sentences, write the results out to a new file, and explore a few of the second-order transformed sentences.

In [None]:
# output file for the sentences after the trigram phrase model has been applied
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [None]:
%%time

# slow step, disabled by default: set the condition to True
# to (re)write the trigram sentence file yourself.
if 0 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as trigram_file:

        # apply the trigram model to each bigram sentence, one result per line
        for bigram_tokens in bigram_sentences:
            trigram_file.write(u' '.join(trigram_model[bigram_tokens]) + '\n')

In [None]:
# iterable over the sentences stored in the trigram sentence file
trigram_sentences = LineSentence(trigram_sentences_filepath)


In [None]:
# print sentences 230-239, each followed by an empty line
for trigram_tokens in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_tokens), end=u'\n\n')

Looks like the second-order phrase model was successful. We're now seeing three-word phrases, such as "vanilla_ice_cream" and "cinnamon_ice_cream".

The final step of our text preparation process circles back to the complete text of the reviews. We're going to run the complete text of the reviews through a pipeline that applies our text normalization and phrase models.

In addition, we'll remove stopwords at this point. Stopwords are very common words, like a, the, and, and so on, that serve functional roles in natural language, but typically don't contribute to the overall meaning of text. Filtering stopwords is a common procedure that allows higher-level NLP modeling techniques to focus on the words that carry more semantic weight.

Finally, we'll write the transformed text out to a new file, with one review per line.

In [None]:
# output file for the fully transformed reviews, one review per line
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    # Stop words of the loaded pipeline's language. The spaCy 1.x name
    # spacy.en.STOPWORDS no longer exists in spaCy 2+.
    stop_words = nlp.Defaults.stop_words

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        # n_threads is deliberately not passed to nlp.pipe: spaCy 2.1+ ignores
        # it (deprecated) and spaCy 3 rejects it as an unknown keyword argument.
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in stop_words]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')


Let's preview the results. We'll grab one review from the file with the original, untransformed text, grab the same review from the file with the normalized and transformed text, and compare the two.

In [None]:
# show review number 12 (index 11) before and after the transformation
print( u'Original:' + u'\n')

for review in it.islice(line_review(review_txt_filepath), 11, 12):
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError under Python 3
    print(review)

print( u'----' + u'\n')
print( u'Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print (review)


You can see that most of the grammatical structure has been scrubbed from the text — capitalization, articles/conjunctions, punctuation, spacing, etc. However, much of the general semantic meaning is still present. Also, multi-word concepts such as "friday_night" and "above_average" have been joined into single tokens, as expected. The review text is now ready for higher-level modeling.

## Topic Modeling with Latent Dirichlet Allocation (LDA)
