Set your own project path

In [1]:
# Path to the project root -> change if needed.
# It can also be overridden without editing the notebook by setting the
# BREWERIES_PROJECT_PATH environment variable; when that variable is unset
# the default below is used.
import os

project_path = os.environ.get(
    'BREWERIES_PROJECT_PATH',
    '/Users/EB/Google Drive/Projects/breweries'
)

Import needed modules

In [2]:
# modules
import pickle
import spacy
import os

# Switch the working directory to <project>/modules/ and then import the
# Beer class (presumably the local `beeradvocate` package is resolved
# relative to the current directory -- confirm for your setup)
modules_dir = project_path + '/modules/'
os.chdir(modules_dir)

from beeradvocate.classes import Beer

Initialize spaCy

In [3]:
# Pipeline factory passed to spacy.load ("custom" in name only: per the
# original note, the standard pipeline would also do)
def spacy_pipe(nlp):
    """Return the pipeline components to run, in order.

    Args:
        nlp: object exposing ``tagger``, ``parser`` and ``entity`` attributes.

    Returns:
        tuple: ``(nlp.tagger, nlp.parser, nlp.entity)``.
    """
    component_names = ('tagger', 'parser', 'entity')
    return tuple(getattr(nlp, name) for name in component_names)

# Load the English model; spacy_pipe selects which pipeline components run
nlp = spacy.load('en', create_pipeline=spacy_pipe)

Look at one beer review

In [4]:
# load pickled beer reviews
# NOTE: unpickling can execute arbitrary code -- only load files you trust
os.chdir(project_path + '/data/')

# use a context manager so the file handle is closed once loading is done
with open('reviews_sample.p', 'rb') as reviews_file:
    beers = pickle.load(reviews_file)

# Functionality check on a single review: take beers[0].reviews[0][0]
# and run it through the spaCy pipeline in one step
review = nlp(beers[0].reviews[0][0])

### Lemmatization

In [5]:
# Reduce the review to the lemmas of its nouns, proper nouns and adjectives.
# This might be "enough" pre-processing for e.g. a cluster analysis.
kept_pos = ('NOUN', 'PROPN', 'ADJ')
lemmas = [token.lemma_ for token in review if token.pos_ in kept_pos]

print(lemmas)

['strong', 'sweet', 'aroma', 'chocolate', 'coffee', 'barley', 'dark', 'black', 'pour', 'creamy', 'head', 'beer', 'standard', 'porter', 'taste', 'chocolate', 'coffee', 'touch', 'earthy', 'spice', 'nutmeg', 'cinnamon', 'thick', 'stout', 'good', 'beer', 'sam', 'one', 'good', 'year', 'glad', 'available', '6-pack', 'same', 'old', 'fezziwig']


### Parser

In [7]:
# Parser
# Print the lemmatized form of every noun chunk longer than one token.
# Note: if dependency parsing is not needed, load the model with
#       spacy.load('en', parser = False) to increase speed
for chunk in review.noun_chunks:
    # skip single-token chunks
    if len(chunk) <= 1:
        continue
    print(chunk.lemma_)

* * 11/21/12 strong sweet aroma
dark black pour
creamy head
the beer
standard porter taste
, plus a touch
earthy spice
thick , almost a stout
a really good beer
this one
the year
a 6-pack
old fezziwig


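Some of these noun chunks (e.g. "creamy head", "earthy spice") are more interesting than others (e.g. "this one").
We can use a rule-based filter to extract them: keep only chunks that are longer than one token and consist solely of nouns, proper nouns, and adjectives.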

In [9]:
# Rule-based filter: print only noun chunks that are longer than one token
# and consist entirely of proper nouns, nouns and adjectives.
content_pos = ('PROPN', 'NOUN', 'ADJ')

# `chunk` rather than `np`, to avoid clashing with the usual numpy alias
for chunk in review.noun_chunks:
    only_content_words = all(token.pos_ in content_pos for token in chunk)

    # Combine the two conditions with logical `and`: bitwise `&` binds
    # tighter than `==` and `>`, so it would silently regroup the expression.
    if only_content_words and len(chunk) > 1:
        print(chunk.lemma_)

dark black pour
creamy head
standard porter taste
earthy spice
old fezziwig
