Set your own project path

In [1]:
# set path to project -> export BREWERIES_PROJECT_PATH, or edit the fallback below
import os

# The environment variable wins when it is set; otherwise the original
# hard-coded location is used, so existing setups keep working unchanged.
project_path = os.environ.get(
    'BREWERIES_PROJECT_PATH',
    '/Users/EB/Google Drive/Projects/breweries',
)

Import needed modules

In [2]:
# modules
import pickle
import spacy
import os

Initialize spaCy

In [3]:
# build a "custom" spaCy pipeline (the standard one would work as well)
def spacy_pipe(nlp):
    """Return the tagger, parser and entity components of ``nlp`` as a tuple, in that order."""
    components = (nlp.tagger, nlp.parser, nlp.entity)
    return components

# Load the English model and let spacy_pipe assemble its pipeline
nlp = spacy.load(
    'en',
    create_pipeline=spacy_pipe,
)

Look at one beer review

In [4]:
# load pickled beer reviews
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source
# os.chdir changes the working directory for the rest of the session
os.chdir(project_path + '/data/')
# the context manager closes the file handle even if unpickling fails
with open('reviews_sample.p', 'rb') as reviews_file:
    data = pickle.load(reviews_file)

# start with one review to check functionality
# presumably data[1] holds the review texts -- verify against the pickled structure
review_text = data[1][10]
# keep the raw text under its own name; `review` is the parsed spaCy document
review = nlp(review_text)

### Lemmatization

In [5]:
# Reduce the review to the lemmas of its nouns, proper nouns and adjectives.
# This might already be "enough" pre-processing for e.g. cluster analysis.
interesting_pos = ('NOUN', 'PROPN', 'ADJ')
lemmas = []
for word in review:
    if word.pos_ in interesting_pos:
        lemmas.append(word.lemma_)
print(lemmas)

['big', 'tick', 'my', 'russian', 'river', '–tion', 'quest', 'my', 'way', 'straggler', 'big', 'thanks', 'bilbosnuts', 'one', 'growler', 'green', 'man', 'snifter', 'my', '–tion', 'glass', 'florida', 'aasher', 'wedding', '\uf04c', 'huge', 'pop', 'deep', 'ruby', 'red', 'brown', 'finger', 'white', 'head', 'that', 'minimal', 'amount', 'excellent', 'glass', 'aroma', 'sweet', 'malt', 'dark', 'chocolate', 'dark', 'fruit', 'smoke', 'bread', 'spice', 'flavor', 'sweet', 'malt', 'smoke', 'fruit', 'bread', 'dark', 'fruit', 'subtle', 'earth', 'subtle', 'spice', 'light', 'palate', 'medium', 'high', 'carbonation', 'good', 'brew', 'bit', 'one', 'that', 'sip', 'style', 'what', 'one', 'smoke', 'note', 'glass', 'which', 'nice', 'touch', 'that', 'level', 'complexity', 'my', 'palate', 'nice', 'touch', 'roastiness', 'dark', 'fruit', 'profile', 'one', 'that', 'glad', 'chance', 'rest', 'brewing', 'russian', 'river']


### Parser

In [6]:
# Parser
# Collect the lemmatized noun chunks that span more than one token
# Note: if dependency parsing is not needed, use:
#       spacy.load('en', parser = False) to increase speed
multi_token_chunks = []
for chunk in review.noun_chunks:
    if len(chunk) > 1:
        multi_token_chunks.append(chunk.lemma_)
print(multi_token_chunks)

['another big tick', 'my russian river –tion quest', 'the straggler', 'big thanks', 'this one', 'a green man snifter', 'my –tion glass', 'a huge pop', 'a deep ruby red brown', 'a one finger off - white head', 'a minimal amount', 'the glass', 'the aroma', 'sweet malt', 'dark chocolate', 'dark fruit', 'the flavor', 'sweet malt', 'dark fruit', 'subtle earth', 'subtle spice', 'the palate', 'medium - high carbonation', 'a good brew', 'quite a bit', 'this one', 'each sip', 'the style', 'this one', 'the smoke note', 'the glass', 'a very nice touch', 'another level', 'my palate', 'a nice touch', 'dark fruit profile', 'the chance', 'the rest']


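Some of these noun chunks (e.g. "dark chocolate", "subtle spice") are more interesting than others (e.g. "this one").
We can use a rule-based system to extract them: keep only the chunks that are longer than one token and consist entirely of nouns, proper nouns and adjectives.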

In [7]:
# Print the lemma of every noun chunk that is longer than one token and is
# made up exclusively of proper nouns, nouns and adjectives.
wanted_pos = {'PROPN', 'NOUN', 'ADJ'}

# `chunk` instead of `np`: a top-level loop variable stays in the notebook
# namespace, and `np` is the conventional alias for numpy.
for chunk in review.noun_chunks:
    # Logical `and`, not bitwise `&`: the old condition only behaved correctly
    # because `&` binds tighter than the comparisons, which turned it into the
    # chained comparison `tok_count == len(toks) > 1`.
    if len(chunk) > 1 and all(token.pos_ in wanted_pos for token in chunk):
        print(chunk.lemma_)

my russian river –tion quest
big thanks
my –tion glass
sweet malt
dark chocolate
dark fruit
sweet malt
dark fruit
subtle earth
subtle spice
my palate
dark fruit profile


### Entity Recognition

In [8]:
# Named entities with their labels -- not needed here, but possibly useful for other projects
entity_labels = [(ent, ent.label_) for ent in review.ents]
print(entity_labels)

[(Russian River, 'LOC'), (BilbosNuts, 'GPE'), (Green Man, 'ORG'), (Florida, 'GPE'), (one, 'CARDINAL'), (one, 'CARDINAL'), (Russian, 'NORP')]
