In [1]:
# standard library
import pickle
import random
import string
import time
from random import sample

# numerical / data stack
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# nltk models
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

#spaCy
import spacy
from spacy import displacy

# gensim models
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.phrases import Phrases, Phraser

# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim



# Meet spaCy - "It's minimal and opinionated"

spaCy is a free, **open-source** library for advanced **Natural Language Processing (NLP)** in Python.

Features:

-  Tokenization
-  POS Tagging
-  Dependency Parsing
-  Lemmatization
-  Sentence Detection
-  Entity Recognition

And more...

https://spacy.io/


In [2]:
# Load the large English spaCy package; `nlp` is the pipeline object the following cells call.
nlp = spacy.load('en_core_web_lg')

In [3]:
%%time
# Two-sentence sample review used by the feature tour in the following cells.
text = 'I love this restaurant in Newcastle. Delicious cocktails and it has a really friendly atmosphere.'

# Run the loaded pipeline on the raw string; the result is displayed as the cell output.
doc = nlp(text)
doc

Wall time: 15.6 ms


In [4]:
# One list per token attribute; all lists are aligned with the tokens of `doc`.
token_text = [tok.text for tok in doc]
token_lemma = [tok.lemma_ for tok in doc]
token_pos = [tok.pos_ for tok in doc]
token_entity = [tok.ent_type_ for tok in doc]
token_stop = [tok.is_stop for tok in doc]
token_vec = [tok.vector for tok in doc]

headers = ['token_text', 'token_lemma', 'token_pos', 'token_entity', 'token_is_stop', 'token_vec']

# One row per token, with the fields in the same order as `headers`.
token_rows = list(zip(token_text, token_lemma, token_pos, token_entity, token_stop, token_vec))
pd.DataFrame(token_rows, columns=headers)

Unnamed: 0,token_text,token_lemma,token_pos,token_entity,token_is_stop,token_vec
0,I,-PRON-,PRON,,False,"[0.18733, 0.40595, -0.51174, -0.55482, 0.03971..."
1,love,love,VERB,,False,"[0.13949, 0.53453, -0.25247, -0.12565, 0.04874..."
2,this,this,DET,,False,"[-0.087595, 0.35502, 0.063868, 0.29292, -0.236..."
3,restaurant,restaurant,NOUN,,False,"[0.47022, 0.027832, 0.33726, -0.60538, 1.1151,..."
4,in,in,ADP,,False,"[0.089187, 0.25792, 0.26282, -0.029365, 0.4718..."
5,Newcastle,newcastle,PROPN,GPE,False,"[0.39131, -0.4687, -0.26697, -0.45751, 0.85533..."
6,.,.,PUNCT,,False,"[0.012001, 0.20751, -0.12578, -0.59325, 0.1252..."
7,Delicious,delicious,ADJ,,False,"[-0.27801, -0.14519, 0.49453, 0.12529, -0.0576..."
8,cocktails,cocktail,NOUN,,False,"[-0.054778, -0.17894, 0.70387, -0.51988, 0.156..."
9,and,and,CCONJ,,False,"[-0.18567, 0.066008, -0.25209, -0.11725, 0.265..."


In [5]:
# Dependency view: for every token, its syntactic head and its direct children.
token_head = [tok.head for tok in doc]
token_children = [list(tok.children) for tok in doc]

headers_ = ['token_text', 'token_head', 'token_children']

# `token_text` is the list built in the token-attribute cell above.
tree_rows = list(zip(token_text, token_head, token_children))
pd.DataFrame(tree_rows, columns=headers_)

Unnamed: 0,token_text,token_head,token_children
0,I,love,[]
1,love,love,"[I, restaurant, in, .]"
2,this,restaurant,[]
3,restaurant,love,[this]
4,in,love,[Newcastle]
5,Newcastle,in,[]
6,.,love,[]
7,Delicious,cocktails,[]
8,cocktails,cocktails,"[Delicious, and, has]"
9,and,cocktails,[]


In [6]:
# Draw the dependency parse of `doc` inline; 'distance' is passed through as a dep-visualizer option.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

  "__main__", mod_spec)
  "__main__", mod_spec)


In [7]:
# Highlight the named entities of `doc` inline.
# 'distance' is a dependency-visualizer option and has no effect on the
# entity view, so it is not passed here.
displacy.render(doc, style='ent', jupyter=True)

# Noun chunks

In [8]:
# Materialise the noun-chunk iterator so the chunks are shown as the cell output.
list(doc.noun_chunks)

[I,
 this restaurant,
 Newcastle,
 Delicious cocktails,
 it,
 a really friendly atmosphere]

# Sentences

In [9]:
# Materialise the sentence iterator so the detected sentences are shown as the cell output.
list(doc.sents)

[I love this restaurant in Newcastle.,
 Delicious cocktails and it has a really friendly atmosphere.]

# Doc2Vec

The document vector is the average of the document's 300-dimensional token vectors, which were trained with GloVe on the Common Crawl dataset.

https://en.wikipedia.org/wiki/Common_Crawl

https://spacy.io/models/en#section-en_vectors_web_lg


In [10]:
# Document-level vector of `doc`, shown as its first ten components plus its length.
doc_vec = list(doc.vector)
vec_preview = doc_vec[:10]
vec_dim = len(doc_vec)
print("{}...\nVector dimension:{}".format(vec_preview, vec_dim))

[0.04656335, 0.19671877, -0.040661585, -0.18454778, 0.21376491, 0.11574505, 0.095128357, -0.22332935, 0.03805406, 2.0259137]...
Vector dimension:300


In [11]:
# Three toy documents (two about food, one not) for the similarity demo below.
docs = ['Pork is amazing','Sausage was great','Data Science made simple']

In [12]:
def doc2vec(docs):
    """
    Build a table of document vectors from spaCy's pre-trained word vectors.

    Each text in `docs` is run through the module-level `nlp` pipeline.

    Parameters
    ----------
    docs : iterable of str
        Raw texts to embed.

    Returns
    -------
    pandas.DataFrame
        One row per input text with columns 'doc_text' (the text as seen
        by spaCy) and 'doc_vec' (the document's `.vector`).
    """
    # Stream the texts through the pipeline and keep only text + vector.
    records = [(parsed.text, parsed.vector) for parsed in nlp.pipe(docs)]
    return pd.DataFrame(records, columns=['doc_text', 'doc_vec'])

# Embed the toy documents; `df` is reused by the similarity cell that follows.
df = doc2vec(docs)
df

Unnamed: 0,doc_text,doc_vec
0,Pork is amazing,"[-0.332147, 0.185507, 0.2583, 0.130159, 0.1209..."
1,Sausage was great,"[-0.206415, 0.324179, 0.18584, 0.0150927, -7.1..."
2,Data Science made simple,"[-0.26445, 0.0599757, -0.181192, 0.0580605, -0..."


In [13]:
# Pairwise cosine similarity between the document vectors in `df`.
# `cosine_similarity` is imported in the notebook's top import cell.
doc_labels = df['doc_text'].tolist()
docvecs = df['doc_vec'].tolist()
cos_sim = cosine_similarity(docvecs)

# Label both axes with the document texts so the matrix is self-describing.
df_sim = pd.DataFrame(cos_sim, columns=doc_labels, index=doc_labels)
df_sim

Unnamed: 0,Pork is amazing,Sausage was great,Data Science made simple
Pork is amazing,1.0,0.816263,0.543974
Sausage was great,0.816263,1.0,0.508645
Data Science made simple,0.543974,0.508645,1.0


# Let's build an NLP pipeline

In [14]:
#load dataset
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
path = 'C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\yelp_reviews_1M.csv'
reviews_df = pd.read_csv(path,encoding='utf-8')
# Missing review texts become empty strings so every list entry is a str.
reviews = reviews_df['text'].fillna('').tolist()
reviews_df.head(1)

Unnamed: 0,business_id,name,city,categories,text,stars_x
0,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4


In [15]:
# Rebind `nlp` to the small English package with the parser and NER components disabled.
# NOTE: this replaces the pipeline loaded earlier; any earlier cell that reads `nlp`
# (e.g. doc2vec) will use this pipeline if it is re-run after this point.
nlp = spacy.load('en_core_web_sm',disable=['parser','ner'])

In [16]:
def TextPreprocessSpaCy(docs, pos=('ADJ', 'NOUN')):
    """
    Tokenize, POS-filter and lemmatize texts with the module-level spaCy pipeline.

    Parameters
    ----------
    docs : iterable of str
        Raw texts to preprocess.
    pos : str or iterable of str, optional
        Coarse POS tags (`token.pos_`) to keep. Defaults to adjectives and
        nouns, which is what the function kept before this parameter existed.

    Returns
    -------
    list of list of str
        One list per input text holding the lower-cased lemmas of the tokens
        that carry one of the requested POS tags and are not stop words.
    """
    # A single tag given as a plain string must not be split into characters.
    if isinstance(pos, str):
        pos = (pos,)
    keep_pos = frozenset(pos)

    return [
        [
            token.lemma_.lower()
            for token in doc
            if token.pos_ in keep_pos and not token.is_stop
        ]
        for doc in nlp.pipe(docs)
    ]

def TextPreprocessNLTK(docs):
    """
    Tokenize, POS-filter, clean and lemmatize texts with NLTK.

    Parameters
    ----------
    docs : iterable of str
        Raw texts to preprocess.

    Returns
    -------
    list of list of str
        One list per input text. Each text is lower-cased, tokenized and
        POS-tagged; only adjective/noun tags (JJ, JJR, JJS, NN, NNS) are kept,
        punctuation-only tokens and English stop words are dropped, and the
        remaining words are lemmatized with WordNet.
    """
    # Sets give constant-time membership tests inside the per-token loop.
    stop = set(stopwords.words('english'))
    punct = set(string.punctuation)
    keep_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNS'}
    lemma = WordNetLemmatizer()

    text = []
    for raw in docs:
        tagged = pos_tag(word_tokenize(raw.lower()))  # tokenize + POS tagger
        tokens = [
            lemma.lemmatize(word)  # lemmatize
            for word, tag in tagged
            if tag in keep_tags  # POS filter
            # Drop tokens made only of punctuation characters. The previous
            # substring test against string.punctuation let tokens such as
            # '...' or '--' through.
            and not all(ch in punct for ch in word)
            and word not in stop  # remove stopwords
        ]
        text.append(tokens)

    return text

# Visible confirmation that the two preprocessing functions above were defined.
print('done')

done


In [17]:
# Benchmark both preprocessors on the same first 1,000 reviews.
# time.perf_counter() is used instead of time.time(): it is the clock intended
# for measuring elapsed intervals and is not affected by system clock changes.
sample_reviews = reviews[0:1000]

start = time.perf_counter()
SpaCy = TextPreprocessSpaCy(sample_reviews)
end = time.perf_counter()
print("{} \n --SpaCy Run time: {}s".format(SpaCy[5],(end-start)))

start = time.perf_counter()
NLTK = TextPreprocessNLTK(sample_reviews)
end = time.perf_counter()
print("\n{} \n --NLTK Run time: {}s".format(NLTK[5],(end-start)))

# Raw text of the review whose tokens are printed above, for comparison.
print("\n{}".format(reviews[5]))

['great', 'tour', 'end', 'addition', 'restaurant', 'stop', 'rooftop', 'club', 'charming', 'restaurant', 'outdoor', 'garden', 'stop', 'nice', 'downtown', 'tour'] 
 --SpaCy Run time: 7.053212404251099s

['great', 'tour', 'full', 'end', 'addition', 'restaurant', 'stop', 'rooftop', 'club', 'container', 'park', 'charming', 'restaurant', 'outdoor', 'garden', 'stop', 'fremont', 'experience', 'nice', 'downtown', 'vega', 'enough', 'tour'] 
 --NLTK Run time: 11.497220277786255s

This was a great tour!  We were so full by the end.  In addition to the restaurants, stops included a rooftop club, the Container Park and a charming restaurant with outdoor garden.  The stops were outside of the Fremont Experience, so it was nice to see what else downtown Vegas has to offer.  (But still close enough to walk over to Fremont and check that out too.)  Highly recommend this tour!


In [18]:
%%time
# Preprocess the full review list (long-running: see the recorded wall time below).
reviews_spacy = TextPreprocessSpaCy(reviews)

Wall time: 2h 40min 3s


In [19]:
# Cache the preprocessed tokens on disk and read them back from the SAME file.
# Previously the cell wrote 'tokens.pkl' but read 'tokens_spacy.pkl', so a fresh
# run either failed or silently loaded a stale file.
path_tokens ='C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\tokens_spacy.pkl'
with open(path_tokens,'wb') as f:
    pickle.dump(reviews_spacy, f)

#load preprocessed dataset:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(path_tokens,'rb') as f:
    reviews_spacy = pickle.load(f)

# Phrases model

In [20]:
%%time
# Phrase modelling: learn bigrams first, then trigrams on the bigram-merged corpus.
min_phrase_count = 25

bigram_model = Phrases(reviews_spacy, min_count=min_phrase_count)
bigram_phraser = Phraser(bigram_model)

# Lazily bigram-merged view of the corpus, used for training and for the final pass.
reviews_bigram = bigram_phraser[reviews_spacy]

trigram_model = Phrases(reviews_bigram, min_count=min_phrase_count)
trigram_phraser = Phraser(trigram_model)

# Apply both phrasers to every review and materialise the result.
reviews_trigram = list(trigram_phraser[reviews_bigram])

Wall time: 6min 56s


In [22]:
# Persist the phrase-merged reviews so the phrase-modelling step need not be repeated.
# NOTE(review): absolute, machine-specific path.
path ='C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\phrases.pkl' 
with open(path,'wb') as f:
    pickle.dump(reviews_trigram, f)

In [23]:
# Show one phrase-merged review; merged phrases appear joined with '_'.
print(reviews_trigram[2])

['good', 'people', 'review', 'casual', 'nice', 'service', 'great', 'food', 'good', 'italian', 'restaurant', 'wine', 'selection', 'great', 'good', 'oil_vinegar', 'bread', 'bread', 'hot', 'table', 'oyster', 'good', 'nice', 'little', 'hidden_gem', 'downfall', 'coffee', 'brand', 'good', 'cup_coffee', 'espresso', 'dinner', 'quality', 'coffee', 'bean\\/brand']


In [24]:
# Only the first `n_reviews` reviews are unstacked; the same limit is applied to
# both sides of the join so their indexes line up.
n_reviews = 100000

#Transforming to df for unstacking and join
df_phrases = pd.DataFrame({"Phrases" : reviews_trigram}).head(n_reviews)

#Unstacking: one row per (review index, phrase) pair
df = pd.DataFrame({
    'Index': np.repeat(df_phrases.index.values, df_phrases.Phrases.str.len()),
    'Phrases': np.concatenate(df_phrases.Phrases.values),
}).set_index('Index')

#Joining with full data
reviews_phrases = pd.merge(
    df, reviews_df.head(n_reviews), left_index=True, right_index=True
).reset_index(drop=True)

# pd.to_numeric returns a new Series; the result was previously discarded,
# which made the conversion a no-op.
reviews_phrases['stars_x'] = pd.to_numeric(reviews_phrases['stars_x'])

reviews_phrases.head()

Unnamed: 0,Phrases,business_id,name,city,categories,text,stars_x
0,favorite,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
1,place,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
2,kid,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
3,college,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
4,holiday,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4


# Topic Modelling

In [None]:
%%time
# Build the id <-> term dictionary from the phrase-merged reviews, then prune it
# with filter_extremes(no_below=10, no_above=0.4) and compact the id space.
dictionary = corpora.Dictionary(reviews_trigram)
dictionary.filter_extremes(no_below=10, no_above=0.4)
dictionary.compactify()

# Document-term matrix: one bag-of-words representation per review.
corpus = list(map(dictionary.doc2bow, reviews_trigram))

Wall time: 1min 17s


In [None]:
%%time
# Train a 50-topic LDA model (10 passes, 3 worker processes) on the bag-of-words corpus.
# random_state fixes the model's random initialisation, which was previously
# unseeded and gave different topics on every run.
lda_model = LdaMulticore(
    corpus,
    num_topics=50,
    id2word=dictionary,
    workers=3,
    passes=10,
    random_state=42,
)
# Persist the trained model so it can be reloaded without retraining.
lda_model.save('C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\lda_5010.model')

In [None]:
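# Optional: load a previously saved model instead of retraining.
# NOTE(review): this file name ('lda_50.model') differs from the one saved by the
# training cell ('lda_5010.model') — confirm which model is intended before enabling.
#lda_model = models.LdaModel.load('C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\lda_50.model')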

In [None]:
# List the topics with their 5 highest-weighted words (num_topics=-1 requests all topics).
lda_model.print_topics(num_words=5,num_topics=-1)

In [None]:
def TopicDetection(doc,min_topic_freq,topn):
    """
    Run the trained LDA model against one document and return its dominant
    topics together with each topic's top keywords.

    Parameters
    ----------
    doc : str or list of str
        The document text, either as a bare string or as a list whose first
        element is the text (only the first element is analysed).
    min_topic_freq : float
        Topics whose weight in the document is not strictly greater than this
        value are discarded.
    topn : int
        Number of keywords to report per topic.

    Returns
    -------
    pandas.DataFrame
        Columns 'topic_num', 'topic_freq' and 'topic_keywords', sorted by
        'topic_freq' in descending order with a fresh 0..n-1 index.
    """
    # A bare string would be iterated character by character by the
    # preprocessing step, so wrap it in a list first.
    if isinstance(doc, str):
        doc = [doc]

    doc_tokens = TextPreprocessSpaCy(doc)[0] #spaCy preprocess
    doc_trigram = list(trigram_phraser[bigram_phraser[doc_tokens]]) # phrase model
    doc_bow = dictionary.doc2bow(doc_trigram) #create bow representation
    doc_lda = lda_model[doc_bow] # run LDA on doc

    # Apply the threshold first so keywords are only looked up for topics
    # that end up in the result.
    rows = [
        (
            topic_num,
            topic_freq,
            [word for word, _ in lda_model.show_topic(topic_num, topn=topn)],
        )
        for topic_num, topic_freq in doc_lda
        if topic_freq > min_topic_freq
    ]

    headers = ['topic_num','topic_freq','topic_keywords']
    df = pd.DataFrame(rows, columns=headers)

    return df.sort_values('topic_freq',ascending=False).reset_index(drop=True)

In [None]:
#Reviews - 2,30 german,70
# NOTE(review): the line above presumably lists review indexes worth trying — confirm.
# TopicDetection is given a one-element list, hence the wrapping.
text = [reviews[70]]
%time topic = TopicDetection(text,0.1,5)
# Print the detected topics followed by the review text they were derived from.
print("{}\n\n{}".format(topic,text))

In [None]:
# Same demo on a hand-written sentence rather than a review from the dataset.
text = ['My son loves yoyo sushi, the rolls are amazing and fresh but prices are a bit high']
topic = TopicDetection(text,0.1,6)
# Print the detected topics followed by the input text.
print("{}\n\n{}".format(topic,text))