In [1]:
import numpy as np
import pandas as pd
import time
import random
from random import sample
import string
import pickle

# nltk models
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

#spaCy
import spacy
from spacy import displacy

# gensim models
import gensim
from gensim import corpora, models
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore

# Visualize topics
import pyLDAvis
import pyLDAvis.gensim



# Meet spaCy - "It's minimal and opinionated"

spaCy is a free, **open-source** library for advanced **Natural Language Processing (NLP)** in Python.

Features:

-  Tokenization
-  POS Tagging
-  Dependency Parsing
-  Lemmatization
-  Sentence Detection
-  Entity Recognition

And more...

https://spacy.io/


In [71]:
nlp = spacy.load('en_core_web_sm')

In [72]:
%%time
text = 'Best restaurant in Newcastle. Delicious cocktails and it has a really friendly atmosphere.'

doc = nlp(text)
doc

Wall time: 31.2 ms


In [73]:
token_text = [token.text for token in doc]
token_pos = [token.pos_ for token in doc]
token_lemma = [token.lemma_ for token in doc]
token_entity = [token.ent_type_ for token in doc]
token_stop = [token.is_stop for token in doc]
token_vec = [token.vector for token in doc]

headers = ['token_text','token_lemma','token_pos','token_entity','token_is_stop','token_vec']

pd.DataFrame(list(zip(token_text, token_lemma, token_pos, token_entity,token_stop,token_vec)),columns=headers)

Unnamed: 0,token_text,token_lemma,token_pos,token_entity,token_is_stop,token_vec
0,Best,good,ADJ,,False,"[-1.6054, 4.53702, -0.672109, -2.05529, -1.600..."
1,restaurant,restaurant,NOUN,,False,"[1.33561, 0.368323, 2.64288, 0.373385, -2.4033..."
2,in,in,ADP,,True,"[1.44569, 1.10417, -0.406471, 1.28426, 1.65253..."
3,Newcastle,newcastle,PROPN,GPE,False,"[-2.38429, 0.409086, 3.07898, 0.594176, -0.754..."
4,.,.,PUNCT,,False,"[0.474397, 1.99392, 2.95767, -0.329908, 1.3399..."
5,Delicious,delicious,PROPN,,False,"[-3.31449, 3.54701, -0.886751, 2.62596, -3.169..."
6,cocktails,cocktail,NOUN,,False,"[0.890365, 3.60857, -1.63054, -2.22932, 1.359,..."
7,and,and,CCONJ,,True,"[0.74245, -1.03995, -0.239206, -1.88797, 2.333..."
8,it,-PRON-,PRON,,True,"[-1.6597, 0.930871, 3.74128, 2.16395, -2.18548..."
9,has,have,VERB,,True,"[-3.41011, 2.17194, -1.65569, -1.99658, 3.2991..."


In [74]:
token_head = [token.head for token in doc]
token_children = [list(token.children) for token in doc]

headers_ = ['token_text','token_head','token_children']

pd.DataFrame(list(zip(token_text, token_head, token_children)),columns=headers_)

Unnamed: 0,token_text,token_head,token_children
0,Best,restaurant,[]
1,restaurant,restaurant,"[Best, in, .]"
2,in,restaurant,[Newcastle]
3,Newcastle,in,[]
4,.,restaurant,[]
5,Delicious,cocktails,[]
6,cocktails,has,"[Delicious, and, it]"
7,and,cocktails,[]
8,it,cocktails,[]
9,has,has,"[cocktails, atmosphere, .]"


In [75]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

  "__main__", mod_spec)
  "__main__", mod_spec)


# Noun chunks

In [76]:
list(doc.noun_chunks)

[Best restaurant,
 Newcastle,
 Delicious cocktails,
 it,
 a really friendly atmosphere]

# Sentences

In [77]:
list(doc.sents)

[Best restaurant in Newcastle.,
 Delicious cocktails and it has a really friendly atmosphere.]

# Doc2Vec

300 dimensions doc2vec as average of token vectors trained using GloVe on Common Crawl dataset

https://en.wikipedia.org/wiki/Common_Crawl

https://spacy.io/models/en#section-en_vectors_web_lg


In [81]:
nlp = spacy.load('en_core_web_lg')

In [85]:
docs = ['Pork is amazing','Sausage was great','Data Science made simple','Physics studies laws of the universe']

In [86]:
def doc2vec(docs):

    """
    Get doc2vec representations of docs using spaCy pre-trained word vectors
    """
    
    doc_text = []
    doc_vec = []

    for doc in nlp.pipe(docs):
        doc_text.append(doc.text)
        doc_vec.append(doc.vector)
    
    headers = ['doc_text','doc_vec']

    return pd.DataFrame(list(zip(doc_text, doc_vec)),columns=headers)

df = doc2vec(docs)
df

Unnamed: 0,doc_text,doc_vec
0,Pork is amazing,"[-0.332147, 0.185507, 0.2583, 0.130159, 0.1209..."
1,Sausage was great,"[-0.206415, 0.324179, 0.18584, 0.0150927, -7.1..."
2,Data Science made simple,"[-0.26445, 0.0599757, -0.181192, 0.0580605, -0..."
3,Physics studies laws of the universe,"[0.0194767, 0.0151591, -0.171293, -0.183741, -..."


In [87]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

docvecs = df['doc_vec'].tolist()
cos_sim = cosine_similarity(docvecs)
df_sim = pd.DataFrame(cos_sim,columns=df['doc_text'].tolist(),index=df['doc_text'].tolist())
df_sim

Unnamed: 0,Pork is amazing,Sausage was great,Data Science made simple,Physics studies laws of the universe
Pork is amazing,1.0,0.816263,0.543974,0.441451
Sausage was great,0.816263,1.0,0.508645,0.422934
Data Science made simple,0.543974,0.508645,1.0,0.744686
Physics studies laws of the universe,0.441451,0.422934,0.744686,1.0


# Let's build an NLP pipeline

In [14]:
#load dataset
path = 'C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\yelp_reviews_1M.csv'
reviews_df = pd.read_csv(path,encoding='utf-8')
reviews = reviews_df['text'].fillna('').tolist()
ratings = reviews_df['stars_x'].tolist()
reviews_df.head(1)

Unnamed: 0,business_id,name,city,categories,text,stars_x
0,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4


In [63]:
nlp = spacy.load('en_core_web_sm',disable=['parser','ner'])

In [64]:
def TextPreprocessSpaCy(docs):
    text = []
    pos = ['ADJ','NOUN']
    stop = ['-pron-']
    for doc in nlp.pipe(docs):
        tokens = [token.lemma_.lower() for token in doc if token.pos_ in pos and token.is_stop == False] 
        tokens = [i for i in tokens if i not in stop]
        text.append(tokens)
                
    return text

def TextPreprocessNLTK(docs):
    text = []
    stop = stopwords.words('english')
    lemma = WordNetLemmatizer()
    pos = ['JJ','JJR','JJS','NN','NNS']
    
    for i in docs:
        tokens = word_tokenize(i.lower()) # tokenize
        tokens = pos_tag(tokens) # POS tagger
        tokens = [i[0] for i in tokens if i[1] in pos] # POS filter
        tokens = [i for i in tokens if i not in string.punctuation] # remove punctuation
        tokens = [i for i in tokens if i not in stop] # remove stopwords
        tokens = [lemma.lemmatize(i) for i in tokens] # lemmatize
        text.append(tokens)
        
    return text

print('done')

done


In [70]:
start = time.time()
SpaCy = TextPreprocessSpaCy(reviews[0:1000])
end = time.time()
print("{} \n --SpaCy Run time: {}s".format(SpaCy[40],(end-start)))

start = time.time()
NLTK = TextPreprocessNLTK(reviews[0:1000])
end = time.time()
print("\n{} \n --NLTK Run time: {}s".format(NLTK[40],(end-start)))

print("\n{}".format(reviews[40]))

['good', 'place', 'sushi', 'area', 'everything', 'fresh', 'chef', 'pride', 'piece', 'sushi'] 
 --SpaCy Run time: 6.964414119720459s

['best', 'place', 'sushi', 'area', 'everything', 'fresh', 'chef', 'piece', 'sushi'] 
 --NLTK Run time: 7.787414073944092s

By far the best place to get sushi in the area.  Everything is fresh and you can tell that the chef takes pride in each piece of sushi that he creates.


In [18]:
%%time
reviews_spacy = TextPreprocessSpaCy(reviews)

Wall time: 2h 40min 3s


In [19]:
path ='C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\tokens.pkl' 
with open(path,'wb') as f:
    pickle.dump(reviews_spacy, f)

#load preprocessed dataset:
import pickle
path_tokens ='C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\tokens_spacy.pkl' 
with open(path_tokens,'rb') as f:
    reviews_spacy = pickle.load(f)

# Phrases model

In [20]:
%%time
#Phrases Modelling
bigram_model = Phrases(reviews_spacy,min_count=25)
bigram_phraser = Phraser(bigram_model)
trigram_model = Phrases(bigram_phraser[reviews_spacy],min_count=25)
trigram_phraser = Phraser(trigram_model)

reviews_trigram = list(trigram_phraser[bigram_phraser[reviews_spacy]])

Wall time: 6min 56s


In [22]:
path ='C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\phrases.pkl' 
with open(path,'wb') as f:
    pickle.dump(reviews_trigram, f)

In [69]:
print(reviews_trigram[40])

['good', 'place', 'sushi', 'area', 'everything', 'fresh', 'chef', 'pride', 'piece', 'sushi']


In [24]:
#Transforming to df for unstacking and join
df_phrases = pd.DataFrame({"Phrases" : reviews_trigram}).head(100000)

#Unstacking...
df = pd.DataFrame({'Index':np.repeat(df_phrases.index.values, df_phrases.Phrases.str.len()),
              'Phrases':np.concatenate(df_phrases.Phrases.values)})
df.set_index('Index', inplace = True)

#Joining with full data
reviews_phrases = pd.merge(df,reviews_df.head(100000),left_index=True,right_index=True).reset_index(drop=True)
pd.to_numeric(reviews_phrases.stars_x)

reviews_phrases.head()

Unnamed: 0,Phrases,business_id,name,city,categories,text,stars_x
0,favorite,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
1,place,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
2,kid,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
3,college,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4
4,holiday,vXEZ-r6fah-5Fjt3a6c-Gw,"""The Cheesecake Factory""",Pittsburgh,American (Traditional);Desserts;Food;American ...,One of my favorite places too take the kids wh...,4


In [201]:
#Pivot phrases by avg. rating
phrases = pd.pivot_table(reviews_phrases, index='Phrases',aggfunc={'stars_x':[np.mean,len]})
phrases.columns = phrases.columns.to_series().str.join('_')
df = phrases.sort_values('stars_x_mean',ascending=False)
df.columns = ['term_frequency','avg_rating']
df = df[df.term_frequency > 100]

In [202]:
#Top phrases with highest avg. rating
df.head(15)

Unnamed: 0_level_0,term_frequency,avg_rating
Phrases,Unnamed: 1_level_1,Unnamed: 2_level_1
recommend,376,4.643617
love_love,133,4.639098
hidden_gem,423,4.626478
personal_favorite,152,4.611842
gem,857,4.588098
best,462,4.577922
worth_penny,165,4.575758
impeccable,352,4.571023
incredible,1432,4.567039
phenomenal,724,4.563536


In [203]:
#Top phrases with lowest avg. rating
df.tail(15)

Unnamed: 0_level_0,term_frequency,avg_rating
Phrases,Unnamed: 1_level_1,Unnamed: 2_level_1
nasty,420,1.890476
tasteless,543,1.858195
inedible,323,1.820433
terrible,2225,1.788315
pathetic,118,1.779661
rude,1912,1.775628
awful,1128,1.763298
apology,355,1.760563
horrible,2039,1.680235
unacceptable,164,1.621951


# Topic Modelling

In [25]:
%%time
# turn tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(reviews_trigram)
dictionary.filter_extremes(no_below=10, no_above=0.4)
dictionary.compactify()

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(i) for i in reviews_trigram]

Wall time: 1min 17s


In [26]:
%%time
#where the magic happens
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus,
                                                    num_topics=50, 
                                                    id2word=dictionary, 
                                                    workers=3, passes=5)

lda_model.save('C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\lda_505.model')

Wall time: 4h 22min 16s


lda_model = models.LdaModel.load('C:\\Users\\schapira.d\\Desktop\\Data Science Meetup\\lda_50.model')

In [28]:
lda_model.print_topics(num_words=5,num_topics=-1)

[(0,
  '0.158*"small" + 0.151*"portion" + 0.106*"large" + 0.103*"huge" + 0.070*"size"'),
 (1,
  '0.174*"-pron-" + 0.077*"husband" + 0.060*"wife" + 0.048*"kid" + 0.048*"family"'),
 (2,
  '0.068*"pasta" + 0.061*"bread" + 0.055*"italian" + 0.054*"sauce" + 0.032*"salad"'),
 (3,
  '0.124*"cheese" + 0.043*"onion" + 0.040*"bacon" + 0.026*"sauce" + 0.016*"little"'),
 (4,
  '0.080*"coffee" + 0.048*"tea" + 0.038*"drink" + 0.038*"water" + 0.037*"cup"'),
 (5,
  '0.074*"appetizer" + 0.052*"dinner" + 0.047*"entree" + 0.034*"meal" + 0.027*"dessert"'),
 (6,
  '0.067*"order" + 0.059*"service" + 0.054*"minute" + 0.052*"time" + 0.034*"server"'),
 (7,
  '0.090*"vegan" + 0.077*"crepe" + 0.048*"late_night" + 0.047*"takeout" + 0.040*"shake"'),
 (8,
  '0.088*"dessert" + 0.059*"sweet" + 0.046*"cake" + 0.037*"ice_cream" + 0.034*"chocolate"'),
 (9,
  '0.032*"guy" + 0.025*"thing" + 0.024*"people" + 0.021*"way" + 0.020*"time"'),
 (10,
  '0.145*"waffle" + 0.018*"traffic" + 0.016*"shopping_center" + 0.015*"challenge

In [29]:
def TopicDetection(doc,min_topic_freq,topn):
    """
    Runs LDA against a document and returns most dominant topics & top keywords
    associated with topics. 
    """
    
    doc_tokens = TextPreprocessSpaCy(doc)[0] #spaCy preprocess
    doc_trigram = list(trigram_phraser[bigram_phraser[doc_tokens]]) # phrase model
    doc_bow = dictionary.doc2bow(doc_trigram) #create bow representation
    doc_lda = lda_model[doc_bow] # run LDA on doc
    
    #create columns for output df
    topic_num = [x[0] for x in doc_lda]
    topic_freq = [x[1] for x in doc_lda]
    topic_keywords = []
    for i in doc_lda:
        keywords = [x[0] for x in lda_model.show_topic(i[0],topn=topn)]
        topic_keywords.append(keywords)
    
    headers = ['topic_num','topic_freq','topic_keywords']
    df = pd.DataFrame(list(zip(topic_num, topic_freq, topic_keywords)),columns=headers)
    df = df[df.topic_freq>min_topic_freq].sort_values('topic_freq',ascending=False).reset_index(drop=True)
    
    return df

In [30]:
#Reviews - 2,30 german,70
text = [reviews[70]]
%time topic = TopicDetection(text,0.1,5)
print("{}\n\n{}".format(topic,text))

Wall time: 203 ms
   topic_num  topic_freq                               topic_keywords
0         38    0.268456  [great, service, excellent, friendly, fast]
1          2    0.178915        [pasta, bread, italian, sauce, salad]
2          5    0.148292   [appetizer, dinner, entree, meal, dessert]
3         34    0.131870    [great, amazing, -pron-, service, server]

["Angela's is probably my favorite place in Charlotte and certainly the best Italian food around. If I lived closer I'd eat here all the time.\n\nEverything I've had here has been delicious. I typically get the Veal Parm. It's tasty and tender. I've also had spaghetti, gnocchi, and chicken Parm, and they have all been outstanding. I can also vouch for the eggplant rollatini - one of my wife's favorites. \n\nThe appetizers have been great.  We've had the calamari and the mozzarella caprese on several occasions, and they have been great. \n\nI don't usually get dessert, although I had a cannoli the once. Home run!!\n\nTo top 

In [31]:
text = ['My son loves yoyo sushi, the rolls are amazing and fresh but prices are a bit high']
topic = TopicDetection(text,0.1,6)
print("{}\n\n{}".format(topic,text))

   topic_num  topic_freq                                     topic_keywords
0         20    0.361946         [sushi, roll, fish, fresh, chef, japanese]
1         19    0.286807  [price, buffet, cheap, worth, quality, expensive]
2          1    0.186291         [-pron-, husband, wife, kid, family, time]

['My son loves yoyo sushi, the rolls are amazing and fresh but prices are a bit high']


In [90]:
text = ['Une baguette de pain ou simplement baguette est une variété de pain, reconnaissable à sa forme allongée']
topic = TopicDetection(text,0.1,6)
print("{}\n\n{}".format(topic,text))

   topic_num  topic_freq                                 topic_keywords
0         15       0.505             [et, la, par, mai, resto, service]
1          2       0.255  [pasta, bread, italian, sauce, salad, tomato]

['Une baguette de pain ou simplement baguette est une variété de pain, reconnaissable à sa forme allongée']


In [115]:
#function forked from:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=reviews, ratings=ratings):
    """
    Extract dominant topic from each document and append original text & rating
    """
    
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text and rating to the end of the output
    contents = pd.Series(texts)
    ratings = pd.Series(ratings)
    sent_topics_df = pd.concat([sent_topics_df, contents, ratings], axis=1)
    return(sent_topics_df)

#Sample from original data -optional so it runs quicker-
corpus_sample, reviews_sample, ratings_sample = zip(*random.sample(list(zip(corpus, reviews, ratings)), 10000))

#Run fuction
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, 
                                                  corpus=corpus_sample, 
                                                  texts=reviews_sample,
                                                  ratings=ratings_sample)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Rating']

# Show
df_dominant_topic.head()

In [176]:
df_topics = pd.pivot_table(df_dominant_topic,index=['Dominant_Topic'],
                           aggfunc={'Rating':[np.mean,len],'Keywords':np.unique})
df_topics.columns = df_topics.columns.to_series().str.join('_')
df = df_topics.sort_values('Rating_mean',ascending=False)
df[df.Rating_len > 25]

Unnamed: 0_level_0,Keywords_unique,Rating_len,Rating_mean
Dominant_Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34.0,"great, amazing, -pron-, service, server, exper...",823,4.55407
36.0,"staff, friendly, great, awesome, nice, clean, ...",459,4.529412
38.0,"great, service, excellent, friendly, fast, atm...",739,4.495264
12.0,"dish, flavor, menu, delicious, taste, bite, te...",227,4.242291
8.0,"dessert, sweet, cake, ice_cream, chocolate, de...",113,4.088496
16.0,"meat, veggie, vegetarian, fresh, option, gyro,...",112,4.053571
49.0,"lunch, sandwich, salad, -pron-, fresh, special...",318,3.987421
0.0,"small, portion, large, huge, size, big, portio...",32,3.96875
26.0,"restaurant, chinese, style, authentic, dish, f...",133,3.962406
29.0,"sauce, wing, pork, bbq, meat, rib, chicken, ho...",219,3.949772
