In [None]:
import pandas as pd
import json
import spacy
from spacy import displacy
from tqdm import tqdm
from collections import Counter
from dask.distributed import Client, progress
import dask.bag as db
import math
from dask.diagnostics import ProgressBar
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pyLDAvis.gensim
import spacy
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
import numpy as np
import pyLDAvis.gensim
pyLDAvis.enable_notebook()


In [None]:
# !python -m spacy download en_core_web_md

# Loading

In [None]:
def extract_category(row):
    """Pick the most specific category label available for one record.

    Parameters
    ----------
    row : pandas.Series
        One row of the places frame. The fields ``category``, ``type`` and
        ``ranking_category`` are all optional.

    Returns
    -------
    object
        The first label found, checked in this order:

        1. ``row['type']`` when ``category`` is missing/NaN and ``type`` is set;
        2. ``category['key']`` when ``category`` is a dict with a non-null key;
        3. ``row['ranking_category']`` when that field is set;
        4. the raw ``category`` value (possibly ``None``/NaN).
    """
    cat = row['category'] if 'category' in row else None

    # pd.isna() on a list returns an array, which has no single truth value,
    # so only scalars are tested for NaN.
    cat_missing = cat is None or (pd.api.types.is_scalar(cat) and pd.isna(cat))

    # `a or b and c` parses as `a or (b and c)`: the old condition returned
    # row.type whenever cat was None, even if the row had no usable `type`.
    if cat_missing and 'type' in row and not pd.isna(row['type']):
        return row['type']
    if isinstance(cat, dict) and 'key' in cat and not pd.isna(cat['key']):
        return cat['key']
    if 'ranking_category' in row and not pd.isna(row['ranking_category']):
        return row['ranking_category']
    return cat

In [None]:
# Load the two JSON dumps and stack them into a single frame.
# NOTE(review): pd.concat keeps each file's own row labels, so index values
# may repeat in data_df — confirm nothing downstream needs unique labels.
data_df = pd.read_json('/Users/danmer/travel_data/source/bc_clean.json')
van_df = pd.read_json('/Users/danmer/travel_data/source/vancouver_clean.json')
data_df = pd.concat([data_df, van_df])

In [None]:
# Column names, dtypes and non-null counts of the combined frame.
data_df.info()

In [None]:
# Normalise the category of every row with extract_category().
# Item assignment is used on purpose: attribute assignment
# (`data_df.category = ...`) only updates an existing column and would set a
# plain Python attribute instead if the loaded JSON had no `category` field.
data_df['category'] = data_df.apply(extract_category, axis=1)

In [None]:
# Distinct category labels after normalisation.
data_df.category.unique()

In [None]:
# Number of rows per category label.
data_df.category.value_counts()

### attractions

In [None]:
# Rated attractions, ranked by how many reviews each one carries.
# .copy() makes this an independent frame, so adding a column below does not
# raise SettingWithCopyWarning against data_df.
attractions = data_df.loc[
    (data_df.category == 'attraction') & data_df.rating.notnull(),
    ['id', 'name', 'rating', 'reviews', 'website'],
].copy()
# `reviews` is always one of the selected columns, so the old
# `'reviews' in r` guard never fired; what needs guarding is a missing (NaN)
# reviews cell, on which len() fails.
attractions['num_reviews'] = attractions['reviews'].apply(
    lambda revs: len(revs) if isinstance(revs, (list, tuple)) else 0
)
attractions.sort_values('num_reviews', ascending=False).drop_duplicates('name').head(5)

In [None]:
# Same ranking, restricted to attractions whose rating is exactly 5.
attractions[attractions.rating == 5].sort_values('num_reviews',ascending=False).drop_duplicates('name').head(5)

### extra

In [None]:
# Rated restaurants, ranked by how many reviews each one carries.
# .copy() makes this an independent frame, so adding a column below does not
# raise SettingWithCopyWarning against data_df.
restaurants = data_df.loc[
    (data_df.category == 'RESTAURANT') & data_df.rating.notnull(),
    ['name', 'rating', 'reviews'],
].copy()
# `reviews` is always one of the selected columns; a missing (NaN) reviews
# cell is what must be guarded, because len() fails on it.
restaurants['num_reviews'] = restaurants['reviews'].apply(
    lambda revs: len(revs) if isinstance(revs, (list, tuple)) else 0
)
restaurants.sort_values('num_reviews', ascending=False).drop_duplicates('name').head(5)

In [None]:
# Non-empty descriptions of all attractions, as a plain list.
is_attraction = data_df.category == 'attraction'
has_description = data_df.description.notnull() & (data_df.description != '')
descriptions = list(data_df[is_attraction & has_description].description)
len(descriptions)

## Glove + TF/IDF

### reviews aggregation

In [None]:
# Build {attraction name: all of its English review texts joined by newlines}.
att_agg_reviews = {}
skipped = 0  # reviews lacking a `language` or `text` field
att_df = data_df[(data_df.category == 'attraction')].drop_duplicates('name')[['name', 'reviews']]
for name, reviews in att_df.itertuples(index=False, name=None):
    # A row without reviews holds NaN here, which is not iterable.
    review_list = reviews if isinstance(reviews, (list, tuple)) else []
    res_reviews = []

    for rev in review_list:
        try:
            if rev['language'] == 'en':
                res_reviews.append(rev['text'])
        except (KeyError, TypeError):
            # Malformed review entry: count it and move on, but let any
            # other kind of error surface instead of hiding it.
            skipped += 1
    att_agg_reviews[name] = '\n'.join(res_reviews)
print(f'skipped: {skipped}')
len(att_agg_reviews)

In [None]:
# Persist the aggregated review text per attraction as JSON.
with open('/Users/danmer/travel_data/tags/docs.json', 'w+') as f:
    json.dump(att_agg_reviews, f)

In [146]:
# Attraction names, in the insertion order of att_agg_reviews.
names = list(att_agg_reviews.keys())

In [148]:
# Aggregated review texts, in the same order as `names`.
texts = list(att_agg_reviews.values())

### nlp

In [163]:
# spaCy pipeline used in this notebook for the per-term vectors in get_embs().
nlp_model  = spacy.load('en_core_web_md')

  1%|          | 16/1999 [2:05:56<260:09:18, 472.29s/it]


In [None]:
def nlp_doc(text, nlp):
    """Run one text through a spaCy pipeline and keep what later steps need.

    The text is lower-cased before it is parsed.

    Returns a dict with:
      * ``tokens``   - lemmas of alphabetic tokens that are not whitespace,
                       punctuation, stop words or number-like;
      * ``entities`` - ``(text, label)`` pairs for every entry of ``doc.ents``;
      * ``emb``      - the document vector (``doc.vector``).
    """
    parsed = nlp(text.lower())

    lemmas = []
    for tok in parsed:
        if not tok.is_alpha:
            continue
        if tok.is_space or tok.is_punct or tok.is_stop or tok.like_num:
            continue
        lemmas.append(tok.lemma_)

    entities = [(ent.text, ent.label_) for ent in parsed.ents]

    return {'tokens': lemmas, 'entities': entities, 'emb': parsed.vector}
    

In [None]:
# Aggregated review texts to process (same values as `texts`).
docs = list(att_agg_reviews.values())
len(docs)

In [None]:
# Start a dask.distributed client with 4 single-threaded workers.
client = Client(n_workers=4, threads_per_worker=1)
# http://localhost:8787/status http://localhost:8787/status

In [None]:
def _nlp_partition(texts):
    """Process one partition of texts with nlp_doc().

    The spaCy model is loaded inside the function, i.e. once per partition.
    """
    nlp = spacy.load("en_core_web_md")
    return [nlp_doc(text, nlp) for text in texts]


# The partition function used to be named `nlp_docs` as well, so the
# assignment below replaced it with its own result and the statement could
# not be run a second time. partition_size=100 gives ceil(len(docs) / 100)
# partitions, the same count the old repartition() call asked for.
nlp_docs = db.from_sequence(docs, partition_size=100).map_partitions(_nlp_partition).compute()

In [None]:
# Turn each embedding into a plain list so the records can be dumped as JSON.
# The loop variable is deliberately not called `nlp_doc`: that name is the
# function defined earlier and must stay callable. np.asarray() also makes
# this cell safe to re-run once the embeddings are already lists.
for doc_record in nlp_docs:
    doc_record['emb'] = np.asarray(doc_record['emb']).tolist()

In [None]:
# Persist the processed documents keyed by attraction name.
with open('/Users/danmer/travel_data/tags/nlp_docs.json', 'w+') as f:
    docs_by_name = dict(zip(att_agg_reviews.keys(), nlp_docs))
    json.dump(docs_by_name, f)

### tf-idf

In [180]:
def get_corpus(docs, no_below=20, no_above=0.2):
    """Build a pruned gensim dictionary and the bag-of-words corpus for ``docs``.

    Parameters
    ----------
    docs : iterable of list of str
        Tokenised documents.
    no_below : int, default 20
        Forwarded to ``Dictionary.filter_extremes``.
    no_above : float, default 0.2
        Forwarded to ``Dictionary.filter_extremes``.

    Returns
    -------
    tuple
        ``(docs_corpus, docs_dict)``: one ``doc2bow`` result per document, and
        the filtered, compactified ``Dictionary``.
    """
    # docs is traversed twice (dictionary, then doc2bow), so a one-shot
    # iterator or generator has to be materialised first.
    docs = list(docs)

    docs_dict = Dictionary(docs)
    docs_dict.filter_extremes(no_below=no_below, no_above=no_above)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]

    return docs_corpus, docs_dict

def build_tfidf(docs_corpus, docs_dict):
    """Fit a gensim ``TfidfModel`` on ``docs_corpus`` with ``docs_dict`` as id2word."""
    return TfidfModel(docs_corpus, id2word=docs_dict)


def save_tfidf(model_tfidf, path):
    """Write ``model_tfidf`` to ``path`` through the model's own ``save`` method."""
    save = model_tfidf.save
    save(path)


def load_tfidf(path):
    """Read a TF-IDF model back from ``path`` via ``TfidfModel.load``."""
    model_tfidf = TfidfModel.load(path)
    return model_tfidf


def corpus_from_nlp_docs(nlp_docs):
    """Run get_corpus() on the ``'tokens'`` list of every processed document."""
    token_lists = []
    for doc in nlp_docs:
        token_lists.append(doc['tokens'])
    return get_corpus(token_lists)


def get_sorted_tfidf_with_labels(docs_dict, doc_tfidf):
    """Turn ``(term id, weight)`` pairs into labelled tags, heaviest first.

    ``docs_dict`` maps a term id to its text. The result is a list of
    ``{'tag': <term>, 'weight': <weight>}`` dicts sorted by descending weight.
    """
    tags = []
    for term_id, weight in doc_tfidf:
        tags.append({'tag': docs_dict[term_id], 'weight': weight})
    tags.sort(key=lambda tag: tag['weight'], reverse=True)
    return tags

In [None]:
# Bag-of-words corpus and pruned dictionary built from the lemmatised tokens.
docs_corpus, docs_dict = corpus_from_nlp_docs(nlp_docs)

In [142]:
# Vocabulary size after filter_extremes().
len(docs_dict)

2929

In [143]:
# Number of documents in the corpus.
len(docs_corpus)

1999

In [None]:
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = build_tfidf(docs_corpus, docs_dict)

In [182]:
# Persist the fitted TF-IDF model.
save_tfidf(tfidf, '/Users/danmer/travel_data/tags/tfidf.gensim')

In [174]:
# Weighted TF-IDF tags for every attraction, heaviest tag first.
transformed_tfidf = tfidf[docs_corpus]
tags_dict = {}
for name, doc_tfidf in zip(names, transformed_tfidf):
    tags_dict[name] = get_sorted_tfidf_with_labels(docs_dict, doc_tfidf)
len(tags_dict)

1999

In [175]:
# Spot check: the ten heaviest tags of one attraction.
tags_dict['Jericho Beach'][:10]

[{'tag': 'beach', 'weight': 0.7770620885352576},
 {'tag': 'shore', 'weight': 0.18827467972091194},
 {'tag': 'sunset', 'weight': 0.16856450115555727},
 {'tag': 'north', 'weight': 0.13881659659598547},
 {'tag': 'dark', 'weight': 0.1348516009244458},
 {'tag': 'bank', 'weight': 0.1220805388643879},
 {'tag': 'ocean', 'weight': 0.11948429338482722},
 {'tag': 'sandy', 'weight': 0.11785924677357029},
 {'tag': 'bay', 'weight': 0.11282639167940087},
 {'tag': 'swimming', 'weight': 0.09992094973061863}]

### hybrid embeddings

In [164]:
def get_embs(model_tfidf, nlp, nlp_docs):
    """Combine TF-IDF weights with per-term spaCy vectors into document embeddings.

    Every document's dense TF-IDF vector is multiplied with a matrix holding
    one ``nlp(term).vector`` row per dictionary term.
    Approach: http://dsgeek.com/2018/02/19/tfidf_vectors.html

    Note: the dictionary is rebuilt from ``nlp_docs`` through
    corpus_from_nlp_docs() rather than passed in.
    """
    bow_corpus, vocab = corpus_from_nlp_docs(nlp_docs)
    vocab_size = len(vocab)

    # One dense TF-IDF row of length vocab_size per document.
    doc_rows = [sparse2full(weights, vocab_size) for weights in model_tfidf[bow_corpus]]
    tfidf_matrix = np.vstack(doc_rows)

    # One word-vector row per dictionary term, in term-id order.
    term_rows = [nlp(vocab[term_id]).vector for term_id in range(vocab_size)]
    term_matrix = np.vstack(term_rows)

    return np.dot(tfidf_matrix, term_matrix)

In [165]:
# One TF-IDF-weighted embedding row per attraction.
agg_reviews_embs = get_embs(tfidf, nlp_model, nlp_docs)

In [169]:
# Map each attraction name to its embedding as a plain list of floats.
emb_dict = dict(zip(names, agg_reviews_embs.tolist()))
len(emb_dict['Jericho Beach'])

300

In [None]:
# Write the embeddings as tab-separated text, one row per attraction.
np.savetxt('attractions-agg-reviews-embs.txt', agg_reviews_embs, delimiter='\t')

In [None]:
# One attraction name per line; newlines inside a name become spaces.
with open("attractions-agg.txt", "w") as outf:
    label_lines = [label.replace('\n', ' ') for label in att_agg_reviews]
    outf.write('\n'.join(label_lines))

# Summarization

In [144]:
import  gensim.summarization

def summarize(text, word_count=50):
    """Return an extractive summary of ``text`` limited by ``word_count``.

    Thin wrapper around ``gensim.summarization.summarize``.

    gensim raises ``ValueError`` when the input holds a single sentence. Such
    a text is returned unchanged, so one short document cannot abort a batch.

    NOTE(review): ``gensim.summarization`` appears to have been removed in
    gensim 4 — confirm the pinned gensim version still ships it.
    """
    try:
        return gensim.summarization.summarize(text, word_count=word_count)
    except ValueError:
        # Single-sentence input: it already is its own summary.
        return text

# Spot check on a single aggregated review text.
summarize(docs[7])

'Lovely walk with great views to sea and lots of waterfowl on the water.\nWe went to Neck Point Park to just put our feet in the water and we end up taking many pictures, stopping to appreciate the views, and walking one of the many trails offered.\nWhat a great place to walk the trails and see the beauty of the ocean.'

In [None]:
partitions = math.ceil(len(texts)/100)
# map() calls summarize() once per text. map_partitions() would hand it a
# whole partition (a list of texts) instead of a single string.
summaries = db.from_sequence(texts).repartition(partitions).map(summarize).compute()

# Pair every name with its own summary. The previous comprehension bound the
# value to `reviews` but emitted an undefined `summary`.
summaries_dict = dict(zip(names, summaries))
summaries_didct = summaries_dict  # original (misspelled) name kept as an alias

# Elastic export 

In [166]:
# One row per attraction name, without the reviews column.
# Fixes the unbalanced parenthesis that made this line a SyntaxError; .copy()
# gives an independent frame because columns are added to export_df below.
export_df = data_df[data_df.category == 'attraction'].drop(columns=['reviews']).drop_duplicates('name').copy()

In [None]:
# TODO: filter out tours and adventures

In [170]:
# Fallback vector for attractions with no usable embedding.
default = np.ones((300,), dtype='float32').tolist()


def _embedding_or_default(name):
    """Return the stored embedding for ``name``, or ``default`` if it is absent or all zeros."""
    emb = emb_dict.get(name)
    # any(), not all(): only an all-zero vector is unusable. The old all()
    # test also threw away embeddings with a single zero component.
    if emb is not None and any(e != 0 for e in emb):
        return emb
    return default


export_df['embedding'] = export_df['name'].map(_embedding_or_default)

In [171]:
# Sanity check: the exported embedding equals the emb_dict entry.
assert export_df[export_df.name == 'Jericho Beach'].embedding.values[0] == emb_dict['Jericho Beach']

In [256]:
def get_location(row):
    """Format a row's coordinates as a ``"<latitude>,<longitude>"`` string.

    Returns ``None`` when either field is absent from the row or is null.
    """
    for field in ("longitude", 'latitude'):
        if field not in row:
            return None
    lat, lon = row.latitude, row.longitude
    if pd.isna(lat) or pd.isna(lon):
        return None
    return f'{lat},{lon}'
# "lat,lon" string per row, or None when a coordinate is missing.
export_df['location'] = export_df.apply(get_location, axis=1)

In [257]:
# Sanity check on one known attraction.
assert export_df[export_df.name == 'Jericho Beach'].location.values[0] == '49.273098,-123.20285'

In [176]:
# Attach the weighted tag list of each attraction. The fallback is an empty
# list (it used to be `{}`), so the column always has the same type as the
# real values in tags_dict.
export_df['tags'] = export_df['name'].map(lambda name: tags_dict.get(name) or [])

In [237]:
# Column overview of the frame about to be exported.
export_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1999 entries, 16752 to 6208
Data columns (total 74 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id                                         0 non-null      float64
 1   type                                       0 non-null      object 
 2   name                                       1999 non-null   object 
 3   awards                                     1999 non-null   object 
 4   rankingPosition                            0 non-null      float64
 5   priceLevel                                 0 non-null      object 
 6   category                                   1999 non-null   object 
 7   rating                                     1670 non-null   float64
 8   hotelClass                                 0 non-null      float64
 9   phone                                      1570 non-null   object 
 10  address             

In [179]:
# Dump the export frame as JSON Lines (one record per line).
export_df.to_json('/Users/danmer/travel_data/tags/elastic-bc-van-emb.json', orient='records', lines=True)

In [213]:
# Index settings: three dynamic templates, selected by the `match` value.
_embedding_template = {
    "embs": {
        "match": "embedding",
        "mapping": {"type": "dense_vector", "dims": 300},
    }
}
_location_template = {
    "geo": {
        "match": "location",
        "mapping": {"type": "geo_point"},
    }
}
_tags_template = {
    "tags": {
        "match": "tags",
        "mapping": {
            "type": "nested",
            "properties": {
                "tag": {"type": "text"},
                "weight": {"type": "float"},
            },
        },
    }
}

mapping = {
    "mappings": {
        "dynamic_templates": [
            _embedding_template,
            _location_template,
            _tags_template,
        ]
    }
}

In [210]:
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

# Elasticsearch client with default connection settings and timeout=600.
es = Elasticsearch(timeout=600)

In [214]:
# Name of the target index.
index='ta-embs-tags'

In [216]:
# Create the index with the mapping defined in `mapping`.
es.indices.create(index, body=mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ta-embs-tags'}

In [258]:
# One dict per exported row.
records = export_df.to_dict('records')
len(records)

1999

In [259]:
import simplejson
# Bulk body: for every record, an action line followed by the document.
# simplejson is used for the documents because ignore_nan=True is passed.
lines = []
for idx, doc in enumerate(records):
    action = { "index" : { "_index" : index, "_id" : f"{idx}"}}
    lines.extend([json.dumps(action), simplejson.dumps(doc, ignore_nan=True)])
body = '\n'.join(lines)
res = es.bulk(index=index, body=body, timeout='10m')
print(f'took: {res["took"]}, errors: {res["errors"]}')

took: 5704, errors: False


In [260]:
# Bulk items whose status is neither 200 nor 201.
[item for item in res['items'] if item['index']['status'] not in (200, 201)]

[]

# NER test

In [None]:
# NOTE(review): a Client with the same settings is already created earlier in
# this notebook; this line starts another one — confirm that is intended.
client = Client(n_workers=4, threads_per_worker=1)
def ner(texts):
    """Return, for every text, its list of ``(entity text, entity label)`` pairs.

    Loads the ``en_core_web_sm`` pipeline on each call.
    """
    nlp = spacy.load("en_core_web_sm")
    entities_per_text = []
    for desc in texts:
        doc = nlp(desc)
        entities_per_text.append([(ent.text, ent.label_) for ent in doc.ents])
    return entities_per_text
# NOTE(review): this expects `reviews` to be a flat list of review texts, but
# the only visible assignment of `reviews` is the per-row loop variable in the
# review-aggregation cell — confirm where the list is meant to come from.
# ner() returns one entity list per text; flatten() merges them into a single
# sequence of (text, label) pairs.
partitions = math.ceil(len(reviews)/5000)
with ProgressBar():
    rev_db = db.from_sequence(reviews).repartition(partitions).map_partitions(ner).flatten().compute()


In [None]:
# Persist the extracted (text, label) entity pairs.
with open('entities.json', 'w+') as f:
    json.dump(rev_db, f)

In [None]:
# 50 most frequent entity texts labelled GPE.
Counter(en for en, label in rev_db if label == 'GPE').most_common(50)

In [None]:
# `nlp` is not assigned until the fastText section further down, so on a
# top-to-bottom run this cell raised NameError. Use the pipeline loaded
# earlier as `nlp_model` instead; pipe() yields one Doc per text, in order.
# NOTE(review): nlp_model is en_core_web_md, whereas the later `nlp` is
# en_core_web_sm — confirm which model was intended here.
desc_docs = list(nlp_model.pipe(descriptions))

In [None]:
# Render the entities of the first five review docs.
# NOTE(review): `rev_docs` is not defined anywhere in the visible notebook —
# confirm which cell is supposed to create it.
for doc in rev_docs[:5]:
    displacy.render(doc, style="ent")

In [None]:
# Render the entities of the first five description docs.
for doc in desc_docs[:5]:
    displacy.render(doc, style="ent")

In [None]:
# 100 most frequent entity texts labelled LOC.
# NOTE(review): depends on `rev_docs`, which is not defined anywhere in the
# visible notebook.
all_ents = [e.text for doc in rev_docs for e in doc.ents if e.label_ == "LOC"]
Counter(all_ents).most_common(100)

In [None]:
# Export lower-cased, de-duplicated attraction names, most reviewed first.
# data_df itself has no `num_reviews` column (it only exists on the
# `attractions` / `restaurants` slices), so it is computed here before
# sorting. The .str accessors leave missing names as NaN instead of failing.
attraction_names = (
    data_df[data_df.category == 'attraction']
    .assign(num_reviews=lambda df: df['reviews'].apply(
        lambda revs: len(revs) if isinstance(revs, (list, tuple)) else 0))
    .sort_values('num_reviews', ascending=False)['name']
    .str.strip()
    .str.lower()
    .drop_duplicates()
)
attraction_names.to_csv('attractions', index=False, header=False)

# Test embeddings

### LASER

In [None]:
# Sentence embeddings for the first 1000 reviews.
# NOTE(review): `Laser` is not imported anywhere in the visible notebook, and
# `reviews` is only assigned as a loop variable in the aggregation cell —
# confirm both before running.
laser = Laser()
embs = laser.embed_sentences(reviews[:1000], lang='en')
# https://projector.tensorflow.org/
# https://projector.tensorflow.org/

In [None]:
# Save the embeddings in NumPy binary format.
np.save('review-embs', embs)

In [None]:
# Save the embeddings as tab-separated text.
np.savetxt('review-embs.txt', embs, delimiter='\t')

In [None]:
# One review per line: newlines replaced by spaces, cut to 200 characters.
with open("reviews-10000.txt", "w") as outf:
    outf.write('\n'.join([r.replace('\n',' ')[:200] for r in reviews[:10000]]))

### fasttext

In [None]:
import fasttext
# Load the fastText model from a local .bin file.
model = fasttext.load_model('/Users/danmer/deep-pdf-data/cc.en.300.bin')

In [None]:
# spaCy pipeline used by the fastText loop below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Mean fastText vector per review, over tokens found in the model vocabulary.
# `token.text in model.words` scanned the word list for every token; the set
# is built once and makes each membership test constant-time.
vocab = set(model.words)
emb_dim = model.get_dimension()

review_embs = []
for review in tqdm(reviews[:10000]):
    doc = nlp(review)
    embs = [model[token.text] for token in doc if token.text in vocab]
    if embs:
        mean_emb = np.mean(np.array(embs), axis=0)
    else:
        # No in-vocabulary token: np.mean over an empty array yields a scalar
        # NaN, which would make review_embs ragged. Use a zero vector.
        mean_emb = np.zeros(emb_dim, dtype='float32')
    review_embs.append(mean_emb)

In [None]:
# Save the fastText review embeddings as tab-separated text.
np.savetxt('review-embs-fasttext-10000.txt', np.array(review_embs), delimiter='\t')

In [None]:
# Save the fastText review embeddings in NumPy binary format.
np.save('review-embs-fasttext-10000.npy', np.array(review_embs))

In [None]:
# One review per line: newlines replaced by spaces, cut to 200 characters.
# NOTE(review): `all_reviews` is not defined anywhere in the visible notebook.
with open("reviews-26k.txt", "w") as outf:
    outf.write('\n'.join([r.replace('\n',' ')[:200] for r in all_reviews]))

# Topic modelling

In [97]:
# LDA with 200 topics, trained on the TF-IDF-transformed corpus.
# Options tried earlier and left disabled: random_state=2, update_every=1,
# passes=10.
lda_model = LdaModel(
    corpus=transformed_tfidf,
    id2word=docs_dict,
    num_topics=200,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Inspect a sample of the learned topics.
lda_model.show_topics()

In [None]:
# Interactive pyLDAvis view, prepared from the bag-of-words corpus.
dash = pyLDAvis.gensim.prepare(lda_model, docs_corpus, docs_dict)
dash

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant LDA topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Object supporting ``ldamodel[corpus]``, ``show_topic(topic_id)`` and
        a ``per_word_topics`` attribute.
    corpus
        Documents to score; passed to ``ldamodel[...]`` unchanged.
    texts : sequence
        One label or text per document, appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by one unnamed column (label ``0``)
        holding ``texts``.
    """
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics set, each result is a tuple whose first item
        # is the (topic id, proportion) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # No topic reported: keep a placeholder so that the rows stay
            # aligned with `texts`.
            records.append((None, None, ''))
            continue
        # max() returns the first of equal maxima, like the old
        # sort-descending-and-take-first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(float(prop_topic), 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)
# Dominant topic per attraction. The last column ("Text") holds the
# attraction names, since the keys of att_agg_reviews are passed as `texts`.
df_topic_sents_keywords = format_topics_sentences(lda_model, docs_corpus, list(att_agg_reviews.keys()))

# Move the row index into a column and give every column a readable name.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic.head(20)