In [1]:
import pandas as pd
import json
import spacy
from spacy import displacy
from tqdm import tqdm
from collections import Counter
# from dask.distributed import Client, progress
# import dask.bag as db
import math
# from dask.diagnostics import ProgressBar
# from laserembeddings import Laser
import numpy as np
import matplotlib.pyplot as plt
import numpy as np



# Loading

In [2]:
def extract_category(row):

    cat = row.category if 'category' in row else None
    
    if cat is None or pd.isna(cat) and 'type' in row and not pd.isna(row.type):
        return row.type
    elif isinstance(cat,dict) and 'key' in cat and not pd.isna(cat['key']):
        return cat['key']
    elif 'ranking_category' in row and not pd.isna(row.ranking_category):
        return row.ranking_category
    return cat

In [4]:
data_df = pd.read_json('/Users/danmer/travel_data/source/bc_clean.json')
van_df = pd.read_json('/Users/danmer/travel_data/source/vancouver_clean.json')
data_df = pd.concat([data_df, van_df])

In [5]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26241 entries, 0 to 6211
Data columns (total 73 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id                                         20513 non-null  float64
 1   type                                       20513 non-null  object 
 2   name                                       26241 non-null  object 
 3   awards                                     26241 non-null  object 
 4   rankingPosition                            4377 non-null   float64
 5   priceLevel                                 20513 non-null  object 
 6   category                                   10105 non-null  object 
 7   rating                                     23172 non-null  float64
 8   hotelClass                                 1323 non-null   float64
 9   phone                                      23614 non-null  object 
 10  address                

In [6]:
data_df.category = data_df.apply(extract_category, axis=1)

In [7]:
data_df.category.unique()

array(['hotel', 'HOTEL', 'RESTAURANT', 'attraction', 'restaurant'],
      dtype=object)

In [8]:
data_df.category.value_counts()

RESTAURANT    16108
attraction     5728
restaurant     3082
hotel          1295
HOTEL            28
Name: category, dtype: int64

In [192]:
attractions = data_df[(data_df.category == 'attraction') & (data_df.rating.notnull())][['name','rating','reviews', 'website']]
attractions['num_reviews'] = attractions.apply(lambda r: len(r.reviews) if 'reviews' in r else 0, axis=1)
attractions.sort_values('num_reviews',ascending=False).drop_duplicates('name').head(5)

Unnamed: 0,name,rating,reviews,website,num_reviews
17548,Stanley Park,4.5,"[{'language': 'en', 'title': 'Stanley Park Je...",http://vancouver.ca/parks-recreation-culture/s...,2072
17229,Granville Island,4.5,"[{'language': 'en', 'title': 'Very quiet islan...",http://granvilleisland.com/,1719
4173,Rocky Mountaineer,4.5,"[{'language': 'en', 'title': 'Canada closing b...",http://www.rockymountaineer.com/en_US/routes_a...,1236
17885,The Butchart Gardens,4.5,"[{'language': 'en', 'title': 'Walk in the gard...",http://www.butchartgardens.com,995
16860,Capilano Suspension Bridge Park,4.5,"[{'language': 'en', 'title': 'Amazing ', 'text...",http://www.capbridge.com/,521


In [133]:
attractions[attractions.rating == 5].sort_values('num_reviews',ascending=False).drop_duplicates('name').head(5)

Unnamed: 0,name,rating,reviews,website,num_reviews
17952,Vancouver Seawall,5.0,"[{'language': 'en', 'title': 'Rented bikes and...",http://www.tourismvancouver.com/activities/hik...,259
17630,Wild Pacific Trail,5.0,"[{'language': 'en', 'title': 'Trail run', 'tex...",http://www.wildpacifictrail.com/,180
16761,Myra Canyon Park,5.0,"[{'language': 'en', 'title': 'Amazing!', 'text...",http://www.myratrestles.com,124
16942,Chesterman Beach,5.0,"[{'language': 'en', 'title': 'stunning beach',...",,88
16781,Pacific Rim National Park,5.0,"[{'language': 'en', 'title': 'Stunning', 'text...",http://pc.gc.ca/pacificrim,68


In [44]:
restaurants = data_df[(data_df.category == 'RESTAURANT') & (data_df.rating.notnull())][['name','rating','reviews']]
restaurants['num_reviews'] = restaurants.apply(lambda r: len(r.reviews) if 'reviews' in r else 0, axis=1)
restaurants.sort_values('num_reviews',ascending=False).drop_duplicates('name').head(5)

Unnamed: 0,name,rating,reviews,num_reviews
1610,Glowbal,4.5,[{'text': 'We went to Glowbal to celebrate my ...,855
1872,Black + Blue,4.5,[{'text': 'Had a good time and meal there. Had...,575
1659,IL Terrazzo,4.5,[{'text': 'Great choice in Victoria! 1. Food...,482
1371,Italian Kitchen,4.5,[{'text': 'The restaurant is bustling and hect...,463
1528,Cardero's Restaurant & Live Bait Marine Pub,4.5,[{'text': 'Went for a casual lunch and as usua...,409


In [12]:
descriptions = list(data_df[(data_df.category == 'attraction') & (data_df.description.notnull()) \
                            & (data_df.description!='')].description)
len(descriptions)

1385

## review embeddings

In [17]:
laser = Laser()
embs = laser.embed_sentences(reviews[:1000], lang='en')
# https://projector.tensorflow.org/

In [20]:
np.save('review-embs', embs)

In [23]:
np.savetxt('review-embs.txt', embs, delimiter='\t')

In [45]:
with open("reviews-10000.txt", "w") as outf:
    outf.write('\n'.join([r.replace('\n',' ')[:200] for r in reviews[:10000]]))

In [5]:
import fasttext
model = fasttext.load_model('/Users/danmer/deep-pdf-data/cc.en.300.bin')



In [2]:
nlp = spacy.load("en_core_web_sm")

In [42]:
review_embs = []
for review in tqdm(reviews[:10000]):
    doc = nlp(review)
    embs = []
    for token in doc:
        if token.text in model.words:
            embs.append(model[token.text])
    mean_emb = np.mean(np.array(embs), axis=0)
    review_embs.append(mean_emb)

100%|██████████| 10000/10000 [13:03<00:00, 12.76it/s] 


In [44]:
np.savetxt('review-embs-fasttext-10000.txt', np.array(review_embs), delimiter='\t')

In [46]:
np.save('review-embs-fasttext-10000.npy', np.array(review_embs))

In [77]:
with open("reviews-26k.txt", "w") as outf:
    outf.write('\n'.join([r.replace('\n',' ')[:200] for r in all_reviews]))

## Glove + TF/IDF

In [9]:

att_agg_reviews = {}
skipped = 0
att_df = data_df[(data_df.category == 'attraction')].drop_duplicates('name')[['name', 'reviews']]
for idx, (id, row) in enumerate(att_df.iterrows()):
    name, reviews = row
    res_reviews = []

    for rev in reviews:
        try:
            if rev['language'] == 'en':
                res_reviews.append(rev['text'])
        except Exception as ex:
            skipped += 1
            pass
    att_agg_reviews[name] = '\n'.join(res_reviews)
print(f'skipped: {skipped}')
len(att_agg_reviews)

skipped: 0


1999

In [10]:
import spacy
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
import numpy as np

nlp  = spacy.load('en_core_web_md')

In [11]:

def nlp_doc(doc):
    def keep_token(t):
        return (t.is_alpha and 
                not (t.is_space or t.is_punct or 
                     t.is_stop or t.like_num))

    def lemmatize_doc(doc):
        return [ t.lemma_ for t in doc if keep_token(t)]
    
    return lemmatize_doc(nlp(doc))
    

def build_tfidf(docs, path):
    docs = [nlp_doc(d) for d in docs]
    docs_dict = Dictionary(docs)
    docs_dict.filter_extremes(no_below=20, no_above=0.2)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    
    model_tfidf.save(path)
    
def load_tfidf(path):
    return TfidfModel.load(path)

def get_embs(model_tfidf, docs):
    # http://dsgeek.com/2018/02/19/tfidf_vectors.html
    docs = [nlp_doc(d) for d in docs]
    docs_tfidf  = model_tfidf[docs]
    docs_vecs   = np.vstack([sparse2full(c, len(docs)) for c in docs_tfidf])

    tfidf_emb_vecs = np.vstack([nlp(docs[i]).vector for i in range(len(docs))])
    docs_emb = np.dot(docs_vecs, tfidf_emb_vecs) 
    
    return docs_emb


In [37]:
docs = list(att_agg_reviews.values())[:10]

In [40]:
docs_ = [nlp_doc(d) for d in docs]

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit(docs_)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
docs

In [29]:
len(X.get_feature_names())

32540

In [30]:
build_tfidf(att_agg_reviews.values(), 'reviews_tfidf')

In [13]:
tfidf = load_tfidf('reviews_tfidf')

In [14]:
agg_reviews_embs = get_embs(tfidf, att_agg_reviews.values())

ValueError: too many values to unpack (expected 2)

In [23]:
tfidf.num_docs

1999

In [20]:
docs = list(att_agg_reviews.values())[:10]
docs = [nlp_doc(d) for d in docs]
docs_tfidf  = model_tfidf[docs]
np.vstack([sparse2full(c, len(docs)) for c in docs_tfidf])



NameError: name 'model_tfidf' is not defined

In [206]:
agg_reviews_embs.shape

(1999, 300)

In [195]:
np.savetxt('attractions-agg-reviews-embs.txt', agg_reviews_embs, delimiter='\t')

In [196]:
with open("attractions-agg.txt", "w") as outf:
    outf.write('\n'.join([k.replace('\n',' ') for k in att_agg_reviews.keys()]))

## Elastic export 

In [204]:
export_df = data_df.drop(columns=['reviews']).drop_duplicates('name')

In [333]:
emb_dict = {k:v for k,v in zip(att_agg_reviews.keys(), agg_reviews_embs.tolist())}
len((emb_dict['Jericho Beach'])

300

In [384]:
default = np.ones((300,), dtype='float32').tolist()
export_df['embedding'] = export_df.apply(lambda x: emb_dict[x['name']] if x['name'] in emb_dict and all(e != 0 for e in emb_dict[x['name']]) else default, axis=1)

In [385]:
assert export_df[export_df.name == 'Jericho Beach'].embedding.values[0] == emb_dict['Jericho Beach']

In [386]:
export_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14924 entries, 0 to 14923
Data columns (total 74 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id                                         12960 non-null  float64
 1   type                                       12960 non-null  object 
 2   name                                       14924 non-null  object 
 3   awards                                     14924 non-null  object 
 4   rankingPosition                            1182 non-null   float64
 5   priceLevel                                 12960 non-null  object 
 6   category                                   14924 non-null  object 
 7   rating                                     13348 non-null  float64
 8   hotelClass                                 1210 non-null   float64
 9   phone                                      13730 non-null  object 
 10  address               

In [287]:
export_df['location'] = export_df.apply(lambda row: f'{row.latitude if "latitude" else row or 0},{row.longitude if "longitude" in row else 0}', axis=1)

In [387]:
export_df.to_json('data/elastic/bc-van-emb.json', orient='records', lines=True)

In [None]:
  "embedding": {
    "type": "dense_vector",
    "dims": 300
  },
     "location": {
        "type": "geo_point"
      }

In [244]:
test_emb = export_df[export_df.name == 'Jericho Beach'].embedding.values[0]
len(test_emb)

300

In [42]:
from elasticsearch import Elasticsearch
es = Elasticsearch()


In [45]:
# query = {
#   "size": 10,
#   "query": {
#     "script_score": {
#       "query": {
#           "bool": {
#               "must": [
#                 {
#                   "match": {
#                     "category": "attraction"
#                   }
#                 }
#               ]
#             }
#       },
#       "script": {
#         "source": "cosineSimilarity(params.queryVector, 'embedding')+1.0",
#         "params": {
#           "queryVector": test_emb
#         }
#       }
#     }
#   }
# }

# query = {
#   "query": {
#     "bool": {
#       "must": {
#         "match": {
#           "category": "attraction"
#         }
#       },
#       "should": {
#         "distance_feature": {
#           "field": "location",
#           "pivot": "1000m",
#           "origin": { 
#             "lat": 49.125433, 
#             "lon": -123.194445
#           }
#         }
#       }
#     }
#   }
# }

query = {
  "query": {
        "term": {
            "_id": {
                "value": "inusJHIBLQnqvrvrsAV_"
            }
        }
      
}
}

# query = {
#   "query": {
#     "bool": {
#       "must": {
#         "match": {
#           "category": "attraction"
#         }
#       },
#       "should": {
#         "distance_feature": {
#           "field": "location",
#           "pivot": "1000m",
#           "origin": {
#             "lat": res['hits']['hits'][0]['_source']['latitude'],
#             "lon": res['hits']['hits'][0]['_source']['longitude']
#           }
#         }
#       }
#     }
#   }
# }

res = es.search(index="ta-embs", body=query)

print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print("%(name)s " % hit["_source"])

Got 0 Hits:


In [422]:
res['hits']['hits'][0]['_source']

{'id': None,
 'type': None,
 'name': 'Jericho Beach',
 'awards': [{'award_type': 'CERTIFICATE_OF_EXCELLENCE',
   'year': '2019',
   'images': {'small': 'https://www.tripadvisor.com/img/cdsi/img2/awards/CERTIFICATE_OF_EXCELLENCE_small-0-5.jpg',
    'large': 'https://www.tripadvisor.com/img/cdsi/img2/awards/CERTIFICATE_OF_EXCELLENCE_2019_en_US_large-0-5.jpg'},
   'categories': [],
   'display_name': 'Certificate of Excellence 2019'},
  {'award_type': 'CERTIFICATE_OF_EXCELLENCE',
   'year': '2018',
   'images': {'small': 'https://www.tripadvisor.com/img/cdsi/img2/awards/CERTIFICATE_OF_EXCELLENCE_small-0-5.jpg',
    'large': 'https://www.tripadvisor.com/img/cdsi/img2/awards/CERTIFICATE_OF_EXCELLENCE_2018_en_US_large-0-5.jpg'},
   'categories': [],
   'display_name': 'Certificate of Excellence 2018'},
  {'award_type': 'CERTIFICATE_OF_EXCELLENCE',
   'year': '2017',
   'images': {'small': 'https://www.tripadvisor.com/img/cdsi/img2/awards/CERTIFICATE_OF_EXCELLENCE_small-0-5.jpg',
    'large':

# Topics

In [12]:
client = Client(n_workers=4, threads_per_worker=1)
client

0,1
Client  Scheduler: tcp://127.0.0.1:61738  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [32]:
def ner(texts):
    nlp = spacy.load("en_core_web_sm")
    docs = [nlp(desc) for desc in texts]
    return [[(e.text, e.label_) for e in doc.ents] for doc in docs ]
partitions = math.ceil(len(reviews)/5000)
with ProgressBar():
    rev_db = db.from_sequence(reviews).repartition(partitions).map_partitions(ner).flatten().compute()


In [36]:
with open('entities.json', 'w+') as f:
    json.dump(rev_db, f)

In [41]:
Counter([en for en, label in rev_db if label == 'GPE']).most_common(50)

[('Vancouver', 13072),
 ('Victoria', 4620),
 ('Canada', 3964),
 ('Granville Island', 1912),
 ('Vancouver Island', 904),
 ('BC', 896),
 ('Kelowna', 896),
 ('British Columbia', 621),
 ('Nanaimo', 512),
 ('Okanagan', 508),
 ('Whistler', 468),
 ('Granville', 396),
 ('Seattle', 364),
 ('US', 352),
 ('Granville island', 304),
 ('Vernon', 300),
 ('Richmond', 292),
 ('UK', 268),
 ('Tofino', 257),
 ('Aquabus', 252),
 ('Penticton', 244),
 ('B.C.', 232),
 ('Kamloops', 212),
 ('vancouver', 212),
 ('China', 208),
 ('Calgary', 200),
 ('Alaska', 184),
 ('Australia', 180),
 ('Washington', 176),
 ('Port Angeles', 172),
 ('Vail', 164),
 ('Gastown', 144),
 ('Golden', 140),
 ('Peachland', 132),
 ('Parksville', 132),
 ('Alberta', 128),
 ('Osoyoos', 128),
 ('San Francisco', 120),
 ('Ucluelet', 116),
 ('Barkerville', 116),
 ('Georgia', 108),
 ('Blackcomb', 108),
 ('Salt Spring Island', 108),
 ('California', 100),
 ('Long Beach', 100),
 ('Burnaby', 100),
 ('London', 96),
 ('Toronto', 96),
 ('France', 92),
 ('

In [25]:
desc_docs = [nlp(desc) for desc in descriptions]

In [None]:
for doc in rev_docs[:5]:
    displacy.render(doc, style="ent")

In [None]:
for doc in desc_docs[:5]:
    displacy.render(doc, style="ent")

In [None]:
all_ents = [e.text for doc in rev_docs for e in doc.ents if e.label_ == "LOC"]
Counter(all_ents).most_common(100)

  3%|▎         | 3369/107078 [01:00<19:31, 88.53it/s]

[('Canada', 196),
 ('Vancouver', 172),
 ('British Columbia', 141),
 ('one', 136),
 ('BC', 128),
 ('Vancouver Island', 112),
 ('first', 80),
 ('daily', 76),
 ('Victoria', 64),
 ('two', 56),
 ('summer', 48),
 ('Canadian', 48),
 ('winter', 40),
 ('Okanagan', 40),
 ('year-round', 36),
 ('three', 32),
 ('Whistler', 32),
 ('North America', 32),
 ('Okanagan Valley', 32),
 ('Bastion', 32),
 ('7 days', 32),
 ('Today', 28),
 ('Saturday', 28),
 ('First Nations', 28),
 ('year', 28),
 ("British Columbia's", 28),
 ('annual', 28),
 ('Metro Vancouver', 28),
 ('Haida', 28),
 ('Penticton', 28),
 ('the Sunshine Coast', 24),
 ('Japanese', 24),
 ('European', 24),
 ('the Okanagan Valley', 24),
 ('the Naramata Bench', 24),
 ('today', 24),
 ('1', 24),
 ('the year', 21),
 ('Long Beach', 20),
 ('Kamloops', 20),
 ('Sea', 20),
 ('3', 20),
 ('the west coast', 20),
 ('Northern BC', 20),
 ('over 100', 20),
 ('Campbell River', 20),
 ('four', 20),
 ('Sidney', 20),
 ('the Salish Sea', 20),
 ('2', 20)]

In [54]:
data_df[(data_df.category == 'attraction')].sort_values('num_reviews',ascending=False).name.apply(lambda x: x.strip().lower()).drop_duplicates().to_csv('attractions', index=False, header=False)