### Get a list of credit card merchants, search in google & scrap the descrition

In [4]:
import requests
from bs4 import BeautifulSoup
import time
from pathlib import Path
import os

USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}


def fetch_results(search_term, number_results, language_code):
    assert isinstance(search_term, str), 'Search term must be a string'
    assert isinstance(number_results, int), 'Number of results must be an integer'
    escaped_search_term = search_term.replace(' ', '+')

    google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(escaped_search_term, number_results, language_code)
    response = requests.get(google_url, headers=USER_AGENT)
    response.raise_for_status()

    return search_term, response.text


def parse_results(html, keyword):
    soup = BeautifulSoup(html, 'html.parser')

    found_results = []
    rank = 1
    result_block = soup.find_all('div', attrs={'class': 'g'})
    for result in result_block:

        link = result.find('a', href=True)
        title = result.find('h3', attrs={'class': 'r'})
        description = result.find('span', attrs={'class': 'st'})
        if link and title:
            link = link['href']
            title = title.get_text()
            description = description.get_text()
            if link != '#':
                found_results.append({'keyword': keyword, 'rank': rank, 'title': title, 'description': description})
                rank += 1
    return found_results

import json

def export_file(filename, list_name):
    with open(filename, 'w') as f:
        for search in list_name:
            f.write(json.dumps(search["description"]))

def scrape_google(search_term, number_results, language_code):
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        results = parse_results(html, keyword)
        file_name = search_term + '.txt'
        Full_path = os.path.join("cc_desc", file_name)
        
        with open(Full_path, 'w', encoding='utf-8') as f:
                for search in results:
                    f.write(json.dumps(search["description"], ensure_ascii=False))
        return results
    except AssertionError:
        raise Exception("Incorrect arguments parsed to function")
    except requests.HTTPError:
        raise Exception("You appear to have been blocked by Google")
    except requests.RequestException:
        raise Exception("Appears to be an issue with your connection")


#if __name__ == '__main__':
#    keywords = ['edmund martin', 'python', 'google scraping']
#    data = []
#    for keyword in keywords:
#        try:
#            results = scrape_google(keyword, 100, "en")
#            for result in results:
#                data.append(result)
#        except Exception as e:
#            print(e)
#        finally:
#            time.sleep(10)
    #print(data)

In [None]:
Pacific_place = scrape_google("Starbucks",5,"en")

In [5]:
CDC_HK = scrape_google("Cafe de carol Hong Kong openrice",5,"en")

In [256]:
TW_HK = scrape_google("Tsui Wah Hong Kong Openrice",5,"en")

In [257]:
print(TW_HK)

[{'keyword': 'Tsui Wah Hong Kong Openrice', 'rank': 1, 'title': 'Tsui Wah Restaurant - Hong Kong Style Meatless Menu ... - Openrice', 'description': "翠華餐廳Tsui Wah Restaurant's Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at G-2/F, 15-19 Wellington Street Central. Signature dishes include 奶茶, 海南雞飯, 凍檸茶, 鹿儿岛猪软骨鲜虾菠菜饺米线, . 連鎖式經營的港式茶餐廳，以「魚蛋稱霸、咖喱稱王」做口號。"}, {'keyword': 'Tsui Wah Hong Kong Openrice', 'rank': 2, 'title': 'Tsui Wah Restaurant - Hong Kong Style Meatless Menu ... - Openrice', 'description': "翠華餐廳Tsui Wah Restaurant's Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at G/F, 84-86 Des Voeux Road Central Central. Signature dishes include 熱奶茶, . 連鎖式經營的港式茶餐廳，以「魚蛋稱霸、咖喱稱王」做口號。"}, {'keyword': 'Tsui Wah Hong Kong Openrice', 'rank': 3, 'title': "Tsui Wah Restaurant's Menu - Hong Kong Style Meatless ... - Openrice", 'description': "翠華餐廳Tsui Wah Restaurant's Menu, located at G-2/F, 15-19 Wellington Street Central."}, {'keyword': 'Tsui Wah Hong Kon

In [259]:
print(CDC_HK)

[{'keyword': 'Cafe de carol Hong Kong openrice', 'rank': 1, 'title': 'Hong Kong Restaurant Search : Cafe De Coral | OpenRice Hong Kong', 'description': 'Find Cafe De Coral, Food & Dining places in Hong Kong at OpenRice Hong Kong.'}, {'keyword': 'Cafe de carol Hong Kong openrice', 'rank': 2, 'title': 'Café de Coral - Hong Kong Style Meatless Menu Fast Food ... - Openrice', 'description': "大家樂Café de Coral's Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at Shop40-50, 2/F, Sun Tuen Mun Shopping Centre, 55-65 Lung Mun Road Tuen Mun."}, {'keyword': 'Cafe de carol Hong Kong openrice', 'rank': 3, 'title': 'Café de Coral - Hong Kong Style Meatless Menu Fast Food ... - Openrice', 'description': "大家樂Café de Coral's Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at UG-1/F, Chinachem Tower, 34-37 Connaught Road Central."}, {'keyword': 'Cafe de carol Hong Kong openrice', 'rank': 4, 'title': 'Café de Coral - Hong Kong Style Meatless Menu Fast Food ... -

In [369]:
Caltex_HK = scrape_google("Caltax Hong Kong",5,"en")

In [370]:
print(Caltex_HK)

[{'keyword': 'Caltax Hong Kong', 'rank': 1, 'title': 'Find Caltex Petrol Service Stations in Hong Kong - | Caltex Hong Kong', 'description': 'Use this interactive feature to locate Caltex stations nearest to you along with the various amenities, utilities each has to offer; and plan your way goin.'}, {'keyword': 'Caltax Hong Kong', 'rank': 2, 'title': 'Caltex in Hong Kong', 'description': 'As a Global Energy Company we look to offer a host of quality products & services in Hong Kong to make your journey better.'}, {'keyword': 'Caltax Hong Kong', 'rank': 3, 'title': 'Promotions, Discounts & More - Caltex Hong Kong | Caltex Hong Kong', 'description': 'Star Mart Monthly Offer. Look out for these February deals at Star Mart when you drop by any Caltex station. ... Fuel up with these credit cards or your Caltex JoyFuel Card to enjoy a fuel discount of HK$0.9/litre.'}, {'keyword': 'Caltax Hong Kong', 'rank': 4, 'title': 'Caltex HK - 5 Star Techron Fuel, Clean & Glide Tech | Caltex Hong Kong'

In [6]:
my_dict =[{'search_name': 'Pacific Place', 'arr_name':"Pacific_Place"}, 
          {'search_name':"Lee Garden",'arr_name':'Lei_Garden'},
           {'search_name':"SamSung",'arr_name':'SamSung'},
           {'search_name':"Broadway HK",'arr_name':'Broadway HK'},
           {'search_name':"Fortress HK",'arr_name':'Fortress'},
           {'search_name':"Genki Hong Kong",'arr_name':'Genki_HK'},
          {'search_name':"Caltax Hong Kong",'arr_name':'Caltex_HK'},
          {'search_name':"Shell Hong Kong",'arr_name':'Shell_HK'},
          {'search_name':"Samsonite",'arr_name':'Samsonite'},
          {'search_name':"Island Shangrila",'arr_name':'Shangrila'},
          {'search_name':"Marriot",'arr_name':'Marriot'},
         ]

for l in my_dict:
    #print(l['search_name'], l['arr_name'])
    l['arr_name'] = scrape_google(l['search_name'],5,"en")

In [7]:
my_dict =[{'search_name': 'Pacific Coffee', 'arr_name':"Pacific_coffee"}, 
          {'search_name':"Starbuck",'arr_name':'Starbuck'},
           {'search_name':"Simply Life HK",'arr_name':'Simply_life'},
           {'search_name':"Fairwood HK",'arr_name':'fairwood'},
           {'search_name':"MX HK",'arr_name':'MX_HK'},
           {'search_name':"Shushi Hiro",'arr_name':'Shushi_Hiro'},
          {'search_name':"Mandarin Oriental",'arr_name':'MO_HK'},
          {'search_name':"Conard",'arr_name':'Conrad_HK'},
          {'search_name':"Excelsior HK",'arr_name':'Excelsior'},
          {'search_name':"Parklane hotel",'arr_name':'Parklane HK'},
          {'search_name':"Shushi Senro",'arr_name':'Shushi_senro'},
         ]

for l in my_dict:
    #print(l['search_name'], l['arr_name'])
    l['arr_name'] = scrape_google(l['search_name'],5,"en")

### import the exported text description

In [1]:
import glob
files = glob.glob("cc_desc/*.txt")
t = []

# iterate over the list getting each file 
for fle in files:
   # open the file and then call .read() to get the text 
    with open(fle) as f: 
        t.append(f.read())
        
    

In [13]:
print(t[:3])

['"Find Cafe De Coral, Food & Dining places in Hong Kong at OpenRice Hong Kong.""大家樂Café de Coral\'s Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at Shop40-50, 2/F, Sun Tuen Mun Shopping Centre, 55-65 Lung Mun Road Tuen Mun.""大家樂Café de Coral\'s Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at UG-1/F, Chinachem Tower, 34-37 Connaught Road Central.""大家樂Café de Coral\'s Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at B/F, Argyle Centre Phase 1, 688 Nathan Road Mong Kok.""大家樂Café de Coral\'s Address, Telephone Number, Ratings, Reviews, Photos and Menu, located at Shop 23, L2, Festival Walk, 80 Tat Chee Avenue Kowloon Tong. 大家樂由早到晚均提供豐富美食選擇，價格相宜，糯米雞、一哥焗豬扒飯與香滑奶茶是必食之選。"', '"Our luxury 5-star Island Shangri-La, Hong Kong provides comfortably appointed rooms, suites and restaurants as well as excellent amenities.""Located in Hong Kong, 14 miles from Ngong Ping 360 Cable Car, Island Shangri-La Hong Kong features a restaur

### remove chinese words

In [14]:
import re
def rm_chinese(in_text):
    b = in_text.encode('utf8')
    c = str(re.sub(rb'[^\x00-\x7f]',rb'',b))
    return c

f = ' '.join([rm_chinese(s) for s in t])
#t_noChin = rm_chinese(str(t))

print(f[:50])

b'"Find Cafe De Coral, Food & Dining places in Hon


In [4]:
import nltk

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import re

In [5]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
unwant_words = ['hong','kong','our','']

def rm_chinese(in_text):
    b = in_text.encode('utf8')
    c = str(re.sub(rb'[^\x00-\x7f]',rb'',b))
    return c

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    rm_ch = ' '.join([i for i in rm_chinese(stop_free).split()])
    stop_free = rm_ch
    ex_unwant = " ".join([i for i in stop_free.lower().split() if i not in unwant_words])
    punc_free = ''.join(ch for ch in ex_unwant if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    
    return normalized

doc_clean = [clean(doc).split() for doc in t]



In [6]:
print(type(doc_clean))
print(doc_clean[:3])

<class 'list'>
[['bfind', 'cafe', 'de', 'coral', 'food', 'dining', 'place', 'openrice', 'kongcaf', 'de', 'coral', 'address', 'telephone', 'number', 'rating', 'review', 'photo', 'menu', 'located', 'shop4050', '2f', 'sun', 'tuen', 'mun', 'shopping', 'centre', '5565', 'lung', 'mun', 'road', 'tuen', 'muncaf', 'de', 'coral', 'address', 'telephone', 'number', 'rating', 'review', 'photo', 'menu', 'located', 'ug1f', 'chinachem', 'tower', '3437', 'connaught', 'road', 'centralcaf', 'de', 'coral', 'address', 'telephone', 'number', 'rating', 'review', 'photo', 'menu', 'located', 'bf', 'argyle', 'centre', 'phase', '1', '688', 'nathan', 'road', 'mong', 'kokcaf', 'de', 'coral', 'address', 'telephone', 'number', 'rating', 'review', 'photo', 'menu', 'located', 'shop', '23', 'l2', 'festival', 'walk', '80', 'tat', 'chee', 'avenue', 'kowloon', 'tong'], ['bour', 'luxury', '5star', 'island', 'shangrila', 'provides', 'comfortably', 'appointed', 'room', 'suite', 'restaurant', 'well', 'excellent', 'amenitieslo

In [7]:
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [25]:
print (doc_term_matrix)

[[(0, 1), (1, 1), (2, 5), (3, 5), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 4), (10, 4), (11, 4), (12, 4), (13, 4), (14, 4), (15, 4), (16, 4), (17, 1), (18, 1), (19, 1), (20, 2), (21, 2), (22, 1), (23, 2), (24, 1), (25, 1), (26, 3), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)], [(53, 1), (54, 1), (55, 1), (56, 2), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 2), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1)], [(5, 1), (16, 1), (23, 1), (63, 2), (67, 5), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 6), (80, 5), (81, 3), (82, 6), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1

### Create Topic  model

In [17]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=7, id2word = dictionary, passes=50)



In [18]:
#Show first n important word in the topics:
ldamodel.show_topics(7,5)

[(0,
  '0.018*"x" + 0.015*"table" + 0.015*"sushi" + 0.014*"causeway" + 0.012*"bay"'),
 (1,
  '0.013*"cinema" + 0.013*"pacific" + 0.011*"place" + 0.011*"coffee" + 0.011*"caltex"'),
 (2,
  '0.016*"samsonite" + 0.013*"de" + 0.013*"coral" + 0.013*"luggage" + 0.011*"restaurant"'),
 (3,
  '0.019*"de" + 0.019*"centre" + 0.016*"located" + 0.013*"address" + 0.013*"shop"'),
 (4,
  '0.017*"product" + 0.017*"tradein" + 0.012*"service" + 0.012*"information" + 0.012*"trade"'),
 (5,
  '0.020*"hotel" + 0.017*"sushi" + 0.013*"conard" + 0.011*"reward" + 0.011*"senro"'),
 (6,
  '0.025*"restaurant" + 0.025*"wah" + 0.019*"tsui" + 0.017*"review" + 0.017*"located"')]

In [None]:
for i in ldamodel.print_topics(): 
    for j in i: print (j)

### LDA visualization

In [24]:
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
# ======= theta = Doc * Topic=======

lda_corpus2=ldamodel.get_document_topics(bow=doc_term_matrix,minimum_probability=0)

doc_topic_relation=[doc_topic_r for  doc_topic_r in lda_corpus2 ]

 

ls_transcripts=doc_clean

topic_N = 7

 

theta = np.zeros([len(ls_transcripts),topic_N])

 

for doc_index,doc in enumerate(doc_topic_relation):

                for rel in doc:

                                # print(rel)

                                theta[doc_index,rel[0]]=rel[1]

                                # print(rel[1])

df_theta = pd.DataFrame(theta)

#df_theta.to_csv(csvfile_theta, index=False, header=False)

 

 

# ======= vocab : words list=======

 

vocab=[(ldamodel.id2word[id]) for id in ldamodel.id2word]

df_vocab = pd.DataFrame(vocab)

 

 

# ======= phi : Topic * Words =======

 

phi = np.zeros([topic_N,len(vocab)])

 

for topic_i in range(topic_N):

                # print(topic_i)

                for topic_i_wordtopic in ldamodel.show_topic(topicid=topic_i,topn=len(vocab)):

                                #print(topic_i_wordtopic[0])

                                #print(vocab.index(topic_i_wordtopic[0]))

                                phi[topic_i,vocab.index(topic_i_wordtopic[0])]=topic_i_wordtopic[1]

 

df_phi = pd.DataFrame(phi,columns =vocab)

 

 

# ======= term_frequency : Docs * Words =======

# ======= doc_length : Words frequency in each Docs =======

 

 

# term_frequency = np.zeros([len(ls_transcripts),len(vocab)])

term_frequency = np.zeros(len(vocab))

doc_length = np.zeros(len(ls_transcripts))

 

for doc_index,doc in enumerate(doc_term_matrix):

                for word in doc:

                                # term_frequency[doc_index,word[0]] =word[1]

                                term_frequency[word[0]] =word[1]+term_frequency[word[0]]

                                doc_length[doc_index] =doc_length[doc_index]+word[1]

 

df_term_frequency = pd.DataFrame(term_frequency)

 

df_doc_length = pd.DataFrame(doc_length)

 

pyLDAvis.prepare(phi,

theta,

doc_length,

vocab,

term_frequency,

#R=30,

#lambda_step=0.01,

#n_jobs=-1,

#plot_opts={'xlab': 'PC1', 'ylab': 'PC2'},

#sort_topics=True

)

 

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


PreparedData(topic_coordinates=            Freq  cluster  topics         x         y
topic                                                
0      26.518012        1       1 -0.060898 -0.057544
5      20.082849        1       2  0.147184 -0.132317
1      16.155785        1       3  0.070076  0.140879
2      12.216288        1       4  0.025271  0.073832
3      11.921901        1       5 -0.105305  0.009311
6      11.302785        1       6 -0.109661 -0.044420
4       1.802380        1       7  0.033333  0.010260, topic_info=     Category       Freq           Term      Total  loglift  logprob
term                                                                
503   Default   6.000000            wah   6.000000  30.0000  30.0000
16    Default  10.000000        located  10.000000  29.0000  29.0000
337   Default   9.000000              x   9.000000  28.0000  28.0000
2     Default   8.000000             de   8.000000  27.0000  27.0000
321   Default   4.000000        product   4.000000  26.00

In [23]:
ldamodel.save('topic.model')
#from gensim.models import LdaModel
#loading = LdaModel.load('topic.model')

In [None]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)