In [170]:
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests
from urllib.parse import urlsplit

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from bs4 import Comment
import re
from langdetect import detect_langs

import matplotlib.pyplot as plt
import matplotlib.image as matimg

from PIL import Image
from io import BytesIO
from imageio import imread

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.image import load_img, img_to_array

from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

from scipy.spatial import distance
import uuid
from pathlib import Path

from process_websites import get_corpus
#from process_websites import clean_categories

W, H = 224, 224

%matplotlib inline
# Root directory that holds the data, models and WebDriver binaries.
# Export ADVERTISER_QUALITY_HOME to run the notebook on another machine
# without editing this cell; the default keeps the original location.
FOLDER_PATH = os.environ.get('ADVERTISER_QUALITY_HOME', '/home/vahidsanei_google_com/')
# Both driver binaries live under the same root, so derive them from it.
PATH_CHROME_DRIVER = os.path.join(FOLDER_PATH, 'chromedriver', 'chromedriver')
PATH_GECKO_DRIVER = os.path.join(FOLDER_PATH, 'geckodriver', 'geckodriver')

In [2]:
# Load the Yelp business table (one row per business, with its URL and the
# raw HTML of its web page in `webpage_text`) and preview the first rows.
df = pd.read_csv(os.path.join(FOLDER_PATH, 'data', 'yelp_data', 'updated', 'business.csv'))
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,url,webpage_text
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",http://www.therangeatlakenorman.com/,"<html lang=""en-US""><head>\n\t<meta charset=""UT..."
1,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",,http://www.felinus.ca,"<html xmlns=""http://www.w3.org/1999/xhtml"" xml..."
2,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726649,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",https://www.usemyguyservices.com,"<html lang=""en-US"" prefix=""og: http://ogp.me/n..."
3,cKyLV5oWZJ2NudWgqs8VZw,Oasis Auto Center - Gilbert,"1720 W Elliot Rd, Ste 105",Gilbert,AZ,85233,33.350399,-111.827142,4.5,38,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Repair, Automotive, Oil Change Stations, ...","{'Monday': '7:0-18:0', 'Tuesday': '7:0-18:0', ...",http://oasisautocenter.net,"<html lang=""en-US"" prefix=""og: http://ogp.me/n..."
4,ScYkbYNkDgCneBrD9vqhCQ,Junction Tire & Auto Service,6910 E Southern Ave,Mesa,AZ,85209,33.393885,-111.682226,5.0,18,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Auto Repair, Oil Change Stations, Automotive, ...","{'Monday': '7:30-17:0', 'Tuesday': '7:30-17:0'...",http://junctiontire.net/tires-auto-repair-mesa-az,"<html lang=""en"" style="""" class="" js rgba multi..."


In [3]:
# Column dtypes and non-null counts of the raw business table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5111 entries, 0 to 5110
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   5111 non-null   object 
 1   name          5111 non-null   object 
 2   address       4864 non-null   object 
 3   city          5111 non-null   object 
 4   state         5111 non-null   object 
 5   postal_code   5105 non-null   object 
 6   latitude      5111 non-null   float64
 7   longitude     5111 non-null   float64
 8   stars         5111 non-null   float64
 9   review_count  5111 non-null   int64  
 10  is_open       5111 non-null   int64  
 11  attributes    4398 non-null   object 
 12  categories    5108 non-null   object 
 13  hours         4343 non-null   object 
 14  url           5111 non-null   object 
 15  webpage_text  5111 non-null   object 
dtypes: float64(3), int64(2), object(11)
memory usage: 639.0+ KB


In [4]:
# Attach the text corpus extracted from each web page (see process_websites.get_corpus).
# The result gains `is_eng` and `webpage_corpus` columns.
# NOTE(review): this rebinds `df`, so re-running the cell feeds the already
# processed frame back into get_corpus — confirm that is harmless.
df = get_corpus(df, df_address_with_corpus=os.path.join(FOLDER_PATH, 'data', 'yelp_data', 'updated', 'business_with_corpus.csv'))
df.head()

The dataframe already exists. We load the existing file ...


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,url,webpage_text,is_eng,webpage_corpus
0,0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",http://www.therangeatlakenorman.com/,"<html lang=""en-US""><head>\n\t<meta charset=""UT...",True,Shooting Ranges Gun Rental Charlotte NC The Ra...
1,1,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",,http://www.felinus.ca,"<html xmlns=""http://www.w3.org/1999/xhtml"" xml...",False,
2,2,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",https://www.usemyguyservices.com,"<html lang=""en-US"" prefix=""og: http://ogp.me/n...",True,Home Renovations and Repairs Phoenix AZ Home U...
3,3,cKyLV5oWZJ2NudWgqs8VZw,Oasis Auto Center - Gilbert,"1720 W Elliot Rd, Ste 105",Gilbert,AZ,85233,33.350399,-111.827142,4.5,38,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Repair, Automotive, Oil Change Stations, ...","{'Monday': '7:0-18:0', 'Tuesday': '7:0-18:0', ...",http://oasisautocenter.net,"<html lang=""en-US"" prefix=""og: http://ogp.me/n...",False,
4,4,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,5,1,"{'RestaurantsAttire': ""u'casual'"", 'Restaurant...","Ethnic Food, Food Trucks, Specialty Food, Impo...","{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",http://www.theempanadashouse.com,<html><head>\n<title>Website Disabled</title>\...,True,Website Disabled Sorry the site you requested ...


In [5]:
# Locate the BERT release (uncased, L-12 / H-768 / A-12) and build its WordPiece tokenizer.
folder_path = os.path.join(FOLDER_PATH, 'data','uncased_L-12_H-768_A-12')
tokenizer = FullTokenizer(vocab_file=os.path.join(folder_path, 'vocab.txt'))

# Checkpoint and config paths; they are only defined here, not used in this cell.
bert_ckpt_file = os.path.join(folder_path, 'bert_model.ckpt')
bert_config_file = os.path.join(folder_path, 'bert_config.json')

# Smoke test: tokenize one sentence and show the tokens and their vocabulary ids.
tokens_test = tokenizer.tokenize('This is an open-source project for category detection of businesses based on their website contents!')
print(tokens_test)
print(tokenizer.convert_tokens_to_ids(tokens_test))

['this', 'is', 'an', 'open', '-', 'source', 'project', 'for', 'category', 'detection', 'of', 'businesses', 'based', 'on', 'their', 'website', 'contents', '!']
[2023, 2003, 2019, 2330, 1011, 3120, 2622, 2005, 4696, 10788, 1997, 5661, 2241, 2006, 2037, 4037, 8417, 999]


In [6]:
# Column dtypes and non-null counts after get_corpus added its columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5107 entries, 0 to 5106
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5107 non-null   int64  
 1   business_id     5107 non-null   object 
 2   name            5107 non-null   object 
 3   address         4863 non-null   object 
 4   city            5107 non-null   object 
 5   state           5107 non-null   object 
 6   postal_code     5101 non-null   object 
 7   latitude        5107 non-null   float64
 8   longitude       5107 non-null   float64
 9   stars           5107 non-null   float64
 10  review_count    5107 non-null   int64  
 11  is_open         5107 non-null   int64  
 12  attributes      4398 non-null   object 
 13  categories      5104 non-null   object 
 14  hours           4336 non-null   object 
 15  url             5107 non-null   object 
 16  webpage_text    5107 non-null   object 
 17  is_eng          5107 non-null   b

In [7]:
# Draw a tiny sample (0.05% of the rows) for quick experiments.
# A fixed random_state makes the sample identical on every run, so the
# outputs below are reproducible after Restart & Run All.
data = df.sample(frac=0.0005, random_state=0).reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      3 non-null      int64  
 1   business_id     3 non-null      object 
 2   name            3 non-null      object 
 3   address         3 non-null      object 
 4   city            3 non-null      object 
 5   state           3 non-null      object 
 6   postal_code     3 non-null      object 
 7   latitude        3 non-null      float64
 8   longitude       3 non-null      float64
 9   stars           3 non-null      float64
 10  review_count    3 non-null      int64  
 11  is_open         3 non-null      int64  
 12  attributes      2 non-null      object 
 13  categories      3 non-null      object 
 14  hours           3 non-null      object 
 15  url             3 non-null      object 
 16  webpage_text    3 non-null      object 
 17  is_eng          3 non-null      bool   

In [8]:
# Preview the URLs of the sampled businesses.
data['url'].head()

0                 http://5two3.com
1       http://www.ontimevegas.com
2    http://www.amhomeservices.ca/
Name: url, dtype: object

In [9]:
# Switch from the small sample to the full frame.
# Note: this is an alias, not a copy — `data` and `df` are the same object here.
data = df

In [10]:
# Show the first five URLs next to the hostname that urlsplit extracts from each.
for url_value in data['url'].iloc[:5]:
    print(url_value, ':')
    print(urlsplit(url_value).hostname)

http://www.therangeatlakenorman.com/ :
www.therangeatlakenorman.com
http://www.felinus.ca :
www.felinus.ca
https://www.usemyguyservices.com :
www.usemyguyservices.com
http://oasisautocenter.net :
oasisautocenter.net
http://www.theempanadashouse.com :
www.theempanadashouse.com


In [135]:
def retrieve_images_with_bs4(webpage_html):
    """Collect the ``src`` attribute of every ``<img>`` tag in an HTML document.

    Args:
        webpage_html: Raw HTML markup of a page.

    Returns:
        list: The ``src`` values in document order. Tags without a ``src``
        attribute are skipped. ``[]`` is returned when the markup cannot be
        parsed.
    """
    try:
        soup = BeautifulSoup(webpage_html, 'lxml')
    except Exception:
        # Best effort: an unparseable document simply contributes no images.
        # (`Exception` rather than a bare except, so Ctrl-C still interrupts.)
        return []
    # Tag.get returns None for a missing attribute, so no per-tag
    # try/except is needed to skip images that have no src.
    sources = (img_tag.get('src') for img_tag in soup.find_all('img'))
    return [src for src in sources if src is not None]

In [222]:
# Selenium seems to be a better option
def retrieve_images(url_link):
    """Render a page in headless Chrome and return the ``src`` of its images.

    Args:
        url_link: URL of the page to open.

    Returns:
        list: Non-empty ``src`` values of the ``<img>`` elements found on the
        rendered page. ``[]`` is returned when the browser cannot be started,
        the page does not load within 10 seconds, or no image appears within
        10 seconds.
    """
    try:
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Chrome(PATH_CHROME_DRIVER, options=options)
    except Exception:
        # No browser, nothing to clean up.
        return []

    try:
        driver.set_page_load_timeout(10)
        driver.implicitly_wait(10)
        driver.get(url_link)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, 'img')))
        res = []
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            try:
                src = img.get_attribute('src')
            except Exception:
                # The element may have gone stale while iterating; skip it.
                continue
            # get_attribute returns None for an <img> without src; such
            # entries can never be downloaded, so they are dropped here.
            if src:
                res.append(src)
        return res
    except Exception:
        # Timeouts and navigation errors are expected for dead sites.
        return []
    finally:
        # quit() (not close()) ends the chromedriver process as well, and the
        # finally clause guarantees it also runs on the failure paths, which
        # previously leaked one browser per failing URL.
        driver.quit()

In [215]:
def download_img(img_url):
    """Download one image and store it as the scratch file ``data/image/pic.jpg``.

    The scratch file is overwritten on every successful call.

    Args:
        img_url: URL of the image to fetch.

    Returns:
        bool: True when the image was downloaded and saved as JPEG, False on
        any failure (network error, HTTP error status, unreadable image, ...).
    """
    file_name = os.path.join(FOLDER_PATH, 'data', 'image/pic.jpg')
    try:
        r = requests.get(img_url, timeout=3)
        # Fail early on 4xx/5xx instead of trying to decode an error page.
        r.raise_for_status()
        img = Image.open(BytesIO(r.content))
        # JPEG cannot store alpha or palette data; normalising every
        # non-RGB mode (RGBA, P, LA, ...) avoids save errors.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        # save() overwrites an existing file, so no prior os.remove is needed.
        # The old os.remove raised when pic.jpg did not exist yet, which made
        # every download on a fresh machine return False.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        img.save(file_name, 'JPEG')
        return True
    except Exception:
        # Best effort by design: callers only need to know whether pic.jpg is usable.
        return False

In [216]:
def create_corpus(url_link, model, ntop=3, log=False):
    """Describe a web page by the labels a classifier predicts for its images.

    Every image found on the page is downloaded to the scratch file, resized
    to ``(W, H)``, classified with ``model``, and the names of its ``ntop``
    best predictions are collected.

    Args:
        url_link: URL of the page whose images are classified.
        model: Model whose ``predict`` output is understood by ``decode_predictions``.
        ntop: Number of top predictions kept per image.
        log: Unused.

    Returns:
        str: All collected label names joined by single spaces, with every
        non-letter character of a label replaced by a space. Empty string
        when no image could be processed.
    """
    scratch_image = os.path.join(FOLDER_PATH, 'data', 'image/pic.jpg')
    words = []
    for img_url in retrieve_images(url_link):
        if not download_img(img_url):
            continue
        pixels = img_to_array(load_img(scratch_image, target_size=(W, H)))
        # Add the leading batch axis the model expects.
        batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
        best_predictions = decode_predictions(model.predict(batch), top=ntop)[0][:ntop]
        for prediction in best_predictions:
            # prediction[1] is the human-readable label name.
            words.append(re.sub('[^a-zA-Z]', ' ', prediction[1]))
    return ' '.join(words)

In [217]:
def clean_categories(df, map_classes):
    """Collapse the free-form ``categories`` column into one coarse class per row.

    Each ``categories`` string is split on ``,``, ``;`` and ``&``. The first
    piece (after stripping whitespace) that is a key of ``map_classes``
    decides the row's class.

    Args:
        df: DataFrame with a string ``categories`` column. It is not modified.
        map_classes: Mapping from a raw category name to its coarse class.

    Returns:
        pandas.DataFrame: A new frame holding only the rows that could be
        mapped, with ``categories`` replaced by the coarse class. The original
        index labels are kept.
    """
    def first_mapped_class(raw_categories):
        # Order matters: the first recognised piece wins.
        for piece in re.split('[,;&]', raw_categories):
            name = piece.strip()
            if name in map_classes:
                return map_classes[name]
        return None

    # Work on an explicit copy so the column assignment below neither touches
    # the caller's frame nor triggers SettingWithCopyWarning.
    labelled = df[df['categories'].notnull()].copy()
    labelled['categories'] = [first_mapped_class(raw) for raw in labelled['categories']]
    return labelled[labelled['categories'].notnull()].copy()

In [218]:
# Raw Yelp category -> coarse class used as the classification target.
# Every key appears exactly once. The earlier version listed 'Breakfast' and
# 'Hair Salons' twice, and mapped 'Local Services' to both 'Health' and
# 'Local Services'; in a dict literal the last entry wins, so the effective
# mapping ('Local Services' -> 'Local Services') is what is kept here.
map_classes = {
    # Food
    'Restaurants': 'Food', 'Food': 'Food', 'Frozen Yogurt': 'Food', 'Pizza': 'Food',
    'Bars': 'Food', 'Coffee': 'Food', 'Cafes': 'Food', 'Fast Food': 'Food',
    'Bakeries': 'Food', 'Tea': 'Food', 'Breakfast': 'Food', 'Wine': 'Food',
    'Sandwiches': 'Food', 'Burgers': 'Food', 'Brunch': 'Food', 'Desserts': 'Food',
    'Vegetarian': 'Food', 'Vegan': 'Food',

    # Health
    'Health': 'Health', 'Dentists': 'Health', 'Doctors': 'Health',
    'Medical Centers': 'Health', 'Drugstores': 'Health',

    # Local services (home, automotive, pets)
    'Local Services': 'Local Services', 'Car Dealers': 'Local Services',
    'Professional Services': 'Local Services', 'Home Services': 'Local Services',
    'Garden': 'Local Services', 'Real Estate': 'Local Services',
    'Auto Repair': 'Local Services', 'Pet Services': 'Local Services',
    'Home Cleaning': 'Local Services', 'Public Services': 'Local Services',
    'Home Decor': 'Local Services', 'Automotive': 'Local Services',
    'Pets': 'Local Services',

    # Local services (beauty)
    'Hair Salons': 'Local Services', 'Nail Salons': 'Local Services',
    'Beauty': 'Local Services', 'Makeup Artists': 'Local Services',
    'Hair Removal': 'Local Services', 'Massage': 'Local Services',
    'Barbers': 'Local Services', 'Beauty Supply': 'Local Services',

    # Entertainment (leisure, travel, shopping, sports, media)
    'Entertainment': 'Entertainment', 'Event Planning': 'Entertainment',
    'Golf': 'Entertainment', 'Active Life': 'Entertainment',
    'Nightlife': 'Entertainment', 'Hotels': 'Entertainment',
    'Travel': 'Entertainment', 'Jewelry': 'Entertainment',
    'Shopping': 'Entertainment', 'Hobby Shops': 'Entertainment',
    'Fitness': 'Entertainment', 'Sporting Goods': 'Entertainment',
    'Gyms': 'Entertainment', 'Sports Bars': 'Entertainment',
    'Mass Media': 'Entertainment',

    # Financial services
    'Banks': 'Financial Services', 'Financial Services': 'Financial Services',
}
# Keep only the rows whose categories map to a coarse class, then list the classes present.
data = clean_categories(data, map_classes)
set(data['categories'])

{'Entertainment', 'Financial Services', 'Food', 'Health', 'Local Services'}

In [219]:
# Image classifier with default arguments; passed to create_corpus to label page images.
model = VGG16()

In [223]:
# Build an image-label corpus for every 'Food' business; other rows get None
# so that `corpuses` stays aligned with the rows of `data`.
corpuses = []
for _, entry in tqdm(data.iterrows(), total=len(data)):
    # pd.notna is the correct missing-value test. The previous check,
    # `url != np.nan`, used a name that is not defined in this notebook and
    # would be True for every value anyway (NaN never compares equal).
    if pd.notna(entry['url']) and entry['categories'] == 'Food':
        corpuses.append(create_corpus(entry['url'], model))
    else:
        corpuses.append(None)

  "Palette images with Transparency expressed in bytes should be "
5054it [5:12:16,  3.71s/it] 


In [228]:
# Store the corpora next to their rows and check that both lengths agree.
# The second element used to be the bare built-in `len`, which displayed the
# function object instead of a number.
data['corpus_text'] = corpuses
len(corpuses), len(data)

1469

In [225]:
# Inspect the first ten generated corpora.
data['corpus_text'][:10]

0    
1    
2    
3    
4    
5    
6    
7    
8    
9    
Name: corpus_text, dtype: object

In [None]:
# Candidate target classes. Each name is later lowercased and looked up as a
# single word in the GloVe dictionary.
classes = [
 'Education', 
 'Entertainment',
 'Beauty',
 'Bank',
 'Fitness',
 'Food',
 'Health',
 'housework',
 'Religious',
 'Travel']

In [None]:
# Lowercase the class names so they can be used as keys of `word2vec`
# (the GloVe file read below is glove.6B — presumably lowercase-only; verify).
classes = [cls.lower() for cls in classes]
classes

In [None]:
# Load the 300-dimensional GloVe vectors into a {word: float32 vector} dict.
word2vec = {}
# Read explicitly as UTF-8; otherwise open() falls back to the
# locale-dependent default encoding and can fail on non-ASCII tokens.
with open(os.path.join(FOLDER_PATH, 'data', 'glove_data', 'glove.6B.300d.txt'), encoding='utf-8') as file:
    for line in file:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # First field is the word, the remaining fields are its coefficients.
        word2vec[values[0]] = np.asarray(values[1:], dtype='float32')

In [None]:
# NOTE(review): repeats the earlier `model = VGG16()` cell; it rebuilds the same default model.
model = VGG16()

In [None]:
img_labels = []
for image_file in os.listdir(os.path.join(FOLDER_PATH, 'advertiser-quality-from-sites', 'source', 'images')):

In [None]:
# For the first ten image labels, find the class whose GloVe vector is
# closest (cosine distance) to the mean vector of the label's known words.
for _, label in zip(range(10), img_labels):
    label_word2vec = [word2vec[token] for token in label if token in word2vec]
    if not label_word2vec:
        print('The label is not in dictionary')
        continue
    label_word2vec = np.mean(label_word2vec, axis=0)
    dist = [distance.cosine(label_word2vec, word2vec[cls]) for cls in classes]
    best_idx = np.argmin(dist)
    print(label, 'is closest to', classes[best_idx], 'with distance:', dist[best_idx])

In [None]:
# Download every URL in `img_urls` to images/pic<N>.jpg. N only advances on
# success, so the saved files are numbered contiguously from 0.
# NOTE(review): `img_urls` is not defined in any visible cell — confirm where it comes from.
os.makedirs('images', exist_ok=True)  # without the folder every save fails
i = 0
for img_url in img_urls:
    try:
        r = requests.get(img_url, timeout=3)
        # Fail early on 4xx/5xx instead of trying to decode an error page.
        r.raise_for_status()
        file_name = 'images/pic{}.jpg'.format(i)
        img = Image.open(BytesIO(r.content))
        # JPEG cannot store alpha or palette data; normalise every non-RGB mode.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        img.save(file_name, 'JPEG')

        i += 1
    except Exception as e:
        # Best effort: report the failure and move on to the next URL.
        print(e)

In [None]:
# Read back the downloaded images (at most 200) as arrays.
images_rgb = []
for x in range(200):
    file = 'images/pic{}.jpg'.format(x)
    # Files are numbered contiguously, so the first missing one marks the end.
    # Without this guard the cell crashes when fewer than 200 images exist.
    if not os.path.exists(file):
        break
    img = matimg.imread(file)
    images_rgb.append(img)

In [None]:
# Centre-crop or zero-pad every image to H x W and divide the pixel values by 255.
H, W = 224, 224
# The loop used to throw each result away; collect them so later cells can use them.
images_scaled = []
for img_rgb in images_rgb:
    mat = tf.constant(img_rgb)
    mat = tf.image.resize_with_crop_or_pad(mat, H, W)
    mat /= 255.0
    images_scaled.append(mat)

In [None]:
# NOTE(review): third `model = VGG16()` cell in this notebook; it rebuilds the same default model.
model = VGG16()