In [1]:
%load_ext autoreload
%autoreload 2

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from lxml import etree
import os
import re
import time
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 1000)

## Load the NLB methods that I already have
os.chdir(os.environ['cred_folder'])
from web_scraping import *

os.chdir('/Users/cliff/main/sides/nlb/book_price')

### Testing Amazon 

In [2]:
def title_to_url_suffix(book_title: str):
    """ Converting normal title text into url suffix """
    return re.sub(r"[!,*)@#%(&$_?.^'-]", '', book_title).lower().replace(" ", "+")

In [3]:
title_to_url_suffix("Be So Good They Cant Ignore You")

'be+so+good+they+cant+ignore+you'

In [4]:
title_to_url_suffix("Be So Good They Can't Ignore You")

'be+so+good+they+cant+ignore+you'

In [5]:
def title_to_full_url(title: str, base_url: str='https://www.amazon.sg/'):
    query = "s?k=" + title_to_url_suffix(title)
    return base_url + query + "+paperback" 

In [6]:
title_to_full_url('obstacle is the way')

'https://www.amazon.sg/s?k=obstacle+is+the+way+paperback'

In [7]:
title_to_full_url("Be So Good They Cant Ignore You")

'https://www.amazon.sg/s?k=be+so+good+they+cant+ignore+you+paperback'

In [8]:
title_to_full_url("Be So Good They Can't Ignore You")

'https://www.amazon.sg/s?k=be+so+good+they+cant+ignore+you+paperback'

### Load Book Lists 

In [9]:
full_book_list = open('/Users/cliff/main/sbrain/book_list_clean.md', 'r').readlines()
full_book_list = [title.replace('\n', '') for title in full_book_list]
len(full_book_list)

247

### Setting up the selenium 

In [None]:
# for i in tqdm(range(0,250,5)):
# for i in tqdm(range(0,10,5)):

for i in tqdm(range(10,255,5)):
    try:
        book_lists = full_book_list[i:i+5]
        title_dict = dict()
        browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)
        
        for title in book_lists:
            url_query = title_to_full_url(title)
            browser.get(url_query)
            soup = bs(browser.page_source, 'html5lib')

            first_price = soup.find(class_ = 'a-offscreen').text
            final_first_price = float(first_price.replace('S$', ''))
            scrapped_title = soup.find('h2', 'a-size-mini a-spacing-none a-color-base s-line-clamp-4').text.strip()
            title_dict[scrapped_title] = final_first_price
            time.sleep(3)

        ## Saving files locally
        title_prices = pd.DataFrame.from_dict(title_dict, orient='index').reset_index()
        title_prices['title'] = book_lists
        title_prices.columns = ['scraped_title', 'price', 'given_title']
        title_prices[['given_title', 'scraped_title', 'price']].to_csv(f"book_list_{i}.csv", index=False)

        browser.close()
        time.sleep(5)
    except:
        browser.close()
        pass

  0%|                                                    | 0/49 [00:00<?, ?it/s]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  2%|▉                                           | 1/49 [00:34<27:35, 34.48s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  4%|█▊                                          | 2/49 [01:05<25:21, 32.36s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  6%|██▋                                         | 3/49 [01:41<26:18, 34.31s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver v

### Check locally saved files

In [None]:
from glob import glob
saved_book_list = glob("book_list*.csv")
len(saved_book_list)

In [None]:
df = pd.DataFrame()
for file in saved_book_list:
    df = df.append(pd.read_csv(file))
df.reset_index(drop=True, inplace=True)

In [None]:
df.shape

In [None]:
df

### Summary Statistics 

In [None]:
print(f"Total books: {title_prices.price.count()}")
print(f"Total book costs: SGD {title_prices.price.sum()}")
print(f"Average book costs: SGD {round(title_prices.price.sum() / title_prices.price.count(), 2)}")

### Given Title to Scraped Title Quality Check 

In [None]:
# !python -m spacy download en

In [None]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
[nlp(i.split(":")[0]).similarity(nlp(j.split(':')[0])) for i,j in zip(title_prices.given_title, title_prices.scraped_title)]