In [25]:
%load_ext autoreload
%autoreload 2

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from lxml import etree
import os
import re
import time
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

## Load the NLB methods that I already have
os.chdir(os.environ['cred_folder'])
from web_scraping import *

os.chdir('/Users/cliff/main/sides/nlb/book_price')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Testing Amazon 

In [2]:
def title_to_url_suffix(book_title: str):
    """ Converting normal title text into url suffix """
    return re.sub(r"[!,*)@#%(&$_?.^'-]", '', book_title).lower().replace(" ", "+")

In [3]:
title_to_url_suffix("Be So Good They Cant Ignore You")

'be+so+good+they+cant+ignore+you'

In [4]:
title_to_url_suffix("Be So Good They Can't Ignore You")

'be+so+good+they+cant+ignore+you'

In [5]:
def title_to_full_url(title: str, base_url: str='https://www.amazon.sg/'):
    query = "s?k=" + title_to_url_suffix(title)
    return base_url + query + "+paperback" 

In [6]:
title_to_full_url('obstacle is the way')

'https://www.amazon.sg/s?k=obstacle+is+the+way+paperback'

In [7]:
title_to_full_url("Be So Good They Cant Ignore You")

'https://www.amazon.sg/s?k=be+so+good+they+cant+ignore+you+paperback'

In [8]:
title_to_full_url("Be So Good They Can't Ignore You")

'https://www.amazon.sg/s?k=be+so+good+they+cant+ignore+you+paperback'

### Load Book Lists 

In [9]:
full_book_list = open('/Users/cliff/main/sbrain/book_list_clean.md', 'r').readlines()
full_book_list = [title.replace('\n', '') for title in full_book_list]
len(full_book_list)

247

### Setting up the selenium 

In [10]:
# for i in tqdm(range(0,250,5)):
for i in tqdm(range(0,10,5)):
# for i in tqdm(range(10,255,5)):
    try:
        book_lists = full_book_list[i:i+5]
        title_dict = dict()
        browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)
        
        for title in book_lists:
            url_query = title_to_full_url(title)
            # If title equals to these, remove paperback
            if title in paperback_exclusions:
                url_query = "+".join(url_query.split("+")[:-1])
            
            browser.get(url_query)
            soup = bs(browser.page_source, 'html5lib')

            first_price = soup.find(class_ = 'a-offscreen').text
            final_first_price = float(first_price.replace('S$', ''))
            scrapped_title = soup.find('h2', 'a-size-mini a-spacing-none a-color-base s-line-clamp-4').text.strip()
            title_dict[scrapped_title] = final_first_price
            time.sleep(3)

        ## Saving files locally
        title_prices = pd.DataFrame.from_dict(title_dict, orient='index').reset_index()
        title_prices['title'] = book_lists
        title_prices.columns = ['scraped_title', 'price', 'given_title']
        title_prices[['given_title', 'scraped_title', 'price']].to_csv(f"book_list_{i}.csv", index=False)

        browser.close()
        time.sleep(5)
    except:
        browser.close()
        pass

  0%|                                                    | 0/49 [00:00<?, ?it/s]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  2%|▉                                           | 1/49 [00:34<27:35, 34.48s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  4%|█▊                                          | 2/49 [01:05<25:21, 32.36s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache
  6%|██▋                                         | 3/49 [01:41<26:18, 34.31s/it]

Current google-chrome version is 104.0.5112
Get LATEST chromedriver v

### Check locally saved files

In [11]:
from glob import glob
saved_book_list = glob("book_list*.csv")
len(saved_book_list)

50

In [40]:
df = pd.DataFrame()
for file in saved_book_list:
    tmp_table = pd.read_csv(file)
    tmp_table['i'] = file.split("_")[-1].split('.')[0]
    df = df.append(tmp_table)
df.reset_index(drop=True, inplace=True)

In [41]:
df.shape

(247, 4)

In [42]:
df.head()

Unnamed: 0,given_title,scraped_title,price,i
0,Marketing Automation for Dummies,MARKETING AUTOMATION FOR DUMMIES [Paperback] MATHEW SWEEZEY,19.82,115
1,Who Gets What - And Why?,"From Strength to Strength: Finding Success, Happiness, and Deep Purpose in the Second Half of Life",29.96,115
2,The Effective Engineer,The Six Habits of Highly Effective Sales Engineers,30.41,115
3,Barking Up The Wrong Tree,Barking Up the Wrong Tree: The Surprising Science Behind Why Everything You Know About Success Is (Mostly) Wrong,30.98,115
4,Cyber Crime & Warfare,Sandworm: A New Era of Cyberwar and the Hunt for the Kremlin's Most Dangerous Hackers,21.35,115


### Summary Statistics 

In [34]:
print(f"Total books: {df.price.count()}")
print(f"Total book costs: SGD {df.price.sum()}")
print(f"Average book costs: SGD {round(df.price.sum() / df.price.count(), 2)}")

Total books: 247
Total book costs: SGD 8536.2
Average book costs: SGD 34.56


### Given Title to Scraped Title Quality Check 

In [None]:
# !python -m spacy download en

In [35]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [44]:
df['title_check'] = [nlp(i.split(":")[0]).similarity(nlp(j.split(':')[0])) for i,j in zip(df.given_title, df.scraped_title)]

In [45]:
df[df['title_check'] < .7].shape

(120, 5)

In [51]:
to_check = df[df['title_check'] < .7].groupby('i')['title_check'].count().reset_index()
to_check['i'] = to_check['i'].astype(int)
to_check.sort_values('i').reset_index(drop=True).head()

Unnamed: 0,i,title_check
0,0,1
1,5,2
2,10,1
3,15,2
4,20,3


In [None]:
paperback_exclusions = ["1337 h4x0r H4ndb00k", 
                        "The Strategist The Life and Times of Thomas Schelling", 
                        "big data understanding how data power big business",
                        "23 things they dont tell you about capitalism", 
                        "Super Crunchers", "EconoMyths", "How Pleasure Works", 
                        "The Tinkerers: The Amateurs, DIYers, and Inventors Who Make America Great", 
                        "Rebel Buddha", "automate this how algorithms came to rule our world",
                        "How to read a Modern Painting", "The Art of Lean Software Development",
                        "The Secret of Competitive Intelligence", "Think like Zuck", "The Clean Coder",
                        "The Lazy Winner", "The Rules of Work", "Data Scientist at Work", 
                        "The happiness code ten keys to being the best you can be", 
                        "The Digital Marketer: 10 New skills you must learn to stay relevant and customer-centric",
                        "Data Mining for Managers", "Analytics at Work", "Data-driven Marketing for Dummies", 
                        "Bad Data Handbook", "Innovation: How to Produce Creative and Useful Scientific Ideas",
                        "Web Scraping with Python", "The Big Data-Driven Business", 
                        "Big Data: Understanding How Data Powers Big Business", "Yoga Wisdom at Work", 
                        "How to Have a Good Day", "Marketing Automation for Dummies", "Cyber Crime & Warfare", 
                        "Lean Analytics", "Making Habits, Breaking Habits", "Seven steps to train your mind", 
                        "Smart Change", "Grit to Great", "he Success Equation", "Rise of the Platform Marketer", 
                        "The Mathematical Corporation: Where Machine Intelligence and Human Ingenuity Achieve the Impossible"
                        
                       ]

In [87]:
df[(df.i == '145') & (df.title_check < .7)]

Unnamed: 0,given_title,scraped_title,price,i,title_check
171,The Mathematical Corporation,Corporate Financial Reporting and Analysis: A Global Perspective,88.87,145,0.622813
172,Rise of the Platform Marketer,The Boy in the Striped Pajamas,16.26,145,0.68067
173,Does it Work? 10 Principles for Delivering True Business Value in Digital Marketing,Tiffen 82mm Variable ND Filter,238.38,145,0.561008


### Something to do with `paperback` search

In [47]:
df[df['scraped_title'] == "The Boy in the Striped Pajamas"]

Unnamed: 0,given_title,scraped_title,price,i,title_check
12,Smartworlds,The Boy in the Striped Pajamas,16.26,35,0.124945
35,EconoMyths,The Boy in the Striped Pajamas,16.26,25,0.274736
72,Innovation: How to Produce Creative and Useful Scientific Ideas,The Boy in the Striped Pajamas,16.26,95,0.1091
84,The Fuzzy and the Techie,The Boy in the Striped Pajamas,16.26,160,0.65954
124,Getting Started with D3,The Boy in the Striped Pajamas,16.26,210,0.363198
172,Rise of the Platform Marketer,The Boy in the Striped Pajamas,16.26,145,0.68067
178,Engaging Customers Using Big Data,The Boy in the Striped Pajamas,16.26,150,0.518202
201,CSS-Refactoring,The Boy in the Striped Pajamas,16.26,225,0.319084
205,1337 h4x0r H4ndb00k,The Boy in the Striped Pajamas,16.26,70,-0.04025
235,The Strategist The Life and Times of Thomas Schelling,The Boy in the Striped Pajamas,16.26,10,0.702106
