# Step 1: Import Python packages

In [1]:
import os
import html5lib
import pandas as pd
from selenium import webdriver                   
from selenium.webdriver.common.keys import Keys   
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from datetime import date, timedelta, datetime as dt
from bs4 import BeautifulSoup as bs    

# Step 2: Preparation 

In [2]:
class RemoteDriverStartService():
    """Chrome configuration for a remote WebDriver session.

    The Chrome options are built once, while the class body executes, and are
    exposed as the class attributes ``options`` and ``caps`` shared by every
    instance.
    """

    options = webdriver.ChromeOptions()
    # Point Chrome at a dedicated user-data directory (the "Kit" profile).
    options.add_argument("user-data-dir=C:\\Users\\Donley\\App Data\\Google\\Chrome\\Application\\User Data\\Kit")
    # NOTE(review): "Proxy" is forwarded as an experimental option exactly as
    # written; confirm the driver accepts this key.
    options.add_experimental_option("Proxy", "null")
    options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
    # Download preferences: save files to the project's data/external folder
    # without showing a prompt.
    options.add_experimental_option("prefs", {
        "download.default_directory": r"C:\Users\Donley\Documents\GA_TECH\SUBMISSIONS\PROJECT2-CHALLENGE\data\external",
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    })
    # Convert the options into the capabilities dict handed to the driver.
    caps = options.to_capabilities()

    def start_driver(self):
        """Open a remote session using the class-level capabilities.

        Returns:
            A ``webdriver.Remote`` instance connected to the WebDriver server
            at ``http://127.0.0.1:4444``.
        """
        return webdriver.Remote(command_executor='http://127.0.0.1:4444',
                                desired_capabilities=self.caps)
# Create an instance of the service class defined above.
# NOTE(review): this rebinds the name `DesiredCapabilities`, which was imported
# from selenium at the top of the file, to a RemoteDriverStartService instance.
DesiredCapabilities = RemoteDriverStartService()  

In [3]:
# Page to scrape: Investopedia's "top communications stocks" article.
investo = "https://www.investopedia.com/top-communications-stocks-4583180"

# Relative output folders for saved files (csv, json, ...):
#   external  -> external data sources
#   processed -> processed data sources with IDs
external, processed = "../data/external/", "../data/processed/"

In [7]:
# Build the path of a "chromedriver" binary located in the current working
# directory (the driver executable is expected to sit next to this script).
current_path = os.getcwd()
Path = os.path.join(current_path, "chromedriver")
# NOTE(review): `Path` is computed but the driver below is started with the
# bare name 'chromedriver' -- confirm which of the two is intended.

# Chrome's default capabilities, with insecure certificates accepted.
caps = dict(webdriver.DesiredCapabilities.CHROME, acceptInsecureCerts=True)

# Start a local Chrome session driven by chromedriver.
driver = webdriver.Chrome(executable_path='chromedriver', desired_capabilities=caps)

# Step 3: Find the IDs of the items we want to scrape for

In [9]:
# Open the Investopedia page and block until its main container is visible.
driver.get(investo)
timeout = 30  # seconds to wait for the container before giving up
try:
    WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.ID, "main_1-0"))
    )
except TimeoutException:
    # Close the browser, then re-raise: once the session has been quit, the
    # remaining cells cannot use `driver`, so the failure should surface here
    # instead of being swallowed.
    driver.quit()
    raise

# Step 4: Techniques to make more human-like web-scrapers 

In [None]:
# If the website detects us as a web-scraper, it will cut our connection so we cannot pull more data and have to re-start our scraper. This largely impacts the efficiency of the scraper and involves a lot of manual interference. There are a few techniques we can use to make the scraper more human-like:
# (1) Randomize the sleep time
# This can be easily implemented as below wherever needed:
# #sleep for sometime between 5 and 8 seconds
# time.sleep(random.uniform(5,8))
# (2) Randomize the user agent for the web browser
# This is also easy and can be added to the browser options as below:
# ua = UserAgent()
# userAgent = ua.random
# Firefox_options = webdriver.FirefoxOptions()
# Firefox_options.add_argument(f'user-agent={userAgent}')
# browser = webdriver.Firefox(executable_path = DRIVER_BIN, options=Firefox_options)
# (3) Use dynamic proxy/IP
# This requires more work than the above two. Usually free proxies are not stable and most of them don’t respond to requests, so we first need to find a free proxy that responds to our requests. This website (also named as “url” in the script below) provides a lot of free proxies which we scrape down for our use. We will use the Python BeautifulSoup package to scrape a list of proxies, and use the Python requests package to test whether the proxy responds to our requests to the link.
# def get_proxy(link):
#     url = "https://www.sslproxies.org/"
#     r = requests.get(url)
#     soup = BeautifulSoup(r.content, 'html5lib')
#     proxies_list = list(map(lambda x: x[0]+':'+x[1], list(zip(map(lambda x: x.text, soup.findAll('td')[::8]), map(lambda x: x.text, soup.findAll('td')[1::8])))))
#     while 1:
#         try:
#             selected_ip = choice(proxies_list)
#             proxy = {'https': selected_ip, 'http': selected_ip}
#             headers = {'User-Agent': ua.random}
#             print('Using proxy:{}'.format(proxy))
#             r = requests.request('get', link, proxies=proxy, headers=headers, timeout=5)
#             break
#         except:
#             pass
        
#     return proxy
# We then add the working proxy to the browser option, similar to how we added the fake user agent:
# link = "https://www.expedia.com"
# proxy = get_proxy(link)
# Firefox_options.add_argument('--proxy-server=%s' % proxy)
# browser = webdriver.Firefox(executable_path = DRIVER_BIN, options=Firefox_options)

# Step 5: The full code that runs the scraper and saves the data to .csv files


In [72]:
# Take the HTML of the page's main container and parse every table inside it.
# The element is located with find_element(By.ID, ...), the same locator style
# used by the wait and the sector lookup elsewhere in this notebook.
itable = driver.find_element(By.ID, "main_1-0").get_attribute('outerHTML')
itables  = pd.read_html(itable)

# First table on the page: the "best value" stocks.
communications_bv = itables[0]
communications_bv.columns = ["Communictaions Best Value", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
# Skip the first parsed row (presumably the table's own header row -- TODO confirm).
communications_bv_df = communications_bv.iloc[1:]
# Keep only the text between the parentheses, i.e. the ticker symbol.
communications_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in communications_bv_df["Communictaions Best Value"]]
communications_bv_ticks

['ARD', 'MGM', 'EBAY']

In [73]:
# Second table on the page: the "fastest growing" stocks.
communications_fg = itables[1]
communications_fg.columns = [
    "Communications Fastest Growing",
    "Price",
    "Market Cap",
    "12-Month Trailing Total Return (%)",
]
# Everything after the first parsed row.
communications_fg_df = communications_fg.iloc[1:]
# Ticker symbol = the text between "(" and ")" in each stock label.
communications_fg_ticks = [
    label[label.find("(") + 1:label.find(")")]
    for label in communications_fg_df["Communications Fastest Growing"]
]
communications_fg_ticks

['BERY', 'F', 'ETSY']

In [74]:
# Third table on the page: the "most momentum" stocks.
communications_mm = itables[2]
communications_mm.columns = [
    "Communications Most Momentum",
    "Price",
    "Market Cap",
    "12-Month Trailing Total Return (%)",
]
# Everything after the first parsed row.
communications_mm_df = communications_mm.iloc[1:]
# Extract the text between "(" and ")" from each label, then leave out the
# last two entries of the resulting list.
communications_mm_ticks = [
    label[label.find("(") + 1:label.find(")")]
    for label in communications_mm_df["Communications Most Momentum"]
][:-2]
communications_mm_ticks

['TSLA', 'CVNA', 'W']

In [75]:
# Collect every element matching the third <li> of the journey-nav sublist.
sectors = driver.find_elements(
    by=By.CSS_SELECTOR,
    value='#journey-nav__sublist_1-0 > li:nth-child(3)',
)

In [76]:
# Click the second element matched by the sector selector.
# NOTE(review): the recorded run of this cell raised
# ElementNotInteractableException, so the navigation presumably never
# happened -- confirm before relying on the tables scraped afterwards.
sectors[1].click()

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=85.0.4183.121)


In [77]:
# Re-read the page's main container and parse every table inside it.
# The element is located with find_element(By.ID, ...), the same locator style
# used by the wait and the sector lookup elsewhere in this notebook.
# NOTE(review): assumes the browser is now showing the discretionary sector
# page -- confirm the preceding click actually navigated.
dtable = driver.find_element(By.ID, "main_1-0").get_attribute('outerHTML')
dtables  = pd.read_html(dtable)

# First table on the page: the "best value" stocks.
discretionary_bv = dtables[0]
discretionary_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
# Skip the first parsed row (presumably the table's own header row -- TODO confirm).
discretionary_bv_df = discretionary_bv.iloc[1:]
# Keep only the text between the parentheses, i.e. the ticker symbol.
discretionary_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in discretionary_bv_df["tick"]]
discretionary_bv_ticks

['ARD', 'MGM', 'EBAY']

In [78]:
# Second table of the discretionary scrape: the "fastest growing" stocks.
discretionary_fg = dtables[1]
discretionary_fg.columns = [
    "stock",
    "Price",
    "Market Cap",
    "12-Month Trailing Total Return (%)",
]
# Everything after the first parsed row.
discretionary_fg_df = discretionary_fg.iloc[1:]
# Ticker symbol = the text between "(" and ")" in each stock label.
discretionary_fg_ticks = [
    label[label.find("(") + 1:label.find(")")]
    for label in discretionary_fg_df["stock"]
]
discretionary_fg_ticks

['BERY', 'F', 'ETSY']

In [79]:
# Third table of the discretionary scrape: the "most momentum" stocks.
# Read from dtables, like the two discretionary cells before this one, rather
# than from itables (the tables parsed for the communications sector).
discretionary_mm = dtables[2]
# NOTE(review): the first column keeps the label "Communications Most Momentum"
# so any existing lookup by that name still works; consider renaming it.
discretionary_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
# Skip the first parsed row (presumably the table's own header row -- TODO confirm).
discretionary_mm_df = discretionary_mm.iloc[1:]
# Keep only the text between the parentheses, i.e. the ticker symbol.
discretionary_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in discretionary_mm_df["Communications Most Momentum"]]
# Drop the last two entries (presumably trailing non-ticker rows -- TODO confirm).
del discretionary_mm_ticks[-2:]
discretionary_mm_ticks

['TSLA', 'CVNA', 'W']

In [80]:
# Click the third element matched by the sector selector.
# NOTE(review): the recorded run of this cell raised
# ElementNotInteractableException -- confirm the element can be clicked.
sectors[2].click()

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=85.0.4183.121)


In [81]:
# End the WebDriver session now that scraping is finished.
driver.quit()