# Step 1: Install Python packages

In [1]:
import os
import html5lib
import pandas as pd
from selenium import webdriver                   
from selenium.webdriver.common.keys import Keys   
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from datetime import date, timedelta, datetime as dt
from bs4 import BeautifulSoup as bs    

# Step 2: Preparation 

In [2]:
class RemoteDriverStartService():
    options = webdriver.ChromeOptions()
    # Set user app data to a new directory
    options.add_argument("user-data-dir=C:\\Users\\Donley\\App Data\\Google\\Chrome\\Application\\User Data\\Kit")
    options.add_experimental_option("Proxy", "null")
    options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
    # Create a download path for external data sources as default: 
    options.add_experimental_option("prefs", {
      "download.default_directory": r"C:\Users\Donley\Documents\GA_TECH\SUBMISSIONS\PROJECT2-CHALLENGE\data\external",
      "download.prompt_for_download": False,
      "download.directory_upgrade": True,
      "safebrowsing.enabled": True
    }),
    # Add those optional features to capabilities
    caps = options.to_capabilities()  
    def start_driver(self):
        return webdriver.Remote(command_executor='http://127.0.0.1:4444', 
                                desired_capabilities=self.caps)
# Set class equal to new capabilities:
DesiredCapabilities = RemoteDriverStartService()  

In [3]:
# Create variables for scraping: 
investo = "https://www.investopedia.com/top-communications-stocks-4583180"
# Download data to paths, csv's, json, etc: 
    # for external data sources
external = "../data/external/"
    # for processed data sources with ID's
processed = "../data/processed/"

In [4]:
# Locate Driver in system
current_path = os.getcwd()

# save the .exe file under the same directory of the web-scrape python script.
Path = os.path.join(current_path, "chromedriver")

# Initialize Chrome driver and start browser session controlled by automated test software under Kit profile.
caps = webdriver.DesiredCapabilities.CHROME.copy()
caps['acceptInsecureCerts'] = True
# caps = webdriver.DesiredCapabilities.CHROME.copy()
# caps['acceptInsecureCerts'] = True
# driver = webdriver.Chrome(options=options, desired_capabilities=caps)
driver = webdriver.Chrome(executable_path='chromedriver', desired_capabilities=caps)

# Step 3: Find the IDs of the items we want to scrape for

In [5]:
# Start Grabbing Information from investopedia: 
driver.get(investo)
timeout = 30
# Find an ID on the page and wait before executing anything until found: 
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.ID, "main_1-0")))
except TimeoutException:
    driver.quit()

# Step 4: Techniques to make more human-like web-scrapers 

In [6]:
# If the website detects us as a web-scraper, it will cut our connection so we cannot pull more data and have to re-start our scraper. This largely impacts the efficiency of the scraper and involves a lot of manual interference. There are a few techniques we can use to make the scraper more human-like:
# (1) Randomize the sleep time
# This can be easily implemented as below wherever needed:
# #sleep for sometime between 5 and 8 seconds
# time.sleep(random.uniform(5,8))
# (2) Randomize the user agent for the web browser
# This is also easy and can be added to the browser options as below:
# ua = UserAgent()
# userAgent = ua.random
# Firefox_options = webdriver.FirefoxOptions()
# Firefox_options.add_argument(f’user-agent={userAgent}’)
# browser = webdriver.Firefox(executable_path = DRIVER_BIN, options=Firefox_options)
# (3) Use dynamic proxy/IP
# This requires more work than the above two. Usually free proxies are not stable and most of them don’t respond to requests, so we need to first a free proxy that responds to our requests. This website (also named as “url” in the script below) provides a lot of free proxies which we scrape down for our use. We will use Python BeautifulSoup package to scrape a list of proxies, and use Python requests package to test whether the proxy responds to our requests to the link.
# def get_proxy(link):
#     url = "https://www.sslproxies.org/"
#     r = requests.get(url)
#     soup = BeautifulSoup(r.content, 'html5lib')
#     proxies_list = list(map(lambda x: x[0]+':'+x[1], list(zip(map(lambda x: x.text, soup.findAll('td')[::8]), map(lambda x: x.text, soup.findAll('td')[1::8])))))
#     while 1:
#         try:
#             selected_ip = choice(proxies_list)
#             proxy = {'https': selected_ip, 'http': selected_ip}
#             headers = {'User-Agent': ua.random}
#             print('Using proxy:{}'.format(proxy))
#             r = requests.request('get', link, proxies=proxy, headers=headers, timeout=5)
#             break
#         except:
#             pass
        
#     return proxy
# We then add the working proxy to the browser option, similar to how we added the fake user agent:
# link = "https://www.expedia.com"
# proxy = get_proxy(link)
# Firefox_options.add_argument('--proxy-server=%s' % proxy)
# browser = webdriver.Firefox(executable_path = DRIVER_BIN, options=Firefox_options)

# Step 5: The full code that runs the scraper and save the data to .csv files


In [7]:
itable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
itables  = pd.read_html(itable)
communications_bv = itables[0]
communications_bv.columns = ["Communictaions Best Value", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
communications_bv
# Locate column containing ticker symbols: 
communications_bv_df = communications_bv.iloc[1:]
# Only keep tick information within parentheses:
communications_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in communications_bv_df["Communictaions Best Value"]]
communications_bv_ticks

['VIAC', 'LUMN', 'DISCK']

In [8]:
communications_fg = itables[1]
communications_fg.columns = ["Communications Fastest Growing", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
communications_fg_df = communications_fg.iloc[1:]
communications_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in communications_fg_df["Communications Fastest Growing"]]
communications_fg_ticks

['ZM', 'LBRDA', 'JW.A']

In [9]:
communications_mm = itables[2]
communications_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
communications_mm_df = communications_mm.iloc[1:]
communications_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in communications_mm_df["Communications Most Momentum"]]
del communications_mm_ticks[-2:]
communications_mm_ticks

['ZM', 'ZG', 'TWLO']

In [13]:
discretionary = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(3) > a')
discretionary

[<selenium.webdriver.remote.webelement.WebElement (session="cdaa5bf725dc5fd0edd4d1300ca77435", element="efe5094d-466a-4843-bb11-3ec795ff8262")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cdaa5bf725dc5fd0edd4d1300ca77435", element="703e49c5-b873-4044-b929-a98bd45176a4")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cdaa5bf725dc5fd0edd4d1300ca77435", element="1dca930d-159a-4672-877a-3be0275d96ae")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cdaa5bf725dc5fd0edd4d1300ca77435", element="9e687892-7e07-4a4d-ae37-e6ab17dcc6e8")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cdaa5bf725dc5fd0edd4d1300ca77435", element="ce9c1803-3857-40e1-b5eb-e9bf1d69f88a")>]

In [15]:
discretionary[0].click() 

In [16]:
dtable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
dtables  = pd.read_html(dtable)
discretionary_bv = dtables[0]
discretionary_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
discretionary_bv
# Locate column containing ticker symbols: 
discretionary_bv_df = discretionary_bv.iloc[1:]
# Only keep tick information within parentheses:
discretionary_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in discretionary_bv_df["tick"]]
discretionary_bv_ticks

['ARD', 'MGM', 'EBAY']

In [17]:
discretionary_fg = dtables[1]
discretionary_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
discretionary_fg_df = discretionary_fg.iloc[1:]
discretionary_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in discretionary_fg_df["stock"]]
discretionary_fg_ticks

['BERY', 'F', 'ETSY']

In [18]:
discretionary_mm = itables[2]
discretionary_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
discretionary_mm_df = discretionary_mm.iloc[1:]
discretionary_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in discretionary_mm_df["Communications Most Momentum"]]
del discretionary_mm_ticks[-2:]
discretionary_mm_ticks

['ZM', 'ZG', 'TWLO']

In [19]:
staples = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(4) > a')
staples[0].click()

In [20]:
stable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
stables  = pd.read_html(stable)
staples_bv = stables[0]
staples_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
staples_bv
# Locate column containing ticker symbols: 
staples_bv_df = staples_bv.iloc[1:]
# Only keep tick information within parentheses:
staples_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in staples_bv_df["tick"]]
staples_bv_ticks

['ACI', 'CPB', 'KR']

In [21]:
staples_fg = stables[1]
staples_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
staples_fg_df = staples_fg.iloc[1:]
staples_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in staples_fg_df["stock"]]
staples_fg_ticks

['ACI', 'OLLI', 'KR']

In [22]:
staples_mm = stables[2]
staples_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
staples_mm_df = staples_mm.iloc[1:]
staples_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in staples_mm_df["Communications Most Momentum"]]
del staples_mm_ticks[-2:]
staples_mm_ticks

['SAM', 'CHGG', 'TWOU']

In [23]:
energy = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(5) > a')
energy[0].click()

In [24]:
etable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
etables  = pd.read_html(etable)
energy_bv = etables[0]
energy_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
energy_bv
# Locate column containing ticker symbols: 
energy_bv_df = energy_bv.iloc[1:]
# Only keep tick information within parentheses:
energy_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in energy_bv_df["tick"]]
energy_bv_ticks

['MUR', 'LNG', 'VVV']

In [25]:
energy_fg = etables[1]
energy_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
energy_fg_df = energy_fg.iloc[1:]
energy_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in energy_fg_df["stock"]]
energy_fg_ticks

['VLO', 'AM', 'WMB']

In [26]:
energy_mm = etables[2]
energy_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
energy_mm_df = energy_mm.iloc[1:]
energy_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in energy_mm_df["Communications Most Momentum"]]
del energy_mm_ticks[-2:]
energy_mm_ticks

['EQT', 'COG', 'AM']

In [27]:
financial = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(6) > a')
financial[0].click()

In [28]:
ftable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
ftables  = pd.read_html(ftable)
financial_bv = ftables[0]
financial_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
financial_bv
# Locate column containing ticker symbols: 
financial_bv_df = financial_bv.iloc[1:]
# Only keep tick information within parentheses:
financial_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in financial_bv_df["tick"]]
financial_bv_ticks

['BHF', 'UNM', 'MET']

In [29]:
financial_fg = ftables[1]
financial_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
financial_fg_df = financial_fg.iloc[1:]
financial_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in financial_fg_df["stock"]]
financial_fg_ticks

['WTM', 'ARES', 'PNC']

In [30]:
financial_mm = itables[2]
financial_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
financial_mm_df = financial_mm.iloc[1:]
financial_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in financial_mm_df["Communications Most Momentum"]]
del financial_mm_ticks[-2:]
financial_mm_ticks

['ZM', 'ZG', 'TWLO']

In [31]:
healthcare = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(7) > a')
healthcare[0].click()

In [32]:
htable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
htables  = pd.read_html(htable)
healthcare_bv = htables[0]
healthcare_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
healthcare_bv
# Locate column containing ticker symbols: 
healthcare_bv_df = healthcare_bv.iloc[1:]
# Only keep tick information within parentheses:
healthcare_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in healthcare_bv_df["tick"]]
healthcare_bv_ticks

['BIIB', 'BIO', 'CVS']

In [33]:
healthcare_fg = htables[1]
healthcare_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
healthcare_fg_df = healthcare_fg.iloc[1:]
healthcare_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in healthcare_fg_df["stock"]]
healthcare_fg_ticks

['QDEL', 'PODD', 'PRGO']

In [34]:
healthcare_mm = htables[2]
healthcare_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
healthcare_mm_df = healthcare_mm.iloc[1:]
healthcare_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in healthcare_mm_df["Communications Most Momentum"]]
del healthcare_mm_ticks[-2:]
healthcare_mm_ticks

['LVGO', 'IMMU', 'MRNA']

In [35]:
industrial = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(8) > a')
industrial[0].click()

In [37]:
intable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
intables  = pd.read_html(intable)
industrial_bv = intables[0]
industrial_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
industrial_bv
# Locate column containing ticker symbols: 
industrial_bv_df = industrial_bv.iloc[1:]
# Only keep tick information within parentheses:
industrial_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in industrial_bv_df["tick"]]
industrial_bv_ticks

['EAF', 'AL', 'HII']

In [38]:
industrial_fg = intables[1]
industrial_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
industrial_fg_df = industrial_fg.iloc[1:]
industrial_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in industrial_fg_df["stock"]]
industrial_fg_ticks

['ENR', 'PWR', 'VRT']

In [39]:
industrial_mm = intables[2]
industrial_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
industrial_mm_df = industrial_mm.iloc[1:]
industrial_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in industrial_mm_df["Communications Most Momentum"]]
del industrial_mm_ticks[-2:]
industrial_mm_ticks

['GNRC', 'FDX', 'VRT']

In [40]:
materials = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(9) > a')
materials[0].click()

In [41]:
motable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
motables  = pd.read_html(motable)
materials_bv = motables[0]
materials_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
materials_bv
# Locate column containing ticker symbols: 
materials_bv_df = discretionary_bv.iloc[1:]
# Only keep tick information within parentheses:
materials_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in materials_bv_df["tick"]]
materials_bv_ticks

['ARD', 'MGM', 'EBAY']

In [42]:
materials_fg = motables[1]
materials_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
materials_fg_df = materials_fg.iloc[1:]
materials_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in materials_fg_df["stock"]]
materials_fg_ticks

['EXP', 'RGLD', 'MDU']

In [43]:
materials_mm = motables[2]
materials_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
materials_mm_df = materials_mm.iloc[1:]
materials_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in materials_mm_df["Communications Most Momentum"]]
del materials_mm_ticks[-2:]
materials_mm_ticks

['NEM', 'FCX', 'SMG']

In [44]:
real_estate = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(10) > a')
real_estate[0].click()

In [45]:
retable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
retables  = pd.read_html(retable)
real_estate_bv = retables[0]
real_estate_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
real_estate_bv
# Locate column containing ticker symbols: 
real_estate_bv_df = real_estate_bv.iloc[1:]
# Only keep tick information within parentheses:
real_estate_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in real_estate_bv_df["tick"]]
real_estate_bv_ticks

['BPYU', 'KIM', 'EQC']

In [46]:
real_estate_fg = retables[1]
real_estate_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
real_estate_fg_df = real_estate_fg.iloc[1:]
real_estate_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in real_estate_fg_df["stock"]]
real_estate_fg_ticks

['KIM', 'COLD', 'ARE']

In [47]:
real_estate_mm = retables[2]
real_estate_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
real_estate_mm_df = real_estate_mm.iloc[1:]
real_estate_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in real_estate_mm_df["Communications Most Momentum"]]
del real_estate_mm_ticks[-2:]
real_estate_mm_ticks

['CSGP', 'EQIX', 'SBAC']

In [48]:
tech = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(11) > a')
tech[0].click()

In [49]:
tetable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
tetables  = pd.read_html(tetable)
tech_bv = tetables[0]
tech_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
tech_bv
# Locate column containing ticker symbols: 
tech_bv_df = tech_bv.iloc[1:]
# Only keep tick information within parentheses:
tech_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in tech_bv_df["tick"]]
tech_bv_ticks

['BPYU', 'KIM', 'EQC']

In [50]:
tech_fg = tetables[1]
tech_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
tech_fg_df = tech_fg.iloc[1:]
tech_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in tech_fg_df["stock"]]
tech_fg_ticks

['KIM', 'COLD', 'ARE']

In [51]:
tech_mm = tetables[2]
tech_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
tech_mm_df = discretionary_mm.iloc[1:]
tech_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in tech_mm_df["Communications Most Momentum"]]
del tech_mm_ticks[-2:]
tech_mm_ticks

['ZM', 'ZG', 'TWLO']

In [52]:
utilities = driver.find_elements(By.CSS_SELECTOR, '#journey-nav__sublist_1-0 > li:nth-child(12) > a')
utilities[0].click()

In [53]:
utable = driver.find_element_by_id("main_1-0").get_attribute('outerHTML')
utables  = pd.read_html(utable)
utilities_bv = utables[0]
utilities_bv.columns = ["tick", "Price", "Market Cap", "12-Month Trailing P/E Ratio"]
utilities_bv
# Locate column containing ticker symbols: 
utilities_bv_df = utilities_bv.iloc[1:]
# Only keep tick information within parentheses:
utilities_bv_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in utilities_bv_df["tick"]]
utilities_bv_ticks

['NLOK', 'XRX', 'NCR']

In [54]:
utilities_fg = utables[1]
utilities_fg.columns = ["stock", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
utilities_fg_df = utilities_fg.iloc[1:]
utilities_fg_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in utilities_fg_df["stock"]]
utilities_fg_ticks

['CRM', 'WEX', 'NLOK']

In [55]:
utilities_mm = utables[2]
utilities_mm.columns = ["Communications Most Momentum", "Price", "Market Cap", "12-Month Trailing Total Return (%)"]
utilities_mm_df = utilities_mm.iloc[1:]
utilities_mm_ticks = [tick[tick.find("(")+1:tick.find(")")] for tick in utilities_mm_df["Communications Most Momentum"]]
del utilities_mm_ticks[-2:]
utilities_mm_ticks

['FSLY', 'DOCU', 'ENPH']

In [56]:
lists=[communications_bv_ticks,communications_fg_ticks,communications_mm_ticks, discretionary_bv_ticks,discretionary_fg_ticks,discretionary_mm_ticks,staples_bv_ticks,staples_fg_ticks,staples_mm_ticks,energy_bv_ticks,energy_fg_ticks,energy_mm_ticks, financial_bv_ticks,financial_fg_ticks,financial_mm_ticks,healthcare_bv_ticks,healthcare_fg_ticks,healthcare_mm_ticks,industrial_bv_ticks,industrial_fg_ticks,industrial_mm_ticks,tech_bv_ticks,tech_fg_ticks,tech_mm_ticks,materials_bv_ticks,materials_fg_ticks,materials_mm_ticks,real_estate_bv_ticks,real_estate_fg_ticks,real_estate_mm_ticks,utilities_bv_ticks,utilities_fg_ticks,utilities_mm_ticks]
stock_list = [item for sublist in lists for item in sublist]
lists

[['VIAC', 'LUMN', 'DISCK'],
 ['ZM', 'LBRDA', 'JW.A'],
 ['ZM', 'ZG', 'TWLO'],
 ['ARD', 'MGM', 'EBAY'],
 ['BERY', 'F', 'ETSY'],
 ['ZM', 'ZG', 'TWLO'],
 ['ACI', 'CPB', 'KR'],
 ['ACI', 'OLLI', 'KR'],
 ['SAM', 'CHGG', 'TWOU'],
 ['MUR', 'LNG', 'VVV'],
 ['VLO', 'AM', 'WMB'],
 ['EQT', 'COG', 'AM'],
 ['BHF', 'UNM', 'MET'],
 ['WTM', 'ARES', 'PNC'],
 ['ZM', 'ZG', 'TWLO'],
 ['BIIB', 'BIO', 'CVS'],
 ['QDEL', 'PODD', 'PRGO'],
 ['LVGO', 'IMMU', 'MRNA'],
 ['EAF', 'AL', 'HII'],
 ['ENR', 'PWR', 'VRT'],
 ['GNRC', 'FDX', 'VRT'],
 ['BPYU', 'KIM', 'EQC'],
 ['KIM', 'COLD', 'ARE'],
 ['ZM', 'ZG', 'TWLO'],
 ['ARD', 'MGM', 'EBAY'],
 ['EXP', 'RGLD', 'MDU'],
 ['NEM', 'FCX', 'SMG'],
 ['BPYU', 'KIM', 'EQC'],
 ['KIM', 'COLD', 'ARE'],
 ['CSGP', 'EQIX', 'SBAC'],
 ['NLOK', 'XRX', 'NCR'],
 ['CRM', 'WEX', 'NLOK'],
 ['FSLY', 'DOCU', 'ENPH']]

In [61]:
sp500_df=pd.read_csv('../data/external/sp500.csv')
sector_l=sp500_df["S&P 500 & Sectors"]
sector_l

0                    S&P 500
1     Communication Services
2     Consumer Discretionary
3           Consumer Staples
4                     Energy
5                 Financials
6                Health Care
7                Industrials
8     Information Technology
9                  Materials
10               Real Estate
11                 Utilities
Name: S&P 500 & Sectors, dtype: object

In [62]:
new_sector_df=pd.DataFrame()
new_sector_df["ids"]=sector_l.drop(sector_df.index[0])
new_sector_df["labels"]=sector_l.drop(sector_df.index[0])
new_sector_df

Unnamed: 0,ids,labels
1,Communication Services,Communication Services
2,Consumer Discretionary,Consumer Discretionary
3,Consumer Staples,Consumer Staples
4,Energy,Energy
5,Financials,Financials
6,Health Care,Health Care
7,Industrials,Industrials
8,Information Technology,Information Technology
9,Materials,Materials
10,Real Estate,Real Estate


In [63]:
from itertools import cycle
import numpy as np 

In [64]:
perf_df= pd.DataFrame(np.arange(1,34), columns=['ids'])
seq = cycle(['Best Value','Fastest Growth','Most Momentum'])
perf_df['labels'] = [next(seq) for count in range(perf_df.shape[0])]
perf_df

Unnamed: 0,ids,labels
0,1,Best Value
1,2,Fastest Growth
2,3,Most Momentum
3,4,Best Value
4,5,Fastest Growth
5,6,Most Momentum
6,7,Best Value
7,8,Fastest Growth
8,9,Most Momentum
9,10,Best Value


In [65]:
lists=[]
for i in range(len(sector_l)):
    
    
    seq = cycle([sector_l[i]])
    lists.append([next(seq) for count in range(3)])

combined = [item for sublist in lists for item in sublist]
perf_df['ids']=combined 
perf_df['parents']=combined
perf_df['ids']=perf_df['ids'] + '-' + perf_df['labels']
perf_df

ValueError: Length of values (36) does not match length of index (33)

In [None]:
stocks_df=pd.DataFrame()
stocks_df['parents']=perf_df['parents'] + '-' + perf_df['labels']
stocks_df

In [66]:
driver.quit()