In [1]:
from dotenv import load_dotenv
load_dotenv()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import datetime
import pickle, json, time, os
import regex as re
import pandas as pd

In [2]:
def gen_searches():
    
    #source : rst
    #company: fds
    
    sources = ['rst=sfusat', 'rst=sfwsj', 'rst=sfnyt', 'rst=sfglob', '(rst=ftft or rst=ftcom)']
    companies = ['applc']

    return [f'{source} and fds={company}' for source in sources for company in companies]

In [3]:
def enter_search(driver, wait, dates, year, search):

    wait.until(EC.element_to_be_clickable((By.XPATH, '//select[@name="dr"]'))).click()
    wait.until(EC.element_to_be_clickable((By.XPATH, '//option[@value="Custom"]'))).click()
    driver.find_element(By.ID, 'frm').clear()
    driver.find_element(By.ID, 'frm').send_keys('01')
    driver.find_element(By.ID, 'frd').clear()
    driver.find_element(By.ID, 'frd').send_keys('01')
    driver.find_element(By.ID, 'fry').clear()
    driver.find_element(By.ID, 'fry').send_keys(year)
    driver.find_element(By.ID, 'tom').clear()
    driver.find_element(By.ID, 'tom').send_keys(dates[0])
    driver.find_element(By.ID, 'tod').clear()
    driver.find_element(By.ID, 'tod').send_keys(dates[1])
    driver.find_element(By.ID, 'toy').clear()
    driver.find_element(By.ID, 'toy').send_keys(year)
    driver.find_element(By.XPATH, '//select[@name="isrd"]').click()
    wait.until(EC.element_to_be_clickable((By.XPATH, '//option[@value="High"]'))).click()
    search_box = driver.find_element(By.XPATH, '//textarea[@name="ftx"]')
    search_box.clear()
    search_box.send_keys(search)
    try:
        driver.find_element(By.XPATH, '//div[@class="pillNoMenu"]').click()
    except NoSuchElementException:
        pass
    search_box.send_keys(Keys.ENTER)
    wait.until(EC.visibility_of_element_located((By.XPATH, '//span[@data-channel="Dowjones"]')))

In [4]:
def get_page_info(soup, counter):
    
    article_links = []
    map_dict = {
        'January': '1',
        'February': '1',
        'March': '1',
        'April': '2',
        'May': '2',
        'June': '2',
        'July': '3',
        'August': '3',
        'September': '3',
        'October': '4',
        'November': '4',
        'December': '4',
    }
    
    for headline in soup.find_all('tr', {'class': 'headline'}):
            
        article_links.append(headline.find('a').get('href'))
        sub_list = sum([subtitle.split(' ') for subtitle in headline.find('div').text.split(',') if len(re.findall('[0-9]+', subtitle)) > 0], [])
        month = ''.join(set(sub_list) & set(map_dict.keys()))
        counter[map_dict[month]] += 1
        
    return counter, article_links

In [5]:
def get_all_text(eid_username, eid_password, search_function, first_login):
    
    """
    Argument explanations:
    
    1. eid_username
    
        - Should be your UT EID username that you use to login to Canvas
    
    2. eid_password
    
        - Should be your password to login to Canvas
        - I recommend you putting the password in a .env file and using the following format for the password argument:
        os.getenv("password_variable_name") where password variable name is a name you choose to define your password variable
        inside of the .env folder.
        
    3. search_function
        
        - Is defined above by inputting a dataframe filled with company codes
        
    4. first_login
    
        - A boolean that is True if your browser has not yet logged into Factiva and False otherwise. Every time you
        reopen the browser, you need to set this to True because closing the browser deletes some of your session cookies. Afterwards, you
        may return to the main search page by setting this value to false and rerunning the function.
        
    """
    
    url = 'https://guides.lib.utexas.edu/db/144'
    option = webdriver.ChromeOptions()
    option.add_experimental_option("debuggerAddress", "localhost:9222")
    driver = webdriver.Chrome(executable_path = "C://Users//galon//cd_secure//chromedriver.exe",options=option)
    wait = WebDriverWait(driver, 10)
    driver.set_page_load_timeout(20)
    driver.get(url)
    
    if first_login == True:

        wait.until(EC.element_to_be_clickable((By.ID, 'username')))
        driver.find_element(By.ID, 'username').send_keys(eid_username)
        driver.find_element(By.XPATH, '//input[@id="password"]').send_keys(eid_password)    
        driver.find_element(By.XPATH, "//input[@value='Sign in']").click()
        try:
            wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@id='trust-browser-button']"))).click()
            
        except TimeoutException:
            print('Duo cookies still valid; proceeding to search page...')   
    print(f'\nStarting at: {datetime.now()}\n')
    searches = search_function
    df = pd.DataFrame.from_dict({'year': [], 'quarter': [], 'count': []})
    
    for search in searches:
        
        pub_code = search.split('and')[0].strip()
        co_code = search.split('fds=')[-1]
        years = range(1995, 2021)
        duplicates = 0
        for year in years:
            counter = {'1': 0, '2': 0, '3': 0, '4': 0}            
            if year < 2020:
                dates = ('12', '31')
            else:
                dates = ('06', '30')
            print(year)
            enter_search(driver, wait, dates, year, search)

            while year:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                print(soup.find('div', {'id': 'headlines'}).text.split('.')[0].strip())
                if "No search results" == soup.find('div', {'id': 'headlines'}).text.split('.')[0].strip():
                    wait.until(EC.element_to_be_clickable((By.ID, 'btnModifySearch'))).click()
                    break                    
                else:
                    counter, article_links = get_page_info(soup, counter)
                    try:
                        duplicates += int(soup.find('span', {'id': 'dedupSummary'}).text.split(':')[1].strip())
                    except KeyError:
                        print('No duplicates on this page')
                    try:
                        wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="nextItem"]'))).click()
                        wait.until(EC.visibility_of_element_located((By.XPATH, '//img[@src="../img/listmanager/progress.gif"]')))
                        wait.until(EC.invisibility_of_element_located((By.XPATH, '//img[@src="../img/listmanager/progress.gif"]')))
                    except TimeoutException:
                        wait.until(EC.element_to_be_clickable((By.ID, 'btnModifySearch'))).click()
#                         driver.get('https://guides.lib.utexas.edu/db/144')
                        break
            df = df.append(pd.DataFrame.from_dict(
                {
                        'year': [year, year, year, year],
                        'quarter': list(counter.keys()),
                        'count': list(counter.values()),
                        'company_code': [co_code, co_code, co_code, co_code],
                        'pub_code': [pub_code,pub_code,pub_code,pub_code,]
                }
            ))
            display(df)
        print(duplicates)
    print(f'Done!\n{datetime.now()}')

In [6]:
get_all_text('gal767', os.getenv('eid_password'), gen_searches(), first_login=False)

  driver = webdriver.Chrome(executable_path = "C://Users//galon//cd_secure//chromedriver.exe",options=option)



Starting at: 2022-10-26 23:06:56.270793

1995
No search results


  df = df.append(pd.DataFrame.from_dict(


Unnamed: 0,year,quarter,count,company_code,pub_code
0,1995.0,1,0.0,applc,rst=sfusat
1,1995.0,2,0.0,applc,rst=sfusat
2,1995.0,3,0.0,applc,rst=sfusat
3,1995.0,4,0.0,applc,rst=sfusat


1996
No search results


  df = df.append(pd.DataFrame.from_dict(


Unnamed: 0,year,quarter,count,company_code,pub_code
0,1995.0,1,0.0,applc,rst=sfusat
1,1995.0,2,0.0,applc,rst=sfusat
2,1995.0,3,0.0,applc,rst=sfusat
3,1995.0,4,0.0,applc,rst=sfusat
0,1996.0,1,0.0,applc,rst=sfusat
1,1996.0,2,0.0,applc,rst=sfusat
2,1996.0,3,0.0,applc,rst=sfusat
3,1996.0,4,0.0,applc,rst=sfusat


1997
1


  df = df.append(pd.DataFrame.from_dict(


Unnamed: 0,year,quarter,count,company_code,pub_code
0,1995.0,1,0.0,applc,rst=sfusat
1,1995.0,2,0.0,applc,rst=sfusat
2,1995.0,3,0.0,applc,rst=sfusat
3,1995.0,4,0.0,applc,rst=sfusat
0,1996.0,1,0.0,applc,rst=sfusat
1,1996.0,2,0.0,applc,rst=sfusat
2,1996.0,3,0.0,applc,rst=sfusat
3,1996.0,4,0.0,applc,rst=sfusat
0,1997.0,1,2.0,applc,rst=sfusat
1,1997.0,2,12.0,applc,rst=sfusat


1998
1


KeyboardInterrupt: 