In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *

In [2]:
def gen_searches(sources=None, companies=None):
    """Build the search strings to run, one per (source, company) pair.

    Args:
        sources: Iterable of source expressions placed before ``and`` in the
            query (e.g. ``'rst=sfwsj'``). Defaults to the notebook's five
            built-in source expressions.
        companies: Iterable of company codes placed after ``fds=``.
            Defaults to ``['applc']``.

    Returns:
        list[str]: Strings of the form ``'<source> and fds=<company>'``,
        ordered by source first and company second.
    """
    if sources is None:
        sources = ['rst=sfusat', 'rst=sfwsj', 'rst=sfnyt', 'rst=sfglob', '(rst=ftft or rst=ftcom)']
    if companies is None:
        companies = ['applc']

    # Materialise once: `companies` is iterated for every source, so a
    # one-shot iterator would otherwise be empty after the first source.
    companies = list(companies)

    return [f'{source} and fds={company}' for source in sources for company in companies]

In [3]:
def get_year_info(driver, wait, year, counter, total, duplicates):
    """Page through the current search results for one year and tally them.

    Each pass re-parses ``driver.page_source``. A page without any
    ``<tr class="headline">`` row is reported as "No search results".
    Otherwise the page is handed to ``get_page_info`` and ``count_duplicates``;
    while an ``<a class="nextItem">`` link is present ``next_loop`` is called
    and the loop repeats. On the last page the tallies are cross-checked.

    Args:
        driver: Browser driver whose ``page_source`` holds the results page.
        wait: Passed through to ``next_loop`` / ``modify_search_loop``.
        year: Year being processed, used in the log output. A falsy value
            skips the loop entirely and the function returns ``None``.
        counter: Running tally passed to and returned from ``get_page_info``;
            its values are summed for the final check.
        total: Running total passed to and returned from ``get_page_info``.
        duplicates: Running duplicate count, increased by
            ``count_duplicates`` for every page.

    Returns:
        ``None`` when the year finished (no results, or the counts
        reconcile); an error-message string when ``total - duplicates``
        differs from the sum of the counter values.

    NOTE(review): ``counter`` is rebound to the value returned by
    ``get_page_info``. The caller only sees the updated tallies if that helper
    mutates the dict it was given in place -- confirm.
    """
    while year:

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        if soup.find('tr', {'class': 'headline'}) is None:
            print(f'year: {year}\nNo search results')
            modify_search_loop(driver, wait)
            break

        # `attrs` must be a dict. The previous set literal
        # {'class', 'nextItem'} is a non-dict value, which BeautifulSoup
        # treats as a class filter, so it only worked by accident and would
        # also have matched an <a class="class"> element.
        has_next_page = soup.find('a', {'class': 'nextItem'}) is not None

        # Every page that has results is tallied the same way, whether or not
        # another page follows.
        counter, _, total = get_page_info(soup, counter, total)
        duplicates += count_duplicates(soup)

        if has_next_page:
            next_loop(driver, wait)
            continue

        # Last page: the counted articles must equal the total minus duplicates.
        counter_total = sum(counter.values())
        if (total - duplicates) != counter_total:
            # NOTE(review): this exit does not call modify_search_loop, unlike
            # the other two exits -- confirm that is intended.
            return 'Did not count duplicates properly; increase sleep time if necessary'

        print(f'year: {year}\nDuplicates are equal and all articles counted')
        modify_search_loop(driver, wait)
        break

In [4]:
def get_all_freq(eid_username, eid_password, path, first_login, output_csv='test.csv'):
    """
    Run every search from gen_searches() for each year from 1995 through 2020,
    append the per-quarter counts to a CSV file and return them as a DataFrame.

    Argument explanations:

    1. eid_username

        - Should be your UT EID username that you use to log in to Canvas

    2. eid_password

        - Should be your password to log in to Canvas
        - I recommend putting the password in a .env file and using the following format for the password argument:
        os.getenv("password_variable_name") where password_variable_name is the name you chose for your password variable
        inside the .env file.

    3. path

        - This is the location of your chromedriver.exe. This file should be in the directory you created to run chromedriver locally

    4. first_login

        - A boolean that is True if your browser has not yet logged into Factiva and False otherwise. Every time you
        reopen the browser, you need to set this to True because closing the browser deletes some of your session cookies. Afterwards, you
        may return to the main search page by setting this value to False and rerunning the function.

    5. output_csv

        - File the counts are appended to, one write per (search, year). The header row is only written when the
        file does not exist yet. Defaults to 'test.csv'.

    Returns:

        A DataFrame with the columns year, quarter, count, company_code and pub_code holding every row that was
        written to output_csv during this call (empty if there were no searches).

    """

    driver, wait = set_driver(path)
    get_page(driver, 'https://guides.lib.utexas.edu/db/144')

    if first_login:
        login(driver, wait, eid_username, eid_password)

    print(f'\nStarting at: {datetime.now()}\n')
    searches = gen_searches()
    columns = ['year', 'quarter', 'count', 'company_code', 'pub_code']
    yearly_frames = []

    for search in searches:

        # Searches look like '<source expression> and fds=<company>'. Split on
        # the full separator rather than the bare substring 'and', which would
        # cut a source code that merely contains those letters.
        pub_code, _, co_code = search.rpartition(' and fds=')
        pub_code = pub_code.strip()

        for year in range(1995, 2021):

            duplicates = 0
            counter = {'1': 0, '2': 0, '3': 0, '4': 0}
            total = 0
            # Presumably the (month, day) that closes the search window: a
            # full year before 2020, half a year for 2020 -- confirm against
            # enter_search.
            dates = ('12', '31') if year < 2020 else ('06', '30')

            enter_search(driver, wait, dates, year, search)

            # get_year_info reports a count mismatch through its return value;
            # surface it instead of silently dropping it. The row is still
            # written so a long run is not interrupted.
            problem = get_year_info(driver, wait, year, counter, total, duplicates)
            if problem is not None:
                print(f'year: {year}\nWARNING: {problem}')

            # `counter` is read back here, so this relies on get_year_info
            # having updated the dict in place -- TODO confirm.
            n_rows = len(counter)
            year_frame = pd.DataFrame(
                {
                    'year': [year] * n_rows,
                    'quarter': list(counter.keys()),
                    'count': list(counter.values()),
                    'company_code': [co_code] * n_rows,
                    'pub_code': [pub_code] * n_rows,
                },
                columns=columns,
            )
            year_frame.to_csv(output_csv, mode='a', header=not os.path.exists(output_csv))
            yearly_frames.append(year_frame)

    print(f'Done!\n{datetime.now()}')

    if not yearly_frames:
        return pd.DataFrame({column: [] for column in columns})
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(yearly_frames, ignore_index=True)

In [None]:
# The user name and driver location can be overridden through the environment
# (e.g. the .env file loaded above) so they do not have to be edited in the
# notebook; the literals are only fallbacks for when the variables are unset.
get_all_freq(
    os.getenv('eid_username', 'gal767'),
    os.getenv('eid_password'),
    os.getenv('chromedriver_path', "C://Users//galon//cd_secure//chromedriver.exe"),
    first_login=False,
)


Starting at: 2022-10-28 12:55:11.665891

year: 1995
No search results
year: 1996
Duplicates are equal and all articles counted
year: 1997
Duplicates are equal and all articles counted
year: 1998
Duplicates are equal and all articles counted
year: 1999
No search results
year: 2000
Duplicates are equal and all articles counted
year: 2001
Duplicates are equal and all articles counted
year: 2002
Duplicates are equal and all articles counted
year: 2003
Duplicates are equal and all articles counted
year: 2004
Duplicates are equal and all articles counted
year: 2005
Duplicates are equal and all articles counted
year: 2006
Duplicates are equal and all articles counted
year: 2007
Duplicates are equal and all articles counted
year: 2008
Duplicates are equal and all articles counted
year: 2009
Duplicates are equal and all articles counted
year: 2010
Duplicates are equal and all articles counted
year: 2011
Duplicates are equal and all articles counted
year: 2012
Duplicates are equal and all artic