In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *

# Data locations, given relative to the current working directory.
# Built from components so the platform's separator is used (the previous
# single-argument join left a hand-written "..//data" untouched).
DATA_DIR = os.path.join("..", "data")
# get_all_frequencies appends one JSON file of article links per search here.
ARTICLE_DIR = os.path.join(DATA_DIR, 'article_soup')
# get_all_frequencies appends the per-quarter counts to test.csv here.
CLEAN_DIR = os.path.join(DATA_DIR, 'clean')

# hr benefits, fintech, 

In [2]:
def gen_searches(pickle_path='search_list.pickle', years=None, sources=None, companies=None):
    """Return the list of searches that still have to be run.

    Every entry is a string of the form
    ``'<source clause> and fds=<company>,<year>'``.

    If ``pickle_path`` exists, the list stored in it is returned unchanged
    (get_all_frequencies rewrites that file with the remaining searches after
    each successful save). If it does not exist, a fresh list is generated
    from the cross product of ``sources`` x ``companies`` x ``years``.

    Parameters
    ----------
    pickle_path : str
        File holding the pickled list of remaining searches.
    years : iterable of int, optional
        Years to search; defaults to 1995 through 2020 inclusive.
    sources : iterable of str, optional
        Source clauses (``rst=...``); defaults to the five publications
        hard-coded below.
    companies : iterable of str, optional
        Company codes used in the ``fds=`` clause; defaults to ``['applc']``.

    Returns
    -------
    list of str
        Remaining searches. May be empty when the stored list has been
        exhausted, in which case a hint is printed.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # pickle_path at files written by this notebook.
        with open(pickle_path, 'rb') as file:
            searches = pickle.load(file)
    except FileNotFoundError:
        # No saved progress: build the full grid, source-major, then company, then year.
        years = range(1995, 2021) if years is None else years
        if sources is None:
            sources = ['rst=sfusat', 'rst=sfwsj', 'rst=sfnyt', 'rst=sfglob', '(rst=ftft or rst=ftcom)']
        companies = ['applc'] if companies is None else companies
        return [
            f'{source} and fds={company},{year}'
            for source in sources
            for company in companies
            for year in years
        ]

    if len(searches) == 0:
        print('Create a new pickle to manage searches; current list has been exhausted.')
    return searches

In [3]:
def get_year_info(driver, wait):
    """Walk every result page of the current search and tally what was found.

    The page source is re-parsed on each iteration. Pages are processed with
    get_page_info / get_duplicates and advanced with next_page until a page
    without a ``nextItem`` link is reached; modify_search is then called
    before returning.

    Parameters
    ----------
    driver
        Browser driver exposing ``page_source``; the loop only runs while it
        is truthy.
    wait
        Passed through to next_page / modify_search.

    Returns
    -------
    tuple
        ``(counter, article_links, total, duplicates)``. ``counter`` is a dict
        keyed ``'1'``..``'4'`` (the caller stores the keys as quarters). If a
        page has no ``headline`` rows, the links collected so far are dropped
        and an empty list is returned in their place. ``None`` is returned if
        ``driver`` is falsy.

    Raises
    ------
    ValueError
        If, on the last page, ``total - duplicates`` does not equal the sum
        of the counter values. (Previously a message string was returned,
        which made the caller's four-way unpack fail with an unrelated
        ValueError.)
    """
    duplicates = 0
    counter = {'1': 0, '2': 0, '3': 0, '4': 0}
    total = 0
    article_links = []

    while driver:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        if soup.find('tr', {'class': 'headline'}) is None:
            # No headline rows on this page: report no links for the search.
            article_links = []
            modify_search(driver, wait)
            return counter, article_links, total, duplicates

        counter, total, article_links = get_page_info(soup, counter, total, article_links)
        duplicates += get_duplicates(soup)

        # attrs must be a dict; the former set literal {'class', 'nextItem'}
        # was not an attribute filter.
        if soup.find('a', {'class': 'nextItem'}) is not None:
            next_page(driver, wait)
            continue

        # Last page: the per-key counts must add up to the de-duplicated total.
        if (total - duplicates) != sum(counter.values()):
            raise ValueError('Did not count duplicates properly; increase sleep time if necessary')

        modify_search(driver, wait)
        return counter, article_links, total, duplicates

In [4]:
def get_all_frequencies(eid_username, eid_password, path):

    """
    Run every pending search from gen_searches() and save its results.

    For each search the function enters the search with a date range for that
    year, collects the counts and article links via get_year_info, and then:

        - appends the links to ``<company>_<publication>_<year>.json`` in ARTICLE_DIR
        - appends one row per counter key to ``test.csv`` in CLEAN_DIR
        - rewrites ``search_list.pickle`` with the searches that are still pending

    Argument explanations:

    1. eid_username

        - Should be your UT EID username that you use to login to Canvas

    2. eid_password

        - Should be your password to login to Canvas
        - I recommend putting the password in a .env file and using the following format for the password argument:
        os.getenv("password_variable_name") where password_variable_name is the name you chose for the password
        variable inside the .env file.

    3. path

        - This is the location of your chromedriver.exe. This file should be in the directory you created to run chromedriver locally

    Login itself is delegated to get_page(driver, wait, eid_username, eid_password).

    Returns:

        - None when all searches were processed, or when there were no searches to run
        - The string 'Error when saving' if writing any of the output files failed
          (the underlying exception is printed first); processing stops at that search.

    """
    # `re` is not imported explicitly in the visible import cell, so bring it in here.
    import re

    searches = gen_searches()
    if not searches:
        # Nothing left to run: do not start the browser only to fail on searches[0].
        return None

    searches_pickle = searches.copy()
    driver, wait = set_driver(path)
    get_page(driver, wait, eid_username, eid_password)

    print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\n')

    csv_path = os.path.join(CLEAN_DIR, 'test.csv')

    for search_text in searches:

        # Entries look like '<source clause> and fds=<company>,<year>'.
        parts = search_text.split(',')
        search = parts[0]
        year = int(parts[1])

        # '(rst=ftft or rst=ftcom)' -> 'ftft_ftcom'; 'rst=sfwsj' -> 'sfwsj'.
        # Split on ' and ' (with spaces) so codes that merely contain "and" stay intact.
        source_clause = re.sub(r'[()]', '', search.split(' and ')[0].strip())
        pub_code = "_".join(tok for tok in re.split(r'rst=|\s', source_clause) if tok and tok != 'or')
        co_code = search.split('fds=')[-1]
        key = f'{co_code}_{pub_code}_{year}'

        # Years before 2020 are searched in full; from 2020 on the range ends on 30 June.
        end_month, end_day = ('12', '31') if year < 2020 else ('06', '30')
        date_dict = {'frm': '01', 'frd': '01', 'fry': year, 'tom': end_month, 'tod': end_day, 'toy': year}

        enter_search(driver, wait, date_dict, search)
        counter, article_links, total, duplicates = get_year_info(driver, wait)
        articles = {key: article_links}
        # This search is done; the shortened list is persisted only after a successful save.
        searches_pickle.pop(0)

        try:
            with open(os.path.join(ARTICLE_DIR, f'{key}.json'), 'a+') as file:
                file.write('\n')
                json.dump(articles, file)
                file.write('\n')

            n_rows = len(counter)
            pd.DataFrame.from_dict(
                {
                    'year': [year] * n_rows,
                    'quarter': list(counter.keys()),
                    'count': list(counter.values()),
                    'company_code': [co_code] * n_rows,
                    'pub_code': [pub_code] * n_rows,
                }
            ).to_csv(csv_path, mode='a', header=not os.path.exists(csv_path))  # header only on first write

            with open('search_list.pickle', 'wb') as file:
                pickle.dump(searches_pickle, file)
        except Exception as err:
            # Report the real cause instead of hiding it; KeyboardInterrupt is no longer swallowed.
            print(f'Error when saving {key}: {err!r}')
            return 'Error when saving'

In [None]:
# Username and chromedriver location can be overridden through environment
# variables (e.g. in the .env file loaded at the top of the notebook); the
# previous hard-coded values remain the defaults.
get_all_frequencies(
    os.getenv('eid_username', 'gal767'),
    os.getenv('eid_password'),
    os.getenv('chromedriver_path', "C://Users//galon//cd_secure//chromedriver.exe"),
)

Duo cookies still valid; proceeding to search page...

Starting at: 2022-10-28 17:13:42.699454
First search term is rst=sfwsj and fds=applc,2012

