In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *


# Every location below hangs off the same data root, given relative to the
# directory the notebook is run from.
_DATA_ROOT = "..//data"

# Directories
ARTICLE_DIR = os.path.join(_DATA_ROOT, "article_soup")  # one JSON file of article links per search
CLEAN_DIR = os.path.join(_DATA_ROOT, "clean")           # holds the aggregated CSV output
RAW_DIR = os.path.join(_DATA_ROOT, "raw")               # CSVs with a 'factiva_company_code' column

# Inpaths
SOURCE_PATH = os.path.join(_DATA_ROOT, "source_codes.csv")  # newspaper codes and their search commands

# Outpaths
CSV_OUTPATH = os.path.join(CLEAN_DIR, "No5301_5350_Longoria.csv")  # counts are appended here



In [2]:
def gen_searches(raw_dir=None, source_path=None, pickle_path='search_list.pickle'):
    
    """
    Return the list of searches that still have to be run.

    Every entry is a comma-separated string of the form
    '{search command}{company},{year},{newspaper code},{company}'.

    If a pickle of pending searches exists it is returned as-is, so an
    interrupted run resumes where it stopped. Otherwise a fresh list is built:
    one entry per (newspaper, company, year) for the years 1995 through 2020.

    Brief walkthrough of the arguments:

    1. raw_dir

        - Directory of CSV files that each have a 'factiva_company_code' column
        - Company codes from ALL files are used (lower-cased, each code once)
        - Defaults to RAW_DIR

    2. source_path

        - CSV with the columns 'Newspaper_code' and 'search command'
        - Defaults to SOURCE_PATH

    3. pickle_path

        - Pickled list of the searches that are still pending
        - Defaults to 'search_list.pickle' in the working directory

    """
    
    raw_dir = RAW_DIR if raw_dir is None else raw_dir
    source_path = SOURCE_PATH if source_path is None else source_path

    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # this at a pickle that this notebook wrote itself.
        with open(pickle_path, 'rb') as handle:
            searches = pickle.load(handle)
    except FileNotFoundError:
        pass
    else:
        if len(searches) == 0:
            print('Create a new pickle to manage searches; current list has been exhausted.')
        return searches

    # No saved progress: build the full list from the input CSVs.
    companies = []
    for name in sorted(os.listdir(raw_dir)):
        raw_path = os.path.join(raw_dir, name)
        if not os.path.isfile(raw_path):
            # Skip sub-directories such as .ipynb_checkpoints.
            continue
        file_codes = pd.read_csv(raw_path)['factiva_company_code'].values
        companies.extend(str(code).lower() for code in file_codes)
    # Drop repeated codes but keep first-seen order, so no search runs twice.
    companies = list(dict.fromkeys(companies))

    source_df = pd.read_csv(source_path)
    codes = source_df['Newspaper_code'].values
    commands = source_df['search command'].values
    years = range(1995, 2021)

    return [
        f'{command}{company},{year},{code},{company}'
        for command, code in zip(commands, codes)
        for company in companies
        for year in years
    ]

In [3]:
def get_year_info(driver, wait):
    
    """
    Page through the results of the search currently open in the browser and
    tally them.

    Each results page is parsed with BeautifulSoup and handed to
    get_page_info() and get_duplicates(); next_page() advances until a page
    has no 'nextItem' link. The browser is then sent back to the database
    landing page so the next search can start.

    Arguments:

        - driver: browser driver; must expose page_source and get(url)
        - wait: passed through unchanged to next_page()

    Returns the tuple (counter, article_links, total, duplicates):

        - counter: dict keyed '1'..'4', updated by get_page_info()
        - article_links: links accumulated by get_page_info(); an empty list
          if a page without any headline row is reached
        - total: running total maintained by get_page_info()
        - duplicates: sum of get_duplicates() over all pages

    Raises RuntimeError if, on the last page, total minus duplicates does not
    equal the sum of the counter values.
    """
    
    landing_url = 'https://guides.lib.utexas.edu/db/144'
    duplicates = 0
    counter = {'1': 0, '2': 0, '3': 0, '4': 0}
    total = 0
    article_links = []
    
    while driver:

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        if soup.find('tr', {'class': 'headline'}) is None:
            # No headline rows on this page: go back to the landing page and
            # report whatever was counted so far, with an empty link list.
            driver.get(landing_url)
            return counter, [], total, duplicates

        # Look for the pager link before the page is processed, so the check
        # runs against the soup as it was parsed.
        has_next_page = soup.find('a', {'class': 'nextItem'}) is not None

        counter, total, article_links = get_page_info(soup, counter, total, article_links)
        duplicates += get_duplicates(soup)

        if has_next_page:
            next_page(driver, wait)
            continue

        # Last page: the per-key counts must add up to the non-duplicate total.
        counter_total = sum(counter.values())
        if (total - duplicates) != counter_total:
            # Raise instead of returning the message: callers unpack four
            # values, so a returned string would only fail later and obscurely.
            raise RuntimeError('Did not count duplicates properly; increase sleep time if necessary')

        driver.get(landing_url)
        return counter, article_links, total, duplicates

In [4]:
def get_all_frequencies(eid_username, eid_password, path):
    
    """
    Run every pending search and save the article counts and article links.

    For each search string from gen_searches() this enters the search for one
    calendar year, collects the results with get_year_info(), and then writes:

        - the article links to '{company}_{publication}_{year}.json' in ARTICLE_DIR
        - one row per counter key (labelled 'quarter') appended to CSV_OUTPATH
        - the searches still left to run to 'search_list.pickle', so that an
          interrupted run can be resumed

    Brief walkthrough of the arguments:
    
    1. eid_username
    
        - Should be your UT EID username that you use to login to Canvas
    
    2. eid_password
    
        - Should be your password to login to Canvas
        - As the readme states, store your password in a .env file
        
    3. path
        
        - This is the location of your chromedriver.exe which should be in the directory created to run chromedriver locally

    Returns:

        - None once all searches are processed, or if there is nothing to run
        - 'Error when saving' if writing any of the output files fails
        - the message from get_year_info() if it returns one instead of results
        
    """
    
    searches = gen_searches()
    if not searches:
        # Nothing to do; do not launch a browser or log in for an empty list.
        print('No searches left to run.')
        return

    searches_pickle = searches.copy()
    driver, wait = set_driver(path)
    get_page(driver, wait, eid_username, eid_password)

    print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\nLength of the current list is {len(searches)}\n')
    
    for text in searches:
        print(datetime.now())

        # The last three comma-separated fields are year, publication code and
        # company code; everything before them is the search command, which is
        # therefore allowed to contain commas itself.
        search, year_text, pub_code, co_code = text.rsplit(',', 3)
        year = int(year_text)
        search_key = f'{co_code}_{pub_code}_{year}'

        # Searches cover the whole calendar year, except from 2020 on, where
        # the window stops at June 30.
        if year < 2020:
            end_month, end_day = '12', '31'
        else:
            end_month, end_day = '06', '30'
        date_dict = {'frm': '01', 'frd': '01', 'fry': year, 'tom': end_month, 'tod': end_day, 'toy': year}
        
        enter_search(driver, wait, date_dict, search)
        result = get_year_info(driver, wait)
        if isinstance(result, str):
            # Defensive: a message instead of a result tuple means the counts
            # cannot be trusted. Stop here and keep this search pending.
            print(result)
            return result
        counter, article_links, total, duplicates = result

        articles = {search_key: article_links}
        searches_pickle.pop(0)
        try:
            with open(os.path.join(ARTICLE_DIR, f'{search_key}.json'), 'a+') as file:
                file.write('\n')
                json.dump(articles, file)
                file.write('\n')

            row_count = len(counter)
            pd.DataFrame.from_dict(
                {
                        'year': [year] * row_count,
                        'quarter': list(counter.keys()),
                        'count': list(counter.values()),
                        'company_code': [co_code] * row_count,
                        'pub_code': [pub_code] * row_count,
                }
            # Write the header only when the CSV is being created.
            ).to_csv(CSV_OUTPATH, mode='a', header=not os.path.exists(CSV_OUTPATH))

            # Saved last, so a search only leaves the pending list once its
            # results are on disk.
            with open('search_list.pickle', 'wb') as file:
                pickle.dump(searches_pickle, file)
        except Exception as error:
            # Not a bare except: Ctrl-C must still be able to stop a long run.
            print(f'Error when saving {search_key}: {error!r}')
            return 'Error when saving'

In [None]:
# Machine- and user-specific values can be overridden through environment
# variables (e.g. in the .env file); the defaults are the original settings.
EID_PASSWORD = os.getenv('eid_password')
if EID_PASSWORD is None:
    # Fail before a browser is launched rather than during the login step.
    raise RuntimeError("Environment variable 'eid_password' is not set; add it to your .env file.")

get_all_frequencies(
    os.getenv('eid_username', 'gal767'),
    EID_PASSWORD,
    os.getenv('chromedriver_path', "C://Users//galon//cd_secure//chromedriver.exe"),
)


Starting at: 2022-10-29 21:25:10.781414
First search term is rst=sfwsj and fds=cathb,2020,sfwsj,cathb
Length of the current list is 12819

2022-10-29 21:25:10.781414
2022-10-29 21:25:23.744632
2022-10-29 21:25:37.882164
2022-10-29 21:25:54.587497
2022-10-29 21:26:09.961783
2022-10-29 21:26:23.870921
2022-10-29 21:26:39.951669
2022-10-29 21:26:54.382288
2022-10-29 21:27:07.250082
2022-10-29 21:27:20.097584
2022-10-29 21:27:33.088695
2022-10-29 21:27:46.489949
2022-10-29 21:28:02.073511
2022-10-29 21:28:16.398470
2022-10-29 21:28:29.424770
2022-10-29 21:28:42.737297
2022-10-29 21:28:55.633563
2022-10-29 21:29:09.139942
2022-10-29 21:29:23.530370
2022-10-29 21:29:38.044958
2022-10-29 21:29:52.671038
no dups
no dups
no dups
no dups
no dups
no dups
no dups
2022-10-29 21:30:08.162897
no dups
no dups
2022-10-29 21:30:24.305470
no dups
no dups
no dups
2022-10-29 21:30:39.070706
no dups
no dups
2022-10-29 21:30:52.479341
no dups
no dups
2022-10-29 21:31:06.091452
2022-10-29 21:31:19.383361
202