In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *


# Working directories: each is a sub-folder of the "..//data" folder.
ARTICLE_DIR, CLEAN_DIR, RAW_DIR = (
    os.path.join("..//data", folder)
    for folder in ('article_soup', 'clean', 'raw')
)

# Input: CSV read by gen_searches for the publication codes and search commands.
SOURCE_PATH = os.path.join('..//data', 'source_codes.csv')

# Output: CSV inside CLEAN_DIR that the scraped counts are appended to.
CSV_OUTPATH = os.path.join(CLEAN_DIR, 'No5301_5350_Longoria.csv')



In [2]:
def gen_searches(raw_dir=None, source_path=None, pickle_path='search_list.pickle'):
    """
    Return the list of searches that still have to be run.

    If a checkpoint pickle exists it is loaded and returned as-is, so an
    interrupted run resumes where it stopped. Otherwise a fresh list is built
    with one entry per (search command, company, year) combination for the
    years 1995-2020 inclusive. Each entry is a string of the form
    ``"<command><company>,<year>,<pub_code>,<company>"``.

    Args:
        raw_dir: Directory holding the raw CSV files, each with a
            ``factiva_company_code`` column. Defaults to ``RAW_DIR``.
        source_path: CSV with ``Newspaper_code`` and ``search command``
            columns. Defaults to ``SOURCE_PATH``.
        pickle_path: Checkpoint file holding the remaining searches.

    Returns:
        list of str: The pending searches. The list is empty when the
        checkpoint has been exhausted or when no company codes were found.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only ever point
    # this at a checkpoint file written by this notebook.
    try:
        with open(pickle_path, 'rb') as handle:
            searches = pickle.load(handle)
    except FileNotFoundError:
        pass
    else:
        if len(searches) == 0:
            print('Create a new pickle to manage searches; current list has been exhausted.')
        return searches

    # Resolve the defaults at call time so the module constants stay the
    # single source of truth.
    raw_dir = RAW_DIR if raw_dir is None else raw_dir
    source_path = SOURCE_PATH if source_path is None else source_path

    # Collect the company codes from every raw file. Sorting the listing
    # makes the resulting order reproducible (os.listdir order is arbitrary).
    companies = []
    for name in sorted(os.listdir(raw_dir)):
        raw_file = os.path.join(raw_dir, name)
        if not os.path.isfile(raw_file):
            # Skip sub-directories such as .ipynb_checkpoints.
            continue
        file_codes = pd.read_csv(raw_file)['factiva_company_code'].values
        companies.extend(str(code).lower() for code in file_codes)

    source_df = pd.read_csv(source_path)
    pub_codes = source_df['Newspaper_code'].values
    commands = source_df['search command'].values
    years = range(1995, 2021)

    return [
        f'{command}{company},{year},{pub_code},{company}'
        for pub_code, command in zip(pub_codes, commands)
        for company in companies
        for year in years
    ]

In [3]:
def get_year_info(driver, wait):
    """
    Page through the results of the current search and tally them.

    Every result page is parsed with BeautifulSoup. Pages are handed to
    ``get_page_info`` (which returns the updated counter, running total and
    article links) and ``get_duplicates`` (whose result is added to the
    duplicate count), and ``next_page`` is called while a ``nextItem`` link
    is present.

    Args:
        driver: Browser driver whose ``page_source`` holds the result page.
        wait: Wait helper forwarded to the navigation functions.

    Returns:
        tuple: ``(counter, article_links, total, duplicates)``. ``counter``
        is a dict keyed ``'1'``-``'4'`` (written out as quarters by
        ``get_all_frequencies``). When a page has no ``headline`` row the
        tallies gathered so far are returned with an empty link list.

    Raises:
        ValueError: If, on the last page, ``total - duplicates`` does not
            equal the sum of the counter values. Earlier versions returned
            this message as a string, which callers unpacking four values
            could not handle.
    """
    duplicates = 0
    counter = {'1': 0, '2': 0, '3': 0, '4': 0}
    total = 0
    article_links = []

    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # No headline rows: nothing (more) to collect for this search.
        if soup.find('tr', {'class': 'headline'}) is None:
            article_links = []
            # presumably resets the browser for the next search -- confirm in bot/parse
            get_new_page(driver, wait)
            return counter, article_links, total, duplicates

        # Look for the pagination link before the page is handed to the
        # parsing helpers, mirroring the original evaluation order.
        has_next_page = soup.find('a', {'class': 'nextItem'}) is not None

        counter, total, article_links = get_page_info(soup, counter, total, article_links)
        duplicates += get_duplicates(soup)

        if has_next_page:
            next_page(driver, wait)
            continue

        # Last page: the tallies must reconcile before they are trusted.
        if (total - duplicates) != sum(counter.values()):
            raise ValueError('Did not count duplicates properly; increase sleep time if necessary')

        get_new_page(driver, wait)
        return counter, article_links, total, duplicates

In [4]:
def _split_search_entry(text):
    """Split one search entry into ``(search, year, pub_code, co_code)``."""
    # Split from the right so that commas inside the search command survive.
    search, year, pub_code, co_code = text.rsplit(',', 3)
    return search, int(year), pub_code, co_code


def _date_range_for(year):
    """Build the date dict passed to enter_search; 2020 and later end on 30 June."""
    end_month, end_day = ('12', '31') if year < 2020 else ('06', '30')
    return {'frm': '01', 'frd': '01', 'fry': year, 'tom': end_month, 'tod': end_day, 'toy': year}


def _save_search_results(co_code, pub_code, year, counter, article_links, remaining):
    """Write one search's links and counts, then checkpoint the remaining searches."""
    key = f'{co_code}_{pub_code}_{year}'

    with open(os.path.join(ARTICLE_DIR, f'{key}.json'), 'a+') as file:
        file.write('\n')
        json.dump({key: article_links}, file)
        file.write('\n')

    rows = len(counter)
    pd.DataFrame.from_dict(
        {
            'year': [year] * rows,
            'quarter': list(counter.keys()),
            'count': list(counter.values()),
            'company_code': [co_code] * rows,
            'pub_code': [pub_code] * rows,
        }
    ).to_csv(CSV_OUTPATH, mode='a', header=not os.path.exists(CSV_OUTPATH))

    # The checkpoint is written last so a failed save never drops a search.
    with open('search_list.pickle', 'wb') as file:
        pickle.dump(remaining, file)


def get_all_frequencies(eid_username, eid_password, path):
    """
    Run every pending search and save the per-quarter article counts.

    Brief walkthrough of the arguments:

    1. eid_username

        - Should be your UT EID username that you use to login to Canvas

    2. eid_password

        - Should be your password to login to Canvas
        - As the readme states, store your password in a .env file

    3. path

        - This is the location of your chromedriver.exe which should be in
          the directory created to run chromedriver locally

    For each search the article links are appended to a JSON file in
    ARTICLE_DIR, the counts are appended to CSV_OUTPATH, and the list of
    remaining searches is written to ``search_list.pickle``.

    Returns:
        None when every search was processed or there was nothing to run;
        the string ``'Error when saving'`` if writing the results failed.

    Raises:
        ValueError: If get_year_info reports that the duplicate count does
            not reconcile, or if a search entry is malformed.
    """
    searches = gen_searches()
    if not searches:
        # Nothing to do: avoid opening a browser and indexing an empty list.
        print('No searches to run.')
        return None

    searches_pickle = searches.copy()
    driver, wait = set_driver(path)
    try:
        open_page(driver, wait, eid_username, eid_password)
#         open_tab(driver, wait)

        print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\nLength of the current list is {len(searches)}\n')

        for text in searches:
            search, year, pub_code, co_code = _split_search_entry(text)

            enter_search(driver, wait, _date_range_for(year), search)

            year_info = get_year_info(driver, wait)
            if isinstance(year_info, str):
                # get_year_info signalled a failure with a message string.
                raise ValueError(year_info)
            counter, article_links, _total, _duplicates = year_info

            searches_pickle.pop(0)
            try:
                _save_search_results(co_code, pub_code, year, counter, article_links, searches_pickle)
            except Exception as exc:
                # Report the cause instead of hiding it; the return value is
                # kept for existing callers.
                print(f'Error when saving {co_code}_{pub_code}_{year}: {exc!r}')
                return 'Error when saving'
    finally:
        # Always release the browser, including after a WebDriverException.
        try:
            driver.quit()
        except Exception:
            # Best-effort cleanup: the session may already be gone, and a
            # failure here must not mask the original error.
            pass

In [6]:
# Launch the scrape. The password comes from the `eid_password` environment
# variable, which is expected to be loaded from .env by load_dotenv().
soup = get_all_frequencies(
    eid_username='gal767',
    eid_password=os.getenv('eid_password'),
    path="C://Users//galon//cd_secure//chromedriver.exe",
)

Duo cookies still valid; proceeding to search page...

Starting at: 2022-10-31 10:38:57.029440
First search term is (rst=ftft or rst=ftcom) and fds=hrpgrp,2002,sfftftp,hrpgrp
Length of the current list is 8963



WebDriverException: Message: unknown error: cannot determine loading status
from disconnected: received Inspector.detached event
  (Session info: chrome=102.0.5005.63)
Stacktrace:
Backtrace:
	Ordinal0 [0x005B2733+2434867]
	Ordinal0 [0x005407A1+1968033]
	Ordinal0 [0x0042C678+837240]
	Ordinal0 [0x0041E45E+779358]
	Ordinal0 [0x0041E04F+778319]
	Ordinal0 [0x0041D646+775750]
	Ordinal0 [0x0041C565+771429]
	Ordinal0 [0x0041CB68+772968]
	Ordinal0 [0x00434AB2+871090]
	Ordinal0 [0x0042EB7B+846715]
	Ordinal0 [0x0042E6F5+845557]
	Ordinal0 [0x0042EDD3+847315]
	Ordinal0 [0x00427F25+818981]
	Ordinal0 [0x004284DB+820443]
	Ordinal0 [0x0041DD3B+777531]
	Ordinal0 [0x0041E265+778853]
	Ordinal0 [0x0041E04F+778319]
	Ordinal0 [0x0041D646+775750]
	Ordinal0 [0x0041CEBC+773820]
	Ordinal0 [0x00432153+860499]
	Ordinal0 [0x004847B7+1198007]
	Ordinal0 [0x00474256+1131094]
	Ordinal0 [0x0044E840+976960]
	Ordinal0 [0x0044F736+980790]
	GetHandleVerifier [0x00823C72+2515426]
	GetHandleVerifier [0x0081702F+2463135]
	GetHandleVerifier [0x0065522A+620442]
	GetHandleVerifier [0x00654016+615814]
	Ordinal0 [0x0054707B+1994875]
	Ordinal0 [0x0054B938+2013496]
	Ordinal0 [0x0054BA25+2013733]
	Ordinal0 [0x00554DE1+2051553]
	BaseThreadInitThunk [0x75EDFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77AE7BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77AE7B8E+238]


In [None]:
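### Refactoring work in progress
# NOTE: draft only. The fragment below uses `year`, `driver`, `wait` and `search`
# and ends with `return`, so it appears to belong inside a function body and
# cannot be uncommented as a standalone cell.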
# if year == 1995:
#     date_dict = {'frm': '01', 'frd': '01', 'fry': 1995, 'tom': '06', 'tod': '30', 'toy': 2020}
#     enter_search(driver, wait, date_dict, search)
#     soup = BeautifulSoup(driver.page_source, 'html.parser')
#     check_frequency_chart(soup)
# return driver.page_source