In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *


# Directories
ARTICLE_DIR = os.path.join("..//data", 'article_soup')
CLEAN_DIR = os.path.join("..//data", 'clean')
RAW_DIR = os.path.join("..//data", 'raw')

# Inpaths
SOURCE_PATH = os.path.join(RAW_DIR, 'source_codes.csv')
CO_PATH = os.path.join(RAW_DIR, 'STEP4_Factiva_company_list_No5301_5350_sector_Ishika.csv')

# Outpaths
CSV_OUTPATH = os.path.join(CLEAN_DIR, 'No5301_5350_Longoria.csv')

# Notes
# to get the actual link, split the href on ";" and take the first index
# add that to the base factiva url

In [2]:
def gen_searches():
    
    companies = [str(code).lower() for code in pd.read_csv(CO_PATH)['factiva_company_code'].values]
    df = pd.read_csv(SOURCE_PATH)
    codes = df['Newspaper_code'].values
    commands = df['search command'].values
    sources = {code: commands[i] for i, code in enumerate(codes)}    
    try:
        with open('ptest.pickle', 'rb') as file:
            searches = pickle.load(file)
            if len(searches) == 0:
                print('Create a new pickle to manage searches; current list has been exhausted.')
    except FileNotFoundError:
        searches =  [f'{command}{company},{codes[i]},{company}' for company in companies for i,command in enumerate(commands)]
    return searches

In [3]:
def all_none_dataframe(co_code, pub_code):

    for year in range(1995,2021):

        if year == 1995:

            info_dict = {
            'year': [1995,1995,1995,1995],
            'quarter': [1,2,3,4],
            }
        elif year < 2020:

            info_dict['year'] += [year,year,year,year]
            info_dict['quarter'] += [1,2,3,4]
        elif year == 2020:

            info_dict['year'] += [2020,2020]
            info_dict['quarter'] += [1,2]

    df = pd.DataFrame.from_dict(info_dict)
    df['count'] = 0
    df['company_code'] = co_code
    df['pub_code'] = pub_code

    return df

In [4]:
def year_none_dataframe(co_code, pub_code, year):
    
    if year < 2020:
        
        info_dict = {
            'year': [year, year, year, year],
            'quarter': [1,2,3,4],
        }
    else:
        
        info_dict = {
            'year': [year,year],
            'quarter': [1,2],
        }
        
    df = pd.DataFrame.from_dict(info_dict)
    df['count'] = 0
    df['company_code'] = co_code
    df['pub_code'] = pub_code
    
    return df

In [5]:
def get_year_info(driver, wait):
    
    duplicates = 0
    counter = {'1': 0, '2': 0, '3': 0, '4': 0}
    total = 0
    article_links = []
    
    while driver:

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup.find('tr', {'class': 'headline'}) == None:
            article_links = []
            get_new_page(driver, wait)
            return counter, article_links

        elif soup.find('a', {'class', 'nextItem'}) !=  None:

            counter, total, article_links = get_page_info(soup, counter, total, article_links)
            duplicates += get_duplicates(soup)
            next_page(driver, wait)

        elif soup.find('a', {'class', 'nextItem'}) ==  None:

            counter, total, article_links = get_page_info(soup, counter, total, article_links)
            duplicates += get_duplicates(soup)
            counter_total = sum(list(counter.values()))

            if (total-duplicates) != counter_total:
                
                return 'Did not count duplicates properly; increase sleep time if necessary'
            else:
                
                get_new_page(driver, wait)
                return counter, article_links

In [6]:
def get_all_frequencies(eid_username, eid_password, path):
    
    """
    Brief walkthrough of the arguments:
    
    1. eid_username
    
        - Should be your UT EID username that you use to login to Canvas
    
    2. eid_password
    
        - Should be your password to login to Canvas
        - As the readme states, store your password in a .env file
        
    3. path
        
        - This is the location of your chromedriver.exe which should be in the directory created to run chromedriver locally
        
    """
    
    searches = gen_searches()
    searches_pickle = searches.copy()
    driver, wait = set_driver(path)
    open_page(driver, wait, eid_username, eid_password)
    open_tab(driver, wait)

    print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\nLength of the current list is {len(searches)}\n')
    
    for i, text in enumerate(searches):
        
        search = text.split(',')[0]
        pub_code = text.split(',')[1]
        co_code = text.split(',')[2]
        articles = {f'{co_code}_{"".join(pub_code)}_{year}': [] for year in range(1995,2021)}
        date_dict = {'frm': '01', 'frd': '01', 'fry': 1995, 'tom': '06', 'tod': '30', 'toy': 2020}
        enter_search(driver, wait, date_dict, search)
        results = check_frequency_chart(BeautifulSoup(driver.page_source, 'html.parser'))
        
        if results == None:
            
            df = all_none_dataframe(co_code,pub_code)
            try:
                searches_pickle.pop(0)
                df.to_csv('test.csv', mode='a', header=not os.path.exists('test.csv'))
                with open('ptest.pickle', 'wb') as file:
                    pickle.dump(searches_pickle, file)
            except:
                return 'Error when saving'
        else:
            
            df = pd.DataFrame()
            for year in range(1995, 2021):
                if year in results:
                    if year < 2020:
                        date_dict = date_dict = {'frm': '01', 'frd': '01', 'fry': year, 'tom': '12', 'tod': '31', 'toy': year}
                    else:
                        date_dict = {'frm': '01', 'frd': '01', 'fry': year, 'tom': '06', 'tod': '30', 'toy': year}
                        
                    enter_search(driver, wait, date_dict, search)
                    counter, article_links = get_year_info(driver, wait)
                    df = pd.concat([df, pd.DataFrame.from_dict({
                        
                        'year': [year, year, year, year],
                        'quarter': list(counter.keys()),
                        'count': list(counter.values()),
                        'company_code': [co_code, co_code, co_code, co_code],
                        'pub_code': [pub_code,pub_code,pub_code,pub_code,]
                    })])
                    articles[f'{co_code}_{pub_code}_{year}'].append(article_links)

                else:
                    df = pd.concat([df, year_none_dataframe(co_code, pub_code, year)])
        
            try:
                searches_pickle.pop(0)
                for name, article_list in articles.items():
                    if len(article_list) == 0:
                        continue
                    else:
                        with open(os.path.join('article_test', f'{name}.json'), 'w+') as file:
                            file.write('\n')
                            json.dump(article_list, file)
                            file.write('\n')
                df.to_csv('test.csv', mode='a', header=not os.path.exists('test.csv'))
                with open('ptest.pickle', 'wb') as file:
                    pickle.dump(searches_pickle, file)
            except:
                return 'Error when saving.'

In [7]:
get_all_frequencies('gal767', os.getenv('eid_password'), "C://Users//galon//cd_secure//chromedriver.exe")

Create a new pickle to manage searches; current list has been exhausted.
Duo cookies still valid; proceeding to search page...


IndexError: list index out of range