In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *


############## Enter your variables below ##############

output_name = "test_1"

input_company_file = "STEP4_Factiva_company_list_No5301_5350_sector_Ishika"

ut_eid = "gal767"

eid_password = os.getenv('eid_password')

driver_path = r"C:\Users\galon\cd_secure\chromedriver.exe"

"""
1. output_name:

    - This will be the name of the file you will upload back to the project repository
    
2. input_company_file:

    - This will be the name (without the file extension) of the csv or xlsx file that contains company codes as a column
    - This file must contain a column called 'factiva_company_code'
    - If you want to use another file, you need to adjust the gen_searches method

3. ut_eid:

    - This is the eid you use to login to the UT portal

4. eid_password
    
    - This is your eid password
    - If you put this in a dotenv this will simply be: os.getenv('eid_password')
    - IF YOU ENTER YOUR PASSWORD DIRECTLY, DO NOT PUSH TO GITHUB
    
5. driver_path

    - This is the path where you stored chromedriver.exe
    - The location of this file should have been chosen in the setup; see steps on https://github.com/galongoria/factiva
    
"""
############ Do not change anything else ##############



# Directories
# Paths are assembled from separate components instead of a literal "..\data":
# the unescaped "\d" is an invalid escape sequence, and a hard-coded backslash
# only acts as a path separator on Windows.
DATA_DIR = os.path.join("..", "data")
ARTICLE_DIR = os.path.join(DATA_DIR, 'article_hrefs', output_name)
CLEAN_DIR = os.path.join(DATA_DIR, 'clean')
RAW_DIR = os.path.join(DATA_DIR, 'raw')
# Output folders are created up front; RAW_DIR holds inputs and must already exist.
os.makedirs(ARTICLE_DIR, exist_ok=True)
os.makedirs(CLEAN_DIR, exist_ok=True)
os.makedirs('pickles', exist_ok=True)

# Inpaths
SOURCE_PATH = os.path.join(RAW_DIR, 'source_codes.csv')

# Outpaths
CSV_OUTPATH = os.path.join(CLEAN_DIR, f"{output_name}.csv")
PICKLE_OUTPATH = os.path.join("pickles", f"{input_company_file}_searches.pickle")

In [2]:
def delete_old_pickle():
    """Remove every stale file from the 'pickles' folder.

    The only file kept is the search pickle belonging to the current
    ``input_company_file`` (``<input_company_file>_searches.pickle``, with any
    '.csv' stripped from the name). Sub-directories such as
    '.ipynb_checkpoints' are left alone.
    """
    pickle_dir = 'pickles'
    keep_name = f"{input_company_file.replace('.csv','')}_searches.pickle"

    for file in os.listdir(pickle_dir):
        if file == '.ipynb_checkpoints' or file == keep_name:
            continue
        # os.listdir yields bare names, so the folder must be prepended;
        # removing the bare name would target the current working directory.
        stale_path = os.path.join(pickle_dir, file)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

In [3]:
def check_input_file():
    """Load the company input file from RAW_DIR as a DataFrame.

    Looks for ``<input_company_file>.csv`` first and falls back to
    ``<input_company_file>.xlsx``.

    Returns:
        pandas.DataFrame: contents of whichever file was found.

    Raises:
        FileNotFoundError: if neither file exists. Previously this case fell
            through and returned None, which only surfaced later as an
            unrelated error in the caller.
    """
    csv_path = os.path.join(RAW_DIR, f"{input_company_file}.csv")
    xlsx_path = os.path.join(RAW_DIR, f"{input_company_file}.xlsx")

    if os.path.isfile(csv_path):
        return pd.read_csv(csv_path)
    if os.path.isfile(xlsx_path):
        return pd.read_excel(xlsx_path)

    raise FileNotFoundError(
        f"Could not find the input company file; looked for {csv_path} and {xlsx_path}"
    )

In [4]:
def gen_searches():
    """Return the list of searches that still have to be run.

    Stale pickles are removed first. If a pickle for the current input file
    exists, the remaining searches are resumed from it; otherwise the full
    list is built from the company file and the source-code table.

    Each search is a string of the form
    ``'<search command><company>,<newspaper code>,<company>'``.

    Returns:
        list[str]: the searches to run, or
        str: a message telling the user the assignment is finished when the
            saved pickle holds an empty list.

    Raises:
        FileNotFoundError: if no pickle exists and the company input file
            cannot be found either.
    """
    delete_old_pickle()

    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # pickles that this notebook wrote itself.
        with open(PICKLE_OUTPATH, 'rb') as file:
            searches = pickle.load(file)
    except FileNotFoundError:
        # No saved progress: build the full search list from scratch.
        # The loader defined in this notebook is check_input_file; the
        # previously called read_input_file is not defined here.
        co_df = check_input_file()
        if co_df is None:
            raise FileNotFoundError(
                f"Could not find input company file '{input_company_file}' in {RAW_DIR}"
            )
        company_codes = co_df['factiva_company_code'].dropna().values
        companies = [str(code).lower() for code in company_codes]

        source_df = pd.read_csv(SOURCE_PATH)
        codes = source_df['Newspaper_code'].values
        commands = source_df['search command'].values
        return [
            f'{command}{company},{code},{company}'
            for company in companies
            for command, code in zip(commands, codes)
        ]

    if not searches:
        return 'You have completed this assignment. Please change the input file before running the program.'
    return searches

In [5]:
def all_none_dataframe(co_code, pub_code):
    """Build a zero-count frame covering every quarter from 1995 Q1 to 2020 Q2.

    Args:
        co_code: value written to every row of the 'company_code' column.
        pub_code: value written to every row of the 'pub_code' column.

    Returns:
        pandas.DataFrame: columns 'year', 'quarter', 'count', 'company_code',
        'pub_code'; four rows per year for 1995-2019 and two rows for 2020.
    """
    # 2020 only contributes its first two quarters.
    periods = [
        (yr, qtr)
        for yr in range(1995, 2021)
        for qtr in ((1, 2) if yr == 2020 else (1, 2, 3, 4))
    ]
    frame = pd.DataFrame({
        'year': [yr for yr, _ in periods],
        'quarter': [qtr for _, qtr in periods],
    })
    frame['count'] = 0
    frame['company_code'] = co_code
    frame['pub_code'] = pub_code
    return frame

In [6]:
def year_none_dataframe(co_code, pub_code, year):
    """Build a zero-count frame for the quarters of a single year.

    Args:
        co_code: value written to every row of the 'company_code' column.
        pub_code: value written to every row of the 'pub_code' column.
        year: the year to cover; years before 2020 get four quarters, any
            other year gets quarters 1 and 2 only.

    Returns:
        pandas.DataFrame: columns 'year', 'quarter', 'count', 'company_code',
        'pub_code', one row per quarter.
    """
    quarters = [1, 2, 3, 4] if year < 2020 else [1, 2]
    frame = pd.DataFrame({
        'year': [year] * len(quarters),
        'quarter': quarters,
    })
    frame['count'] = 0
    frame['company_code'] = co_code
    frame['pub_code'] = pub_code
    return frame

In [7]:
def get_year_info(driver, wait):
    """Walk every result page of the current search and tally the articles.

    Each page is parsed with BeautifulSoup; per-quarter counts, the running
    total and the article links are accumulated through ``get_page_info`` and
    duplicates through ``get_duplicates``. Paging continues while the page
    contains an ``<a class="nextItem">`` link.

    Args:
        driver: browser driver exposing ``page_source``.
        wait: wait helper passed through to the navigation functions.

    Returns:
        tuple: ``(counter, article_links)`` once the last page is processed and
            the totals reconcile, or
        str: an error message when ``total - duplicates`` does not equal the
            sum of the per-quarter counts.

    Note:
        ``ut_eid`` and ``eid_password`` are read from the notebook's globals
        when a fresh search page is requested after the last result page.
    """
    duplicates = 0
    counter = {'1': 0, '2': 0, '3': 0, '4': 0}
    total = 0
    article_links = []

    while driver:

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # attrs must be a dict; the former set literal {'class', 'nextItem'}
        # was treated as a list of class names and also matched class="class".
        has_next_page = soup.find('a', {'class': 'nextItem'}) is not None

        counter, total, article_links = get_page_info(soup, counter, total, article_links)
        duplicates += get_duplicates(soup)

        if has_next_page:
            next_page(driver, wait)
            continue

        # Last page: the quarter counts must equal the de-duplicated total.
        counter_total = sum(counter.values())
        if (total - duplicates) != counter_total:
            return 'Did not count duplicates properly; increase sleep time if necessary'

        get_new_page(driver, wait, ut_eid, eid_password)
        return counter, article_links

In [8]:
def _save_search_progress(frame, articles, remaining_searches):
    """Write one finished search to disk: article links, counts, remaining queue."""
    for name, article_list in articles.items():
        if not article_list:
            continue
        with open(os.path.join(ARTICLE_DIR, f'{name}.json'), 'w') as file:
            file.write('\n')
            json.dump(article_list, file)
            file.write('\n')
    # Only the first write to the CSV gets a header row.
    frame.to_csv(CSV_OUTPATH, mode='a', header=not os.path.exists(CSV_OUTPATH))
    with open(PICKLE_OUTPATH, 'wb') as file:
        pickle.dump(remaining_searches, file)


def get_all_frequencies(eid_username, eid_password, path):
    """Run every pending search and record quarterly article counts.

    For each search the whole 1995-01-01 to 2020-06-30 range is queried first.
    If the frequency chart reports nothing, an all-zero frame is saved.
    Otherwise each year present in the chart is searched on its own and its
    per-quarter counts and article links are collected; years absent from the
    chart get zero rows. After every search the counts are appended to
    CSV_OUTPATH, article links are written to ARTICLE_DIR and the list of
    remaining searches is pickled to PICKLE_OUTPATH so a later run can resume.

    Args:
        eid_username: login name passed to ``open_page``.
        eid_password: password passed to ``open_page``.
        path: path handed to ``set_driver``.

    Returns:
        None when every search was processed, or
        str: a message when there is nothing to do, when a year's counts
            could not be reconciled, or when saving failed.
    """
    searches = gen_searches()
    # gen_searches reports "assignment complete" as a plain message string.
    if isinstance(searches, str):
        return searches
    if not searches:
        return 'No searches were generated; check the input company file.'

    searches_pickle = searches.copy()
    driver, wait = set_driver(path)
    open_page(driver, wait, eid_username, eid_password)

    print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\nLength of the current list is {len(searches)}\n')

    for text in searches:

        # Split from the right so commas inside the search command are kept.
        search, pub_code, co_code = text.rsplit(',', 2)
        articles = {f'{co_code}_{pub_code}_{year}': [] for year in range(1995, 2021)}
        date_dict = {'frm': '01', 'frd': '01', 'fry': 1995, 'tom': '06', 'tod': '30', 'toy': 2020}
        enter_search(driver, wait, date_dict, search)
        results = check_frequency_chart(BeautifulSoup(driver.page_source, 'html.parser'))

        if results is None:
            frame = all_none_dataframe(co_code, pub_code)
            failure_message = 'Error when saving'
        else:
            yearly_frames = []
            for year in range(1995, 2021):

                if year not in results:
                    yearly_frames.append(year_none_dataframe(co_code, pub_code, year))
                    continue

                # 2020 is only covered through the end of June.
                end_month, end_day = ('12', '31') if year < 2020 else ('06', '30')
                date_dict = {'frm': '01', 'frd': '01', 'fry': year, 'tom': end_month, 'tod': end_day, 'toy': year}
                enter_search(driver, wait, date_dict, search)

                year_info = get_year_info(driver, wait)
                if isinstance(year_info, str):
                    # get_year_info signals a reconciliation failure with a message.
                    return year_info
                counter, article_links = year_info

                n_quarters = len(counter)
                yearly_frames.append(pd.DataFrame.from_dict({
                    'year': [year] * n_quarters,
                    'quarter': list(counter.keys()),
                    'count': list(counter.values()),
                    'company_code': [co_code] * n_quarters,
                    'pub_code': [pub_code] * n_quarters,
                }))
                articles[f'{co_code}_{pub_code}_{year}'].append(article_links)

            # One concat at the end instead of growing a frame inside the loop.
            frame = pd.concat(yearly_frames)
            failure_message = 'Error when saving.'

        try:
            searches_pickle.pop(0)
            _save_search_progress(frame, articles, searches_pickle)
        except Exception as exc:
            # Catch Exception only, so Ctrl-C still interrupts, and show the cause.
            print(f'Saving failed for search {text!r}: {exc!r}')
            return failure_message

In [9]:
get_all_frequencies(ut_eid, eid_password, driver_path)


Starting at: 2023-03-07 11:14:36.926585
First search term is (rst=sddz or rst=sudzeit or rst=sz) and fds=ninc,sfsddzp,ninc
Length of the current list is 380



TypeError: get_new_page() missing 2 required positional arguments: 'ut_eid' and 'eid_password'