In [1]:
from dotenv import load_dotenv
load_dotenv()
from datetime import datetime
import pickle, json, os
import pandas as pd
from parse import *
from bot import *


############## Enter your variables below ##############

# 1. output_name
#     - Name used for everything this run writes: the article-link JSON files go in
#       ../data/article_hrefs/<output_name>/ and the counts in ../data/clean/<output_name>.csv
#     - This will be the name of the file you will upload back to the project repository
output_name = "fixed_csv_folder"

# 2. input_company_file
#     - This will be the name of the csv file that contains company codes as a column
#     - This csv must contain a column called 'factiva_company_code'
#     - If you want to use another file, you need to adjust the gen_searches method
input_company_file = "STEP4_Factiva_company_list_No5301_5350_sector_Ishika.csv"

# 3. ut_eid
#     - This is the eid you use to login to the UT portal
ut_eid = "gal767"

# 4. eid_password
#     - This is your eid password
#     - If you put this in a dotenv this will simply be: os.getenv('eid_password')
#       (os.getenv returns None when the variable is not set)
#     - IF YOU ENTER YOUR PASSWORD DIRECTLY, DO NOT PUSH TO GITHUB
eid_password = os.getenv('eid_password')

# 5. driver_path
#     - This is the path where you stored chromedriver.exe
#     - The location of this file should have been chosen in the setup;
#       see steps on https://github.com/galongoria/factiva
driver_path = "C://Users//galon//cd_secure//chromedriver.exe"

# Below '#Open Chrome browser', the command will be different if you have a Mac.


############ Do not change anything else ##############


# Directories (all relative to the folder the notebook is run from)
_DATA_ROOT = "..//data"
ARTICLE_DIR = os.path.join(_DATA_ROOT, 'article_hrefs', output_name)
CLEAN_DIR = os.path.join(_DATA_ROOT, 'clean')
RAW_DIR = os.path.join(_DATA_ROOT, 'raw')
# Only the output folders are created; RAW_DIR holds the inputs and is left alone.
os.makedirs(ARTICLE_DIR, exist_ok=True)
os.makedirs(CLEAN_DIR, exist_ok=True)

# Inpaths
SOURCE_PATH = os.path.join(RAW_DIR, 'source_codes.csv')
CO_PATH = os.path.join(RAW_DIR, input_company_file)

# Outpaths
CSV_OUTPATH = os.path.join(CLEAN_DIR, f"{output_name}.csv")


In [2]:
def get_year_info(driver, wait):
    """Page through the current result list and accumulate its counts and links.

    Each pass parses ``driver.page_source``, feeds the parsed page to
    ``get_page_info`` and ``get_duplicates``, and moves on with ``next_page``
    for as long as the page contains an ``<a class="nextItem">`` link. On the
    last page the totals are cross-checked before ``get_new_page`` is called.

    Args:
        driver: browser driver whose ``page_source`` is parsed; also passed to
            ``next_page`` / ``get_new_page``.
        wait: passed through unchanged to ``next_page`` / ``get_new_page``.

    Returns:
        ``(counter, article_links)`` on success, where ``counter`` starts as
        ``{'1': 0, '2': 0}`` and ``article_links`` starts as an empty list;
        both are updated only by ``get_page_info``.
        If ``total - duplicates`` does not equal the sum of the counter values,
        an error message string is returned instead of a tuple, so callers
        must check the type before unpacking.
    """
    duplicates = 0
    counter = {'1': 0, '2': 0}
    total = 0
    article_links = []

    while driver:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # attrs has to be a dict. A set literal ({'class', 'nextItem'}) is read by
        # BeautifulSoup as "class is any of these values", which would also match
        # <a class="class">. Looked up once, before get_page_info touches the soup.
        has_next_page = soup.find('a', {'class': 'nextItem'}) is not None

        counter, total, article_links = get_page_info(soup, counter, total, article_links)

        if has_next_page:
            duplicates += get_duplicates(soup)
            next_page(driver, wait)
            continue

        # Last page: duplicates are only looked for when something was found.
        if total != 0:
            duplicates += get_duplicates(soup)

        if (total - duplicates) != sum(counter.values()):
            return 'Did not count duplicates properly; increase sleep time if necessary'

        get_new_page(driver, wait)
        return counter, article_links

In [3]:
def get_all_frequencies(eid_username, eid_password, path, searches):
    """Run every queued search for 2020-01-01..2020-06-30 and save results as it goes.

    Each entry of ``searches`` is ``"<search text>,<pub_code>,<co_code>"``.
    After every search this function:

    * writes the article links to ``ARTICLE_DIR/<co_code>_<pub_code>_2020.json``
      (skipped when the links are ``None``),
    * appends that search's rows (one per key of the counter) to ``CSV_OUTPATH``,
    * rewrites ``searches.pickle`` with the searches that are still pending, so
      an interrupted run can be resumed.

    Only the rows of the current search are appended. Appending a growing,
    cumulative frame would write the earlier rows to the CSV again on every pass.

    Args:
        eid_username: login name handed to ``open_page``.
        eid_password: password handed to ``open_page``.
        path: handed to ``set_driver``.
        searches: list of search strings in the format described above.

    Returns:
        ``None`` when every search was processed. Otherwise a message string:
        a notice that ``searches`` is empty, the error string produced by
        ``get_year_info``, or ``'Error when saving.'``.

    NOTE(review): the driver returned by ``set_driver`` is never closed here.
    """
    if not searches:
        # Nothing to do; also avoids starting a browser and failing on searches[0].
        return 'No searches to run; the search list is empty.'

    remaining = list(searches)
    driver, wait = set_driver(path)
    open_page(driver, wait, eid_username, eid_password)

    print(f'\nStarting at: {datetime.now()}\nFirst search term is {searches[0]}\nLength of the current list is {len(searches)}\n')

    for text in searches:

        # The two codes are always the last two fields, so split from the right;
        # this keeps a search text that itself contains commas intact.
        search, pub_code, co_code = text.rsplit(',', 2)

        # Built fresh for every search in case enter_search modifies it.
        date_dict = {'frm': '01', 'frd': '01', 'fry': 2020, 'tom': '06', 'tod': '30', 'toy': 2020}
        enter_search(driver, wait, date_dict, search)

        result = get_year_info(driver, wait)
        if isinstance(result, str):
            # get_year_info reports a failed count as a message, not a tuple.
            return result
        counter, article_links = result

        n_rows = len(counter)
        rows = pd.DataFrame({
            'year': [2020] * n_rows,
            'quarter': list(counter.keys()),
            'count': list(counter.values()),
            'company_code': [co_code] * n_rows,
            'pub_code': [pub_code] * n_rows,
        })

        try:
            if article_links is not None:
                with open(os.path.join(ARTICLE_DIR, f'{co_code}_{pub_code}_2020.json'), 'w+') as file:
                    file.write('\n')
                    json.dump(article_links, file)
                    file.write('\n')
            # The header is written only when the file does not exist yet.
            rows.to_csv(CSV_OUTPATH, mode='a', header=not os.path.exists(CSV_OUTPATH))
            remaining.pop(0)
            with open('searches.pickle', 'wb') as file:
                pickle.dump(remaining, file)
        except Exception as exc:
            # Show the real cause; the return value stays the same for callers.
            print(f'Saving results for {co_code}/{pub_code} failed: {exc!r}')
            return 'Error when saving.'

In [4]:
def fix_files():
    """Rebuild the 2020 search list from a file in ../data/2020_fix and run it.

    Reads ``../data/raw/source_codes.csv``, takes the first entry returned by
    ``os.listdir("../data/2020_fix")``, keeps its ``year == 2020`` rows, drops
    duplicate ``(company_code, pub_code)`` pairs and left-joins the source
    codes on ``pub_code == Newspaper_code``.

    If ``searches.pickle`` exists in the working directory, the pending
    searches stored there are used. Otherwise one search string
    ``"<search command><company_code>,<Newspaper_code>,<company_code>"`` is
    built per joined row. The result of ``get_all_frequencies`` is printed.

    Returns early, without starting a run, when there is nothing to search.

    NOTE(review): the loop stops after the first directory entry, so any other
    files in ../data/2020_fix are ignored; confirm this is intended.
    """
    command_df = pd.read_csv("..//data//raw//source_codes.csv")
    fix_dir = "..//data//2020_fix"

    for file in os.listdir(fix_dir):
        df = pd.read_csv(os.path.join(fix_dir, file))

        df_searches = (
            df.loc[df['year'] == 2020, :]
            .drop_duplicates(['company_code', 'pub_code'])
            .merge(command_df, how='left', left_on='pub_code', right_on='Newspaper_code')
        )

        try:
            # searches.pickle is the progress file written by get_all_frequencies.
            # pickle can execute code on load, so only use a file you created yourself.
            with open('searches.pickle', 'rb') as handle:
                searches = pickle.load(handle)
            if len(searches) == 0:
                print('Create a new pickle to manage searches; current list has been exhausted.')
                # Starting a run with an empty list would only fail on searches[0].
                return
        except FileNotFoundError:
            searches = [
                f"{pub}{co},{code},{co}"
                for pub, co, code in zip(
                    df_searches['search command'].values,
                    df_searches.company_code.values,
                    df_searches.Newspaper_code.values,
                )
            ]
            if not searches:
                print(f'No 2020 rows found in {file}; nothing to search.')
                return

        print(get_all_frequencies(ut_eid, eid_password, driver_path, searches))
        break

In [5]:
# Start the run; fix_files prints the value returned by get_all_frequencies.
fix_files()


Duo cookies still valid; proceeding to search page...

Starting at: 2023-02-21 21:38:48.276450
First search term is (rst=ftft or rst=ftcom) and fds=gamhol,sfftftp,gamhol
Length of the current list is 95

None
