# In [None]:  (cell marker left over from the Jupyter notebook export)
"""
DATE: 8/29/2020
AUTHOR: DANIEL WU
PURPOSE: This file contains the Selenium procedures for scraping Factiva:
         logging in, entering the query, date range and company name,
         running the search, and saving the returned articles.
LAST UPDATE:
"""

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from getpass import getpass

import os
import time
import datetime
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import importlib
import import_ipynb
import factiva_word_storage as mod1
importlib.reload(mod1) 
import factiva_word_storage as mod1

def pause():
    """Block until the operator answers the console prompt.

    Returns:
        The text typed before <ENTER>; an empty string if the operator
        pressed <ENTER> straight away.
    """
    prompt = "Press the <ENTER> key to continue or type any letter to break loop"
    return input(prompt)

# Run the command below after editing the word list interactively so the changes are picked up
# importlib.reload(mod1) 

# This function is unnecessary when logging onto Factiva manually
def log_onto_factiva(driver, usrnm, pw):
    """Search the current page for Factiva, open it, and submit the login form.

    Steps performed on ``driver``:
      1. Type ``'Factiva'`` into the element with id ``edit-search`` and click
         the ``.icon--search`` button.
      2. Wait up to 30 s for the first database result link and click it.
      3. Switch to the newest window/tab if more than one is open.
      4. Fill the ``username`` / ``password`` fields and click the login button.
      5. If the second ``<h1>`` on the resulting page reads
         ``'Please sign on again.'``, print a message and quit the driver.

    Args:
        driver: Selenium WebDriver already showing the page that holds the
            ``edit-search`` box.
        usrnm: User name typed into the ``username`` field.
        pw: Password typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.TimeoutException: If the Factiva result
            link does not appear within 30 seconds.
        selenium.common.exceptions.NoSuchElementException: If one of the
            expected form elements is missing.
    """
    # Imported locally: the file's top-level imports only provide
    # TimeoutException.
    from selenium.common.exceptions import WebDriverException

    search_box = driver.find_element(By.ID, 'edit-search')
    search_box.clear()
    search_box.send_keys('Factiva')
    driver.find_element(By.CSS_SELECTOR, '.icon--search').click()

    factiva_xpath = '//*[@id="Databases"]/div/div/ul/li[1]/article/div[2]/h3/a'
    factiva_link = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, factiva_xpath))
    )
    # Short pause between the link appearing and clicking it (kept from the
    # original procedure).
    time.sleep(2)
    factiva_link.click()

    # If a new tab gets created, switch focus to the newest tab.
    if len(driver.window_handles) > 1:
        driver.switch_to.window(driver.window_handles[-1])

    for field_id, value in (('username', usrnm), ('password', pw)):
        field = driver.find_element(By.ID, field_id)
        field.clear()
        field.send_keys(value)
    driver.find_element(By.XPATH, '//*[@id="loginform"]/div[4]/button').click()

    # Stale logon: the page asks the user to sign on again.
    try:
        headings = driver.find_elements(By.TAG_NAME, 'h1')
        if len(headings) > 1 and headings[1].text == 'Please sign on again.':
            print("You timed out. Try again from the beginning.")
            driver.quit()
    except WebDriverException:
        # Best-effort check only: the page may still be navigating after the
        # login click, so a failed lookup here is not treated as an error.
        pass

def send_query(driver, factiva_query, expected_cursor_x=180):
    """Type ``factiva_query`` into the query editor of the newest window.

    Before typing, the x-coordinate of the ``ace_text-input`` element is
    compared with ``expected_cursor_x``. If they differ, the operator is asked
    to clear the query box manually and the function waits on ``pause()``.

    Args:
        driver: Selenium WebDriver showing the Factiva search page.
        factiva_query: Query text to send to the editor.
        expected_cursor_x: Horizontal position of the text input that is
            treated as "query box is empty". Defaults to 180, the value the
            original code hard-coded.
            NOTE(review): presumably layout-dependent (window size / zoom) --
            confirm for other setups.
    """
    if len(driver.window_handles) > 1:
        driver.switch_to.window(driver.window_handles[-1])

    query_locator = (By.CLASS_NAME, 'ace_text-input')
    if driver.find_element(*query_locator).location['x'] != expected_cursor_x:
        print('Manually clear query before proceeding.')
        pause()

    # Look the element up again: the operator may have interacted with the
    # editor while the script was paused.
    driver.find_element(*query_locator).send_keys(factiva_query)
    
# Scraping Loop
def enter_date(driver, func_start_index, func_index, func_row, func_next_3, func_back_30):
    """Fill the manual date-range fields around the row's issue date.

    The range runs from ``func_row['IssueDate'] - func_back_30`` to
    ``func_row['IssueDate'] + func_next_3``.

    Args:
        driver: Selenium WebDriver showing the Factiva search form.
        func_start_index: Index of the first row processed in this session.
        func_index: Index of the row currently being processed. Only when it
            equals ``func_start_index`` is the date-range dropdown switched to
            manual entry (option 10 of the ``dr`` select).
        func_row: Mapping/Series with an ``'IssueDate'`` entry that supports
            date arithmetic and exposes ``year``/``month``/``day``.
        func_next_3: Offset added to the issue date to get the end date.
        func_back_30: Offset subtracted from the issue date to get the start
            date.

    Raises:
        selenium.common.exceptions.TimeoutException: If the ``fry`` field is
            not clickable within 4 seconds.
    """
    start_date = func_row['IssueDate'] - func_back_30
    end_date = func_row['IssueDate'] + func_next_3

    if func_index == func_start_index:
        driver.find_element(By.ID, 'dr').click()
        # Date range: manual entry.
        driver.find_element(By.XPATH, '//*[@id="dr"]/option[10]').click()

    WebDriverWait(driver, 4).until(EC.element_to_be_clickable((By.ID, 'fry')))
    _fill_date_fields(driver, 'fr', start_date)
    _fill_date_fields(driver, 'to', end_date)


def _fill_date_fields(driver, prefix, date):
    """Clear, then fill, the ``<prefix>y`` / ``<prefix>m`` / ``<prefix>d`` inputs."""
    fields = [
        (driver.find_element(By.ID, prefix + suffix), value)
        for suffix, value in (('y', date.year), ('m', date.month), ('d', date.day))
    ]
    # Clear all three inputs before typing into any of them, matching the
    # order of operations of the original procedure.
    for field, _ in fields:
        field.clear()
    for field, value in fields:
        field.send_keys(str(value))

# Search company procedure - currently only called from enter_company_name() (up to three attempts per company)
def search_company(driver, co_name):
    """Type ``co_name`` into the company box and click the first suggestion.

    If the ``coTxt`` input cannot be used directly, the company tab header
    (``//*[@id="coTab"]/div[1]``) is clicked first and typing is retried once.
    The text of the chosen suggestion is printed in lower case.

    Args:
        driver: Selenium WebDriver showing the Factiva search form.
        co_name: Company name to search for.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``ac_descriptor``
            suggestion becomes clickable within 5 seconds. Callers use this to
            detect "no search result".
    """
    # Imported locally: the file's top-level imports only provide
    # TimeoutException.
    from selenium.common.exceptions import WebDriverException

    try:
        _type_company_name(driver, co_name)
    except WebDriverException:
        # The input is not usable yet; open the company tab and try again.
        driver.find_element(By.XPATH, '//*[@id="coTab"]/div[1]').click()
        _type_company_name(driver, co_name)

    first_result = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'ac_descriptor')))
    print(f'found {first_result.text.lower()}')
    first_result.click()


def _type_company_name(driver, co_name):
    """Replace the contents of the ``coTxt`` input with ``co_name``."""
    company_box = driver.find_element(By.ID, 'coTxt')
    company_box.clear()
    company_box.send_keys(co_name)
        
def enter_company_name(driver, func_row):
    """Select the company named in ``func_row['conm']`` on the search form.

    Up to three searches are attempted, each only if the previous one timed
    out:
      1. the name exactly as given;
      2. the name with tokens found in ``mod1.AbbrevDict`` replaced;
      3. the result of step 2 with tokens found in ``mod1.AbbrevDict2``
         replaced.

    Args:
        driver: Selenium WebDriver showing the Factiva search form.
        func_row: Mapping/Series with a ``'conm'`` entry (company name).

    Returns:
        True if one of the searches produced a clickable suggestion, False if
        all three timed out (a message is printed in that case).
    """
    co_name = func_row['conm']

    # The first suggestion is taken as the most accurate string match.
    # Alternate locator for the suggestion, kept for reference:
    #   '/html/body/div[12]/table/tbody/tr[1]/td/div/strong'
    for abbreviations in (None, mod1.AbbrevDict, mod1.AbbrevDict2):
        if abbreviations is not None:
            co_name = _expand_abbreviations(co_name, abbreviations)
        try:
            search_company(driver, co_name)
        except TimeoutException:
            continue
        return True

    print(f'no search result for {co_name}')
    return False


def _expand_abbreviations(co_name, abbreviations):
    """Return ``co_name`` with every token that is a key of ``abbreviations`` replaced."""
    for word in word_tokenize(co_name):
        if word in abbreviations:
            # str.replace substitutes every occurrence of the token text,
            # exactly as the original procedure did.
            co_name = co_name.replace(word, abbreviations[word])
    return co_name
            
def hit_search(driver):
    """Click the search button (element id ``btnSBSearch``)."""
    driver.find_element(By.ID, 'btnSBSearch').click()
    
def get_articles(driver, working_dir, articles_folder, co_df, cusip, index, excel_index, json_file, run_no):
    """Open every headline on the results page and record each article.

    For each non-empty headline the article is opened, its date, time and body
    are stored in ``json_file[excel_index][report_no]`` (keys ``'headline'``,
    ``'date'``, ``'time'``, ``'body'``), the text of the first ``.article>div``
    element is written to a file in ``articles_folder``, and the browser is
    returned to the headline list.

    Args:
        driver: Selenium WebDriver showing a Factiva results page.
        working_dir: Unused; kept so existing callers keep working. The
            original staged a temporary file here before moving it.
        articles_folder: Directory that receives one text file per article,
            named ``obs{excel_index}_file{report_no}_{cusip}_{date}``.
        co_df: pandas DataFrame; the ``'Scrape Comments'`` and
            ``'Articles Scraped'`` cells of row ``index`` are updated.
        cusip: Identifier used only in the output file name.
        index: Row label of ``co_df`` for the current observation.
        excel_index: Key into ``json_file`` for the current observation;
            ``json_file[excel_index]`` must already be a dict.
        json_file: Dict of per-observation dicts that collects the articles.
        run_no: 1-based run counter; article numbering starts at
            ``100 * (run_no - 1) + 1``.
            NOTE(review): presumably 100 headlines per results page -- confirm.

    Returns:
        ``json_file[excel_index]`` after the articles have been added.
    """
    time.sleep(2)

    headline_locator = (By.CLASS_NAME, 'enHeadline')
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(headline_locator))
    except TimeoutException:
        # No headline appeared: record that the search returned nothing.
        # .loc writes into co_df itself; chained co_df[col][index] assignment
        # can end up writing to a temporary copy.
        co_df.loc[index, 'Scrape Comments'] = 'No articles returned in search'
        co_df.loc[index, 'Articles Scraped'] = 0
        return json_file[excel_index]

    # Give the list time to finish loading before counting the headlines.
    time.sleep(3)
    number_of_reports = len(driver.find_elements(*headline_locator))

    article_store = json_file[excel_index]
    report_no = 100 * (run_no - 1)

    # Headlines are looked up again by position on every pass because the
    # page is re-rendered each time the browser returns from an article.
    for position in range(number_of_reports):
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(headline_locator))

        headline = driver.find_elements(*headline_locator)[position].text
        if headline == '':
            continue

        report_no += 1
        record = {'headline': headline, 'date': '', 'time': '', 'body': ''}
        article_store[report_no] = record

        # Headline click - retried with growing pauses if the page is slow.
        _click_with_retry(
            lambda: driver.find_elements(*headline_locator)[position], (3, 5))
        time.sleep(2)

        # NOTE: an earlier version used Factiva's own download menu at this
        # point; the article text is now read from the page instead.
        meta_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.article>div')))

        # Date: the first div whose text looks like "29 August 2020".
        date_clean = ''
        parsed_date = _first_parsed(meta_divs, ('%d %B %Y',))
        if parsed_date is not None:
            date_clean = '{}{:02}{:02}'.format(
                parsed_date.year, parsed_date.month, parsed_date.day)
            record['date'] = date_clean

        time.sleep(2)

        # Time: %p only affects the hour when combined with %I, so the
        # 12-hour pattern is tried first; the original pattern is the fallback.
        parsed_time = _first_parsed(meta_divs, ('%I:%M %p', '%H:%M %p'))
        if parsed_time is not None:
            record['time'] = parsed_time.time().isoformat()

        time.sleep(2)

        # Body: every paragraph is prefixed with the separator, which yields
        # the same string the original incremental join produced.
        paragraphs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.article>p')))
        record['body'] = ''.join(f'\n \n{paragraph.text}' for paragraph in paragraphs)

        # Write the text of the first article div straight to its final
        # location. An article without a parseable date gets an explicit
        # marker instead of a stale date or a NameError.
        date_tag = date_clean or 'nodate'
        new_name = f'obs{excel_index}_file{report_no}_{cusip}_{date_tag}'
        with open(os.path.join(articles_folder, new_name), 'w', encoding='utf-8') as out_file:
            out_file.write(meta_divs[0].text)

        # Pacing pauses kept from the original procedure.
        time.sleep(2)
        time.sleep(2)

        _click_with_retry(
            lambda: driver.find_element(By.XPATH, '//*[@id="returnToHeadlines"]/a'), (3,))

    co_df.loc[index, 'Articles Scraped'] = report_no

    return article_store


def _click_with_retry(find_target, retry_delays):
    """Click the element returned by ``find_target()``, retrying after each delay.

    One attempt is made per entry in ``retry_delays`` (sleeping that many
    seconds after a failure), then a final attempt whose exception propagates.
    """
    # Imported locally: the file's top-level imports only provide
    # TimeoutException.
    from selenium.common.exceptions import WebDriverException

    for delay in retry_delays:
        try:
            find_target().click()
            return
        except (WebDriverException, IndexError):
            time.sleep(delay)
    find_target().click()


def _first_parsed(elements, formats):
    """Return the first datetime parsed from any element's text, or None.

    Each element's text is tried against every pattern in ``formats`` in
    order; elements that cannot be read or do not match are skipped.
    """
    from selenium.common.exceptions import WebDriverException

    for element in elements:
        try:
            text = element.text
        except WebDriverException:
            continue
        for fmt in formats:
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                continue
    return None
    
def hit_next_search(driver):
    """Wait up to 20 s for the ``btnModifySearch`` button, then click it.

    Raises:
        selenium.common.exceptions.TimeoutException: If the button does not
            become clickable in time.
    """
    # until() returns the element once it is clickable, so no second lookup
    # is needed.
    modify_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.ID, 'btnModifySearch')))
    modify_button.click()
    
def clear_prev_search(driver):
    """Remove the previously selected company from the search form.

    Clicks the entry in the company list (``coLst``) and then the second item
    of the context menu (``copillscontextmenu``) that the first click opens,
    waiting up to 20 s for each to become clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: If either element does
            not become clickable in time.
    """
    click_sequence = (
        '//*[@id="coLst"]/div/ul/li/div/div/span',
        '//*[@id="copillscontextmenu"]/div/div[2]/span',
    )
    for xpath in click_sequence:
        # until() returns the element once it is clickable.
        target = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        target.click()