In [None]:
import re
import pandas as pd
import os

from RPA.Browser.Selenium import Selenium
from selenium.common.exceptions import ElementClickInterceptedException
from time import sleep
from robot.api import logger
from datetime import datetime, timedelta

from utils import last_date_months, last_date
from get_infos import get_infos

# --- Configuration -----------------------------------------------------------
# Phrase typed into the NYT search box.
search_term = "billion dollar"
# Labels of the section filters to tick in the section dropdown.
sections = ['Books', 'Business']
# Forwarded to last_date_months() / get_infos() to derive the oldest date of interest.
months = 0

# --- Open the NY Times site and run the search --------------------------------
browser = Selenium()
browser.open_available_browser("https://www.nytimes.com")

# Reject the terms prompt before interacting with the page.
reject_button_xpath = "xpath://button[@data-testid='Reject all-btn']"
browser.wait_until_element_is_visible(reject_button_xpath)
sleep(1)  # short pause between the button becoming visible and the click
browser.click_button(reject_button_xpath)

# Open the search box, type the term and submit it.
search_button_xpath = "xpath://button[@data-testid='search-button']"
browser.wait_until_element_is_enabled(search_button_xpath)
browser.click_button(search_button_xpath)

search_input_xpath = "xpath://input[@data-testid='search-input']"
browser.wait_until_element_is_enabled(search_input_xpath)
browser.input_text(search_input_xpath, search_term)
browser.press_keys(search_input_xpath, "ENTER")

# Open the section dropdown.
section_button_xpath = "xpath://div[@data-testid='section']//button"
browser.wait_until_element_is_enabled(section_button_xpath)
browser.click_button(section_button_xpath)

# Tick every requested section; a missing section is logged, not fatal.
for section in sections:
    section_xpath = f"xpath:*//ul[@data-testid='multi-select-dropdown-list']//li//label//span[contains(text(), '{section}')]"
    try:
        browser.click_element(section_xpath)
    except Exception as e:
        logger.error(f"Failed to choose section {section}. erro {e}")

# Sort the results newest-first.
newest_news_xpath = "xpath://option[contains(text(), 'Sort by Newest')]"
browser.click_element(newest_news_xpath)

sleep(1)

# --- Load results until the date cutoff is reached ----------------------------
show_more_xpath = "xpath://button[@data-testid='search-show-more-button']"
for _ in range(200):  # hard cap on the number of "show more" clicks
    last_date_value = last_date(browser)
    cutoff_date = last_date_months(months)
    if isinstance(last_date_value, bool):
        # last_date() handed back a bool instead of a date: use a value just
        # past the cutoff so that paging continues.
        last_date_value = cutoff_date + timedelta(days=1)
    if not (cutoff_date < last_date_value):
        break
    try:
        browser.wait_until_element_is_visible(show_more_xpath)
    except Exception as e:
        # The wait fails when the button never shows up (e.g. every result is
        # already listed); stop paging instead of aborting the whole run.
        logger.info(f"No more results to load, stopping pagination: {e}")
        break
    browser.scroll_element_into_view(show_more_xpath)
    if browser.is_element_visible(show_more_xpath):
        browser.click_button(show_more_xpath)


# --- Extract, download images and export --------------------------------------
df_infos = get_infos(browser, search_term, months)

# NOTE: download_images is defined in a later cell of this notebook, so that
# cell has to be executed before this line can run on a fresh kernel.
download_images(browser, df_infos)

# Rebind instead of mutating in place; the image URL column is not exported.
df_infos = df_infos.drop(columns='image_src')
df_infos.to_excel("nyt_news_info.xlsx", index=False)

In [None]:
# Preview the scraped results. The original cell referenced `df`, a name that is
# never defined in this notebook; `df_infos` is assumed to be what was meant.
df_infos.head()

In [None]:
#extracting data from the news

# Matches "$1,234.56", "12 dollars" or "12 USD". IGNORECASE matters because the
# text that is searched has been lower-cased.
_MONEY_PATTERN = re.compile(
    r'\$(\d+\.\d+|\d+(,\d+)*(\.\d+)?)|(\d+)\s*dollars|\d+\s*USD',
    re.IGNORECASE,
)
# Relative timestamps such as "5m ago" / "3h ago".
_RELATIVE_AGE_PATTERN = re.compile(r'(\d+)(m|h) ago')
# Absolute date layouts that are tried in order. Only the first one was handled
# originally; the other two cover month names written without a trailing period
# and in full (assumed to occur on the site - TODO confirm).
_DATE_FORMATS = ("%b. %d, %Y", "%b %d, %Y", "%B %d, %Y")


def _first_group(pattern, text, default=None):
    """Return group 1 of the first match of ``pattern`` in ``text``, else ``default``."""
    match = re.search(pattern, text)
    return match.group(1) if match else default


def _parse_news_date(date_text, now):
    """Convert a result's date label into a ``datetime``; ``None`` if unrecognised."""
    if 'ago' in date_text:
        match = _RELATIVE_AGE_PATTERN.search(date_text)
        if match is None:
            return None
        amount = int(match.group(1))
        delta = timedelta(minutes=amount) if match.group(2) == "m" else timedelta(hours=amount)
        return now - delta

    # "Sept." is not an abbreviation that %b understands.
    normalized = date_text.strip().replace("Sept.", "Sep.")
    # Labels without a year are retried with the current year appended.
    for candidate in (normalized, f"{normalized}, {now.year}"):
        for date_format in _DATE_FORMATS:
            try:
                return datetime.strptime(candidate, date_format)
            except ValueError:
                continue
    return None


def extract_from_news(browser):
    """Collect title, date, description and image data from the search results.

    Reads the module-level ``search_term`` and ``months`` variables and calls
    ``last_date_months(months)`` to obtain the cutoff date: iteration stops at
    the first result whose parsed date is older than that cutoff.

    Args:
        browser: object exposing ``find_elements(locator)``; every returned
            element must provide ``get_attribute("outerHTML")`` and ``text``.

    Returns:
        pandas.DataFrame with the columns ``title``, ``date``, ``description``,
        ``picture_filename``, ``amount_search_phrases``, ``amount_of_money``,
        ``image_src`` and ``image_filename`` (one row per kept result).
    """
    list_news_xpath = "xpath://ol[@data-testid='search-results']//li"
    columns = [
        "title",
        "date",
        "description",
        "picture_filename",
        "amount_search_phrases",
        "amount_of_money",
        "image_src",
        "image_filename",
    ]
    now = datetime.now()
    term = search_term.lower()
    records = []

    for item in browser.find_elements(list_news_xpath):
        news_html = item.get_attribute("outerHTML")
        if re.search(r'SKIP ADVERTISEMENT', news_html):
            continue

        # Reset per item so a failure can never reuse the previous item's date.
        date_str = ""
        try:
            # The first line of the element text carries the date label.
            date_str = item.text.split("\n", 1)[0]
            news_date = _parse_news_date(date_str, now)
            if news_date is None:
                logger.error(f"Failed to load this News. Unrecognized date: {date_str!r}")
            elif last_date_months(months) > news_date:
                # Older than the cutoff: stop scanning the remaining results.
                break
        except Exception as e:
            logger.error(f"Failed to load this News. Erro: {e}")

        # Missing markup yields an empty string instead of aborting the run.
        title = _first_group(r'<h4.*?>(.*?)<\/h4>', news_html, default="")
        description = _first_group(r'<\/h4><p.*?>(.*?)<\/p>', news_html, default="")

        image_name = _first_group(r'<img alt="(.*?)"', news_html)
        if image_name is None:
            logger.error("Image name not found.")
        if not image_name:
            image_name = "image_name_not_found"

        src_image = _first_group(r'src="(.*?)"', news_html, default="link_not_found")

        # Title and description are inspected separately so that gluing them
        # together cannot fabricate a match across the boundary.
        count_search_phrase = title.lower().count(term) + description.lower().count(term)
        amount_money = any(
            _MONEY_PATTERN.search(text.lower()) for text in (title, description)
        )

        records.append({
            "title": title,
            "date": date_str,
            "description": description,
            "picture_filename": image_name,
            "amount_search_phrases": count_search_phrase,
            "amount_of_money": amount_money,
            "image_src": src_image,
            # Sequential name based on the number of rows kept so far.
            "image_filename": f"image_{len(records) + 1}",
        })

    return pd.DataFrame(records, columns=columns)
    
    

    
    

In [None]:

def download_images(browser, df_infos, pictures_dir="pictures"):
    """Save one PNG per row of ``df_infos`` into ``pictures_dir``.

    For each row the browser is sent to ``image_src`` and a screenshot of the
    element matched by ``tag:img`` is written to
    ``<pictures_dir>/<image_filename>.png``. A failure on one image is logged
    and does not stop the remaining ones.

    Args:
        browser: object exposing ``go_to(url)`` and
            ``capture_element_screenshot(locator, path)``.
        df_infos: DataFrame with the columns ``image_src`` and ``image_filename``.
        pictures_dir: target directory, created when missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(pictures_dir, exist_ok=True)

    images = df_infos[['image_src', 'image_filename']]
    for image_src, image_filename in images.itertuples(index=False, name=None):
        # "link_not_found" is the placeholder stored when a result has no image
        # URL; there is nothing to open for it.
        if not image_src or image_src == "link_not_found":
            logger.warn(f"No image source for {image_filename}; skipping.")
            continue
        try:
            browser.go_to(image_src)
            image_path = os.path.join(pictures_dir, f"{image_filename}.png")
            browser.capture_element_screenshot("tag:img", image_path)
        except Exception as e:
            logger.error(f"Failed to save image: {image_filename}. Error: {e}")
            

In [None]:
# Display the scraped results; `df_infos` is created by the scraping cell at the
# top of the notebook, so that cell must have run first.
df_infos

In [None]:
import requests

In [None]:
# Create a work item for the "nyt_news" process through the Robocorp API.
# The workspace key is read from the ROBOCORP_API_KEY environment variable so it
# never has to be written into the notebook. The original header was a plain
# string, which sent the literal text "{api_key}" instead of a key.
api_key = os.environ["ROBOCORP_API_KEY"]

create_response = requests.post(
    "https://cloud.robocorp.com/api/v1/workspaces/699152e3-4b14-405e-9129-8247d2c9df1b/work-items",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"RC-WSKEY {api_key}",
    },
    json={
        "process": {"id": "nyt_news"},
        "payload": {
            "search_term": "billion dollar",
            "sections": ['Books', 'Business'],
            "months": 0,
        },
    },
    timeout=30,  # do not let the cell hang forever on a stalled connection
)
create_response

In [None]:
# Register a file on an existing work item through the Robocorp API.
# SECURITY: a workspace API key used to be hardcoded in this cell. It has been
# removed; that key should be treated as compromised and rotated. The key is now
# read from the ROBOCORP_API_KEY environment variable.
api_key = os.environ["ROBOCORP_API_KEY"]
# The original URL contained the literal text "{work_item_id}" because it was
# never formatted; the id is now taken from ROBOCORP_WORK_ITEM_ID.
work_item_id = os.environ["ROBOCORP_WORK_ITEM_ID"]

upload_response = requests.post(
    f"https://cloud.robocorp.com/api/v1/workspaces/699152e3-4b14-405e-9129-8247d2c9df1b/work-items/{work_item_id}/files",
    headers={
        "Content-Type": "application/json",
        "Authorization": f"RC-WSKEY {api_key}",
    },
    json={"file_name": "work_items", "file_size": 1},
    timeout=30,  # do not let the cell hang forever on a stalled connection
)
upload_response