In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import re
import json
import time

In [None]:
# Error Definitions
class MaxRecursionError(Exception):
    """Raised when a scraping call still fails after its retry budget is used up.

    Accepts the standard optional ``Exception`` arguments, so it can be raised
    bare (``raise MaxRecursionError``) or with a descriptive message.
    """
class ArticleNotFoundError(Exception):
    """Raised when a fetched page's title looks like a "not found" / error page.

    Accepts the standard optional ``Exception`` arguments, so it can be raised
    bare (``raise ArticleNotFoundError``) or with a descriptive message.
    """
class AccessDeniedError(Exception):
    """Raised when a fetched page's title is the site's "access denied" notice.

    Accepts the standard optional ``Exception`` arguments, so it can be raised
    bare (``raise AccessDeniedError``) or with a descriptive message.
    """

In [None]:
# API Client for webscraping article data
class APIClient:
    """Headless-Chrome scraper that collects news-article text for a search query.

    Typical use is ``APIClient().get_stock_data(tag, date, num_links)``, which
    runs a Google search, filters the result links and scrapes each article.

    Instance attributes set by ``__init__``:
        driver: the Selenium Chrome webdriver used for every page load.
        MAX_DEPTH: number of retries allowed after the first failed attempt.
        link_blacklist: regular-expression patterns (matched with ``re.search``,
            so ``.`` matches any character); links matching any are discarded.
    """

    # Seconds to sleep between retry attempts (throttling guard). Kept at class
    # level so a single instance can override it (e.g. 0 in tests).
    RETRY_DELAY_SECONDS = 1

    # Title fragments treated as "the article does not exist".
    # NOTE(review): 'Error' also matches genuine headlines containing that word.
    _NOT_FOUND_MARKERS = ('404', 'Not Found', 'Error', 'Page not found')
    # Exact page title treated as a denied request.
    _ACCESS_DENIED_TITLE = 'Access to this page has been denied.'
    # Redirect prefix stripped from the hrefs found on the search result page.
    _GOOGLE_REDIRECT_PREFIX = '/url?esrc=s&q=&rct=j&sa=U&url='

    def __str__(self):
        """Return a short label that includes the underlying driver object."""
        return f'Chrome Webscraper {self.driver}'

    def __init__(self):
        """Configure the retry/blacklist settings and start headless Chrome."""
        # recursion setup for auto-retry
        self.MAX_DEPTH = 3

        # blacklisted link patterns ie: yahoo finance
        self.link_blacklist = ['finance.yahoo','sec.gov','money.cnn','markets.businessinsider.com','google.com']

        # webdriver setup
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument('--disable-gpu')
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'
        chrome_options.add_argument('user-agent={0}'.format(user_agent))
        # chromedriver flags belong on the Service object; passing service_args
        # to webdriver.Chrome alongside service= is ignored by early Selenium 4
        # releases and rejected by later ones.
        service = Service(ChromeDriverManager().install(), service_args=["--verbose"])
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.driver.set_page_load_timeout(10)

    ##################################
    ##################################
    # API
    def get_stock_data(self,tag:str,date:str,num_links:int=25):
        """Search for news about ``tag`` on ``date`` and scrape every result.

        Args:
            tag: search term placed in the Google query (e.g. a ticker).
            date: date string placed after ``on:`` in the query.
            num_links: value passed as Google's ``num`` parameter.

        Returns:
            List of article dicts (see ``get_article_data``). Articles that
            could not be scraped are skipped.

        Raises:
            MaxRecursionError: if the search result page itself cannot be parsed.
        """
        articles = []
        for link in self.get_google_links(tag, date, num_links):
            try:
                articles.append(self.get_article_data(link))
            except Exception:
                # Best-effort: one bad article must not abort the whole batch.
                # (Exception, not a bare except, so Ctrl-C still interrupts.)
                continue
        return articles

    def get_google_links(self,tag:str,date:str,num_links:int=25,recursion_depth=1):
        """Return the filtered result links of a Google news search.

        ``tag`` and ``date`` are interpolated into the URL verbatim, so callers
        must pass values that are already URL-safe.

        Args:
            tag: search term.
            date: date string placed after ``on:`` in the query.
            num_links: value passed as Google's ``num`` parameter.
            recursion_depth: attempt counter to start from (1 = first attempt).
                Retries continue while the counter is <= ``MAX_DEPTH``.

        Returns:
            List of URLs as produced by ``get_links``.

        Raises:
            MaxRecursionError: if link extraction keeps failing; the last
                underlying error is attached as ``__cause__``.
        """
        url = "https://www.google.com/search?q={}+news+on%3A{}&num={}".format(tag,date,num_links)
        depth = recursion_depth
        while True:
            # The page is re-fetched on every attempt with the SAME url, so the
            # caller's num_links is preserved across retries.
            html = self.get_html_from_url(url)
            try:
                return self.get_links(html)
            except Exception as err:
                if depth > self.MAX_DEPTH: # depth check for successive retry
                    raise MaxRecursionError from err
                time.sleep(self.RETRY_DELAY_SECONDS) # prevent api throttling
                depth += 1

    def get_article_data(self,url:str,recursion_depth=1):
        """Scrape one article, retrying on any failure.

        Args:
            url: article URL.
            recursion_depth: attempt counter to start from (1 = first attempt).
                Retries continue while the counter is <= ``MAX_DEPTH``.

        Returns:
            Dict with keys ``'link'`` (the url), ``'title'`` and ``'text'``.

        Raises:
            MaxRecursionError: if every attempt failed. The last underlying
                error (e.g. ``ArticleNotFoundError``, ``AccessDeniedError``) is
                attached as ``__cause__`` so the real reason is not lost.
        """
        depth = recursion_depth
        while True:
            try:
                return self._scrape_article(url)
            except Exception as err:
                if depth > self.MAX_DEPTH: # depth check for successive retry
                    raise MaxRecursionError from err
                time.sleep(self.RETRY_DELAY_SECONDS) # prevent api throttling
                depth += 1

    def _scrape_article(self, url):
        """Perform a single fetch-and-parse attempt for ``get_article_data``."""
        html = self.get_html_from_url(url)
        title = self.get_title(html)
        if any(marker in title for marker in self._NOT_FOUND_MARKERS):
            raise ArticleNotFoundError
        if title == self._ACCESS_DENIED_TITLE:
            raise AccessDeniedError
        # 'link' doubles as the record's identifier.
        return {'link': url, 'title': title, 'text': self.get_text(html)}
    ##################################
    ##################################

    # Webdriver
    def get_html_from_url(self,url:str):
        """Load ``url`` in the driver and return the driver's page source.

        Errors raised by ``driver.get`` (e.g. the 10 s page-load timeout set in
        ``__init__``) are printed rather than re-raised, and whatever
        ``page_source`` the driver holds afterwards is returned.
        NOTE(review): after a failed load that source may be partial or belong
        to the previous page - confirm this best-effort behaviour is intended.
        """
        try:
            self.driver.get(url)
        except Exception as e:
            print(e)
        return self.driver.page_source

    # HTML Processing Helper functions
    def get_text(self,html:str):
        """Extract readable text from ``html``.

        Collects the text of every a/p/h/h1/h2 element, keeps fragments longer
        than 25 characters that do not contain the boilerplate phrases
        'Click here' or 'This Simple Trick', and returns them joined by single
        spaces after whitespace/apostrophe normalisation.
        """
        soup = BeautifulSoup(html, 'html.parser')
        fragments = (node.text for node in soup.find_all(['a','p','h','h1','h2']))
        kept = [
            fragment for fragment in fragments
            if len(fragment) > 25
            and 'Click here' not in fragment
            and 'This Simple Trick' not in fragment
        ]
        return self._normalize_text(' '.join(kept))

    @staticmethod
    def _normalize_text(text):
        """Swap straight apostrophes for ’ and collapse all whitespace runs."""
        # \s already covers newlines and non-breaking spaces, so one raw-string
        # regex replaces the earlier chain of individual replace() calls.
        return re.sub(r'\s+', ' ', text.replace('\'', '’')).strip()

    def get_title(self,html:str):
        """Return the text of the first ``<title>`` element in ``html``.

        Raises:
            IndexError: if the document has no ``<title>`` element. The retry
                loop in ``get_article_data`` treats that as a failed attempt.
        """
        soup = BeautifulSoup(html, 'html.parser')
        return soup.find_all('title')[0].text

    # NOTE: a JSON-LD based get_metadata() helper (author / publish date /
    # description / transcript) used to live here as commented-out code. It was
    # abandoned because the metadata layout is too inconsistent between sites;
    # recover it from version control if it is needed again.

    def get_links(self,html:str):
        """Return every usable ``https://`` link found in ``html``.

        The Google redirect prefix is stripped from each href; hrefs that do not
        then start with ``https://`` or that match a blacklist pattern are
        dropped. No leading results are skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = []
        # href=True skips anchors without an href attribute up front, replacing
        # the blanket try/except that used to hide every error in this loop.
        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].replace(self._GOOGLE_REDIRECT_PREFIX, '')
            if href.startswith('https://') and not self._is_blacklisted(href):
                links.append(href)
        return links

    def _is_blacklisted(self, link):
        """Return True if ``link`` matches any pattern in ``link_blacklist``.

        An empty blacklist blocks nothing.
        """
        return any(re.search(pattern, link) for pattern in self.link_blacklist)

    def __del__(self):
        """Shut the browser down when the client is garbage-collected."""
        # __init__ may have failed before self.driver was assigned.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            # quit() ends the whole session and stops the chromedriver process;
            # close() would only close the current window.
            driver.quit()
        except Exception:
            # The browser may already be gone (e.g. at interpreter shutdown);
            # nothing useful can be done from a finalizer.
            pass


In [None]:
# Create the scraper client; its constructor starts a headless Chrome webdriver.
api = APIClient()

In [None]:
# Print the client's string form (APIClient.__str__ embeds the driver object)
# as a quick check that the webdriver was created.
print(api)

In [None]:
# Run the full pipeline for the query 'tsla' on 2023-01-03, passing num=50 to the
# Google search; articles that cannot be scraped are left out of the result.
res = api.get_stock_data('tsla','2023-01-03',50)

In [None]:
# Display the scraped article records (dicts with 'link', 'title' and 'text').
res

# TESTING

In [None]:
# Sample article URLs kept for manual experiments (currently unused).
#url = "https://www.bloomberg.com/news/articles/2023-02-17/summers-says-too-soon-to-call-for-march-50-basis-point-fed-hike?srnd=premium"
#url = "https://www.bloomberg.com/news/articles/2023-02-18/cars-tires-textile-factories-have-shut-in-crisis-hit-pakistan?srnd=industries-v2"

# Exercise the link-collection step on its own: Google results for 'tsla' on
# 2022-02-01 with num=25, after the client's blacklist filtering.
testlinks = api.get_google_links('tsla','2022-02-01',25)
testlinks

In [None]:
# Spot-check article scraping on one result (the fifth link in testlinks).
# NOTE(review): this rebinds `res`, which an earlier cell used for the list
# returned by get_stock_data; and testlinks[4] assumes at least five links.
res = api.get_article_data(testlinks[4])
res

In [None]:
# Fetch the raw page source of the first collected link for manual inspection.
html = api.get_html_from_url(testlinks[0])

In [None]:
# Extract the article text through the client's own helper rather than a
# copy-pasted duplicate of its body, so this check always exercises exactly the
# logic the scraper uses.
text = api.get_text(html)
text