In [2]:
import json
import pandas as pd
from newspaper import Article
from newspaper import Config
import logging
import os
from bs4 import BeautifulSoup
from timeit import default_timer as timer
import time
import requests
import numpy as np
import datetime
from dateutil.parser import parse
import neuralcoref
from tqdm import tqdm
from transformers import BertTokenizerFast, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from collections import defaultdict
from flair.models import TextClassifier
from flair.data import Sentence
from segtok.segmenter import split_single
import re
import spacy
from spacy import displacy
import en_core_web_lg
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings("ignore")
analyzer = SentimentIntensityAnalyzer()
nlp = en_core_web_lg.load()
neuralcoref.add_to_pipe(nlp)
classifier = TextClassifier.load('en-sentiment')
device = torch.device('cpu')

2021-02-28 17:26:03,326 loading file C:\Users\97254\.flair\models\sentiment-en-mix-distillbert_3.1.pt


In [3]:
#Configurations
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"}
config = Config()
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36"
config.browser_user_agent = user_agent
logging.basicConfig(filename='Finviz_log.txt',
                   filemode='a',
                   format='%(asctime)s,%(msecs)d %(levelname)s %(message)s',
                   datefmt='%H:%M:%S',
                   level=logging.INFO)
base_path= r'C:\Users\97254\anaconda3\finbert'

In [4]:
"""Articles Functions"""
def get_article(ticker):
    baseURL=f"https://finviz.com/quote.ashx?t={ticker}"
    response = requests.get(baseURL,headers=headers)
    article_elements = BeautifulSoup(response.content.decode(),features='html.parser').find_all('a',{"class":"tab-link-news"},href=True)
    if article_elements:
        return [element['href'] for element in article_elements]
    else:
        return []
    
def get_article_headline(soup,articleURL):
    try:
        text = soup.find('h1').get_text(separator=" ")
        headln = re.sub('[ג€™]|[ג]|[\x9c]|[\xa0]|[\n]', '', text)
    except Exception as e:
        logging.info(f"Failed: {e}-- Could not find Headline |{articleURL}")
        headln = ''
    return headln

def get_article_date(soup,articleURL):
    for i in soup.findAll('time'):
        if i.has_attr('datetime'):
            return i['datetime'][:10]
    return 'None'

def get_article_record_for_URL(articleURL,ticker):
    response = requests.get(articleURL,headers=headers)
    soup = BeautifulSoup(response.content.decode(),features='html.parser')
    headln = get_article_headline(soup,articleURL)
    article = Article(articleURL, config=config)
    try:
        article.download()
        article.parse()
    except Exception:
        return {}
    try:
        pubdate = str(article.publish_date.date())
    except Exception:
        pubdate = get_article_date(soup,articleURL)
    if pubdate == None or pubdate =='':
        pubdate = get_article_date(soup,articleURL)
    txt = article.text
    if not txt:
        logging.info(f"Failed: Could not scrape article | {articleURL}")
        return {}
    return {"Headline":headln,'Date':str(pubdate),'Text':txt}

"""Misc Functions"""
def get_symbol(ticker):
    url = "http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={}&region=1&lang=en.json".format(ticker)
    result = requests.get(url).json()
    for x in result['ResultSet']['Result']:
        if x['symbol'] == ticker:
            partitioned_string = x['name'].partition(' ')
            company = partitioned_string[0]
            return company
        
def chunk_list(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
def get_weight(sentiment,lengths):
    output=[]
    weights=[]
    for length in lengths:
        weights.append(length/max(lengths))
    output=[x*z if x!=np.nan else 0 for x,z in zip(sentiment,weights)]
    return np.nanmean(output) 

def load_transformer_model(basepath):
    torch.set_grad_enabled(False)
    model_dir = basepath
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.to(device)
    model.eval()
    tokenizer = BertTokenizerFast.from_pretrained(model_dir)
    return model, tokenizer

"""Sentiment Functions"""
def vaderPr(sentence):
    if len(sentence)<4:
        return 0
    return analyzer.polarity_scores(sentence)['compound']    

def flairPr(sentence):
    if len(sentence)<4:
        return 0
    text = Sentence(sentence)
    classifier.predict(text)
    value=text.labels[0].to_dict()['value']
    if value == 'POSITIVE':
        result = text.to_dict()['labels'][0]['confidence']
    else:
        result = -(text.to_dict()['labels'][0]['confidence'])
    return round(result,3)
        
def get_finbert_esg_sentiments_batch(sents, model, tokenizer,  batch_size=50):
    labels = {0: 'neutral', 1: 'positive', 2: 'negative'}
    scores = {'neutral': 0, 'positive': 1 , 'negative':-1}
    pbar = tqdm(total=(len(sents) // batch_size), position=0, leave=True)
    preds = []
    confidence = []
    sentiment=[]
    for batch_of_sents in chunk_list(sents, batch_size):
        inputs = tokenizer.batch_encode_plus(batch_of_sents, return_tensors='pt', padding=True, truncation=True)
        preds_probs = model(inputs['input_ids'].to(device), token_type_ids=None, attention_mask=inputs['attention_mask'].to(device))[0]
        torch.cuda.empty_cache()
        for pp in preds_probs:
            pp = softmax(pp, dim=0)
            top = pp.topk(1)
            idxs = top.indices.tolist()
            wts = top.values.tolist()
            preds.append(labels[idxs[0]])
            confidence.append(wts[0])
            sentiment.append(scores.get(labels[idxs[0]]))
        pbar.update()
    return sentiment, confidence

"""Scraping and Analysis Functions"""
def scrape_ticker (ticker):
    s_time= timer()
    logging.info(f"Running Finviz Scraper on {ticker}")
    articles = get_article(ticker)
    if os.path.isfile(f'./{ticker}.json'):
        print('Historical Data Found')
        with open(f'./{ticker}.json',encoding='utf-8') as f:
            finviz=json.loads(f.read())
        for article in tqdm(articles, position=0, leave=True):
            article_dict = get_article_record_for_URL(article,ticker)
            if not article_dict:
                continue
            else:
                if article_dict['Date']=='None':
                    continue
                if str(article_dict['Date']) in list(finviz.keys()):
                    if article in list(finviz[str(article_dict['Date'])].keys()):
                        continue
                    else:
                        finviz[str(article_dict['Date'])].update({article:article_dict})
                else:
                    finviz[str(article_dict['Date'])] = defaultdict()
                    finviz[str(article_dict['Date'])].update({article:article_dict})
    else:
        print('No Historical Data Found')
        finviz = defaultdict()
        for article in tqdm(articles, position=0, leave=True):
            article_dict = get_article_record_for_URL(article,ticker)
            if not article_dict:
                continue
            if article_dict['Date']=='None':
                continue
            else:
                if str(article_dict['Date']) in list(finviz.keys()):
                    finviz[str(article_dict['Date'])].update({article:article_dict}) 
                else:
                    finviz[str(article_dict['Date'])] = defaultdict()
                    finviz[str(article_dict['Date'])].update({article:article_dict})
    with open(f'./{ticker}.json','w',encoding='utf-8') as f:
        json.dump(finviz,f,ensure_ascii=False,indent=4,sort_keys=True)
    logging.info(f"Finished scraping {ticker} news from Finviz")
    print(f'Scraping done after {round((timer()-s_time)/60,2)} minutes')    
    return finviz

def analyze (ticker,data,classifier):
    logging.info(f"Running Finviz Scraper on {ticker} by {classifier}")
    classifiers = ['vader','flair','finbert']
    if classifier not in classifiers:
        logging.info(f"Program stopped due to an invalid classifier")
        return print(f'Please select a valid classifier out of the following: {classifiers}')
    s_time= timer()
    finviz = data
    print('Starting sentiment analysis process')
    if classifier == 'finbert':
        model,tokenizer = load_transformer_model(base_path)
    if os.path.isfile(f'./{ticker}-{classifier}.json'):
        print('Historical Analysis Found')
        with open(f'./{ticker}-{classifier}.json',encoding='utf-8') as f:
            final=json.loads(f.read())
    else:
        final = defaultdict()            
    company = get_symbol(ticker)
    queries = [ticker,company]
    final[ticker] = defaultdict()
    logging.info(f"Starting analyzing {ticker} news by {classifier}")
    for date in tqdm(list(finviz.keys()), position=0, leave=True):
        if date in list(final[ticker].keys()):
            weights = []
            mean_sent_list = []
            for article in list(final[date].keys()):
                weights.append(len([ticker][date][artDict][artDict][article]['Relevant']))
                mean_sent_list.append([ticker][date][artDict][artDict][article]['Sentiment Score'])
            for article in list(finviz[date].keys()):
                if article not in list(final[date].keys()):
                    sentiment_scores=[]
                    content = article['Text']
                    relevant=[]
                    try:
                        clean = re.sub('[ג€™]|[ג]|[\x9c]|[\xa0]|[\n]', '', content)
                        doc = nlp(clean)
                    except Exception:
                        doc = nlp(str(content))
                    resolved_text = doc._.coref_resolved
                    sentences = [sent.string.strip() for sent in nlp(resolved_text).sents]
                    for query in queries:
                        output = [sent for sent in sentences if str.lower(query) in 
                              (' '.join([token.lemma_.lower() for token in nlp(sent)]))]
                        if len(output)==0:
                            continue
                        else:
                            relevant.extend(output)
                    if len(relevant)==0:
                        continue
                    else:
                        for sentence in relevant:
                            if classifier == 'flair':
                                sentiment_scores.append(flairPr(sentence))
                            elif classifier == 'vader':
                                sentiment_scores.append(vaderPr(sentence))
                    if classifier == 'finbert':
                        sentiment,confidence = get_finbert_esg_sentiments_batch(
                            relevant, model, tokenizer,10)
                        scores = [x*y for x,y in zip(confidence,sentiment)]
                        sentiment_scores = round(np.mean(scores),3)   
                    weights.append(len(relevant))
                    mean_sent_list.append(np.nanmean(sentiment_scores))
                    artDict[article]={
                        'Article Title':article['Headline'],
                        'Relevant Text':relevant,
                        'Sentiment Score':np.nanmean(sentiment_scores)}
        else:
            weights = []
            mean_sent_list = []
            artDict= {}
            for article in list(finviz[date].keys()):
                sentiment_scores=[]
                content = finviz[date][article]['Text']
                relevant=[]
                try:
                    clean = re.sub('[ג€™]|[ג]|[\x9c]|[\xa0]|[\n]', '', content)
                    doc = nlp(clean)
                except Exception:
                    doc = nlp(str(content))
                resolved_text = doc._.coref_resolved
                sentences = [sent.string.strip() for sent in nlp(resolved_text).sents]
                for query in queries:
                    output = [sent for sent in sentences if str.lower(query) in 
                          (' '.join([token.lemma_.lower() for token in nlp(sent)]))]
                    if len(output)==0:
                        continue
                    else:
                        for sentence in output:
                            if sentence not in relevant:
                                relevant.append(sentence)
                if len(relevant)==0:
                    continue
                else:
                    for sentence in relevant:
                        if classifier == 'flair':
                            sentiment_scores.append(flairPr(sentence))
                        elif classifier == 'vader':
                            sentiment_scores.append(vaderPr(sentence))
                    if classifier == 'finbert':
                        sentiment,confidence = get_finbert_esg_sentiments_batch(
                            relevant, model, tokenizer,10)
                        scores = [x*y for x,y in zip(confidence,sentiment)]
                        sentiment_scores = round(np.mean(scores),3)   
                    weights.append(len(relevant))
                mean_sent_list.append(np.nanmean(sentiment_scores))
                artDict[article]={
                    'Article Title':finviz[date][article]['Headline'],
                    'Relevant Text':relevant,
                    'Sentiment Score':round(np.nanmean(sentiment_scores),3)}
        if len(weights)==0:
            continue
        final[ticker][str(date)] ={
            'Weighted Score':round(get_weight(mean_sent_list,weights),3),
            'Articles':artDict}
        logging.info(f"Program finished analyzing")
    print(f'Program done after {round((timer()-s_time)/60,2)} minutes')
    with open(f'./{ticker}-{classifier}.json','w',encoding='utf-8') as f:
        json.dump(final,f,ensure_ascii=False,indent=4,sort_keys=True)                    

In [5]:
finviz = scrape_ticker ('BABA')

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

Historical Data Found


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:34<00:00,  2.15s/it]

Scraping done after 3.59 minutes





In [6]:
analyze('BABA',finviz,'flair')

Starting sentiment analysis process
Historical Analysis Found


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [18:51<00:00, 26.94s/it]

Program done after 18.86 minutes





In [7]:
analyze('BABA',finviz,'vader')

Starting sentiment analysis process
Historical Analysis Found


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [17:49<00:00, 25.47s/it]

Program done after 17.84 minutes





In [8]:
analyze('BABA',finviz,'finbert')

Starting sentiment analysis process
Historical Analysis Found


1it [00:00,  2.88it/s]                                                                  | 2/42 [00:38<14:53, 22.34s/it]
1it [00:00,  3.10it/s]
1it [00:00,  1.34it/s]
1it [00:00,  2.07it/s]
1it [00:00,  4.91it/s]                                                                  | 7/42 [02:34<10:30, 18.01s/it]
1it [00:00,  4.00it/s]
1it [00:00,  3.27it/s]
2it [00:00,  3.48it/s]                                                                                                 
1it [00:00,  5.53it/s]
1it [00:00,  2.84it/s]
1it [00:00,  1.84it/s]
1it [00:00,  3.36it/s]
2it [00:03,  1.58s/it]                                                                                                 
3it [00:03,  1.11s/it]                                                                                                 
1it [00:00,  4.89it/s]██▌                                                              | 10/42 [05:55<25:34, 47.95s/it]
2it [00:01,  1.67it/s]                                                   

Program done after 20.97 minutes



