### Attempt at finding a corpus of words related to financial articles

In [None]:
import demjson
import re
import requests
import nltk
import nltk.classify.util
from collections import Counter
from datetime import datetime, date
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from newspaper import Article

In [None]:
# Ticker symbols of the two groups of stocks whose news coverage is compared.
best_performers = [
    'NFLX',
    'CSX',
    'NRG',
    'SWKS',
    'URI',
]
worst_performers = [
    'ENDP',
    'BMY',
    'SIG',
    'GPRO',
    'CYH',
]

In [None]:
def parse_date(date_str):
    """Turn a date string into a datetime at midnight.

    Two formats are recognised:

    * absolute dates such as "Jan 5, 2017" (abbreviated month name, day,
      four-digit year), parsed with the "%b %d, %Y" format;
    * relative ages such as "3 hours ago" or "2 days ago", resolved against
      the current local time and truncated to midnight of the resulting day.

    Args:
        date_str: the date text to parse.

    Returns:
        A naive datetime whose time component is 00:00:00.

    Raises:
        ValueError: if date_str matches neither format, or looks like an
            absolute date but cannot be parsed as one.
    """
    # Local import: timedelta is not among the names imported at file level.
    from datetime import timedelta

    # Raw strings so that \w and \d are regex escapes, not string escapes.
    date_str_regex = re.compile(r"\w{3} \d{1,2}, \d{4}")
    relative_regex = re.compile(r"(\d{1,2}) (\w+) ago")

    if date_str_regex.match(date_str):
        try:
            return datetime.strptime(date_str, "%b %d, %Y")
        except ValueError:
            # Looked like a date but was not one (unknown month name,
            # trailing text, ...); reported by the raise below.
            pass
    else:
        relative = relative_regex.match(date_str)
        if relative:
            amount = int(relative.group(1))
            unit = relative.group(2).lower()
            if not unit.endswith("s"):
                unit += "s"  # "1 hour ago" -> "hours"
            moment = datetime.now()
            if unit in ("seconds", "minutes", "hours", "days", "weeks"):
                moment -= timedelta(**{unit: amount})
            # Units that timedelta cannot express (e.g. months) fall back
            # to today's date.
            return datetime.combine(moment.date(), datetime.min.time())
    raise ValueError("couldn't parse date string: {0}".format(date_str))

def _make_keys_verbose(article_dict):
    old_to_new = {
        "a": "articles", "d": "date", "s": "source", "t": "title",
        "tt": "titleId", "u": "url", "sp": "openingSentence"
    }
    for old_key in article_dict:
        if old_key in old_to_new:
            new_key = old_to_new[old_key]
            article_dict[new_key] = article_dict.pop(old_key)
        
def get_articles(symbol, num_articles=500):
    """Fetch news articles for a ticker symbol from Google Finance.

    Sends a GET request to the company_news endpoint, decodes the response
    and flattens the articles found under each cluster's "a" key into one
    list. Each article dict has its abbreviated keys expanded and its
    'date' value converted to a datetime.

    Args:
        symbol: ticker symbol used as the query.
        num_articles: value sent as the "num" request parameter.

    Returns:
        A list of article dicts. The list is empty when the HTTP request
        fails (the error is printed), so callers can always iterate over
        the result.

    Raises:
        ValueError: propagated from parse_date when an article's date
            string cannot be parsed.
    """
    payload = {
        "output": "json", "q": symbol, "num": num_articles, "start": 0}
    try:
        resp = requests.get(
            "http://www.google.com/finance/company_news?", params=payload)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        # An empty list rather than None: callers iterate over the result.
        return []
    # need demjson's decode, json data is invalid for python's native decoder
    clusters = demjson.decode(resp.text)['clusters']
    articles = []
    for cluster in clusters:
        if "a" not in cluster:
            continue
        for article in cluster["a"]:
            _make_keys_verbose(article)
            article['date'] = parse_date(article['date'])
            articles.append(article)
    return articles

def tokenize_words(text):
    """Split text into tokens with word_tokenize and tidy each token.

    Every token has surrounding whitespace stripped and any remaining
    newline characters removed. Returns the cleaned tokens as a list.
    """
    cleaned = []
    for token in word_tokenize(text):
        cleaned.append(token.strip().replace('\n', ''))
    return cleaned

def download_articles(urls):
    """Download and parse each URL, returning the extracted article texts.

    URLs containing 'wsj.com' are skipped without being fetched, and
    articles whose download did not complete are skipped as well.

    Args:
        urls: iterable of article URL strings.

    Returns:
        A list with the text of every article that was downloaded and
        parsed, in the order of the input URLs.
    """
    article_texts = []
    for url in urls:
        # Check the URL first so excluded sites cost no network request.
        if 'wsj.com' in url:
            continue
        article = Article(url)
        article.download()
        if not article.is_downloaded:
            continue
        article.parse()
        article_texts.append(article.text)
    return article_texts

def filter_stopwords(words):
    """Return the words that are not in NLTK's English stopword list.

    The comparison is an exact string match, so a word is only removed
    when it appears in the stopword list with identical casing.

    Args:
        words: iterable of word strings.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Build the lookup set once instead of once per word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def word_freqs(articles):
    """Count how often each word occurs across all articles.

    Expects a list of word lists (one inner list per article) and returns
    a single Counter holding the combined totals.
    """
    totals = Counter()
    # Count each article on its own, then fold it into the running total.
    for per_article in map(Counter, articles):
        totals += per_article
    return totals

In [None]:
# Query the Google Finance news endpoint once per ticker, asking for 200
# articles each, and key the results by ticker symbol.
best_dict = {
    symbol: get_articles(symbol, num_articles=200)
    for symbol in best_performers
}

worst_dict = {
    symbol: get_articles(symbol, num_articles=200)
    for symbol in worst_performers
}

In [None]:
# Collect the URLs of every article dated on or after the cutoff; these are
# what the download/analysis functions consume.
cutoff_date = datetime(2017, 1, 1)
best_performer_urls = [
    article['url']
    for ticker_articles in best_dict.values()
    for article in ticker_articles
    if article['date'] >= cutoff_date
]
worst_performer_urls = [
    article['url']
    for ticker_articles in worst_dict.values()
    for article in ticker_articles
    if article['date'] >= cutoff_date
]

In [None]:
# downloading each article, takes a long time
# Only the first 25 URLs of each group are fetched.
best_performer_articles = download_articles(best_performer_urls[:25])
worst_performer_articles = download_articles(worst_performer_urls[:25])

# Disabled IPython %store magics; presumably meant to persist the downloaded
# texts between notebook sessions -- confirm before re-enabling.
#%store best_performer_articles
#%store worst_performer_articles

In [None]:
# Tokenize every downloaded article, then drop English stopwords, giving one
# list of words per article.
best_performer_article_words = list(
    map(filter_stopwords, map(tokenize_words, best_performer_articles)))

worst_performer_article_words = list(
    map(filter_stopwords, map(tokenize_words, worst_performer_articles)))

# Combine the per-article word lists into one frequency table per group.
best_performer_word_freqs = word_freqs(best_performer_article_words)
worst_performer_word_freqs = word_freqs(worst_performer_article_words)

In [None]:
# Display every (word, count) pair for the worst performers, most frequent first.
worst_performer_word_freqs.most_common()

In [None]:
# Recompute the frequency tables from the tokenized, stopword-filtered word
# lists. word_freqs expects a list of word lists; given the raw article
# strings it would count individual characters instead of words.
worst_performer_word_freqs = word_freqs(worst_performer_article_words)
best_performer_word_freqs = word_freqs(best_performer_article_words)
# The misspelled name is kept as an alias in case other cells refer to it.
best_perfomer_word_freqs = best_performer_word_freqs

In [None]:
import copy

# Penn Treebank tags for adjectives: base, comparative and superlative.
adjTags = ['JJ', 'JJR', 'JJS']

# Tag every distinct word and keep the ones tagged as an adjective.
wordsAndTags = nltk.pos_tag(worst_performer_word_freqs.keys())
adjs = {token for token, tag in wordsAndTags if tag in adjTags}

# Work on a copy so the full frequency table stays intact, then remove
# every word that was not tagged as an adjective.
worst_performer_adjs = copy.deepcopy(worst_performer_word_freqs)
for word in [w for w in worst_performer_adjs if w not in adjs]:
    del worst_performer_adjs[word]
worst_performer_adjs.most_common()

In [None]:
# Display the ticker symbols that have an entry in worst_dict.
worst_dict.keys()

In [None]:
# NOTE(review): `freqs` is not assigned in any cell visible here, so this
# raises NameError on a fresh run -- presumably a leftover from an earlier
# session; confirm which frequency table was meant.
freqs.most_common()