In [57]:
import demjson
import re
import requests
import nltk
import nltk.classify.util
from collections import Counter
from datetime import datetime, date
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from newspaper import Article

In [97]:
# Ticker symbols for the two groups of stocks compared in this notebook.
best_performers = [
    'NFLX',
    'CSX',
    'NRG',
    'SWKS',
    'URI',
]
worst_performers = [
    'ENDP',
    'BMY',
    'SIG',
    'GPRO',
    'CYH',
]

In [98]:
def parse_date(date_str):
    """Turn a news-feed date string into a ``datetime`` object.

    Two formats are recognised:

    * an absolute date such as ``"Mar 5, 2017"``, which becomes midnight of
      that day;
    * a relative stamp such as ``"3 hours ago"``, which becomes midnight of
      today (the amount and the unit are not interpreted).

    Args:
        date_str: the date text to parse.

    Returns:
        A ``datetime`` whose time component is 00:00:00.

    Raises:
        ValueError: if ``date_str`` matches neither format, or looks like an
            absolute date but ``strptime`` rejects it (e.g. an unknown month
            abbreviation).
    """
    # Raw strings: "\w" and "\d" inside a plain string literal are invalid
    # escape sequences and produce a warning on current Python versions.
    date_str_regex = re.compile(r"[\w]{3} [\d]{1,2}, [\d]{4}")
    hours_ago_regex = re.compile(r"[\d]{1,2} [\w]+ ago")
    if date_str_regex.match(date_str):
        try:
            return datetime.strptime(date_str, "%b %d, %Y")
        except ValueError:
            # Shaped like a date but not a valid one; report it through the
            # uniform error raised at the end of the function.
            pass
    elif hours_ago_regex.match(date_str):
        # Relative stamps are collapsed to "today", matching the day-level
        # resolution of the absolute branch.
        return datetime.combine(date.today(), datetime.min.time())
    raise ValueError("couldn't parse date string: {0}".format(date_str))

def _make_keys_verbose(article_dict):
    old_to_new = {
        "a": "articles", "d": "date", "s": "source", "t": "title",
        "tt": "titleId", "u": "url", "sp": "openingSentence"
    }
    for old_key in article_dict:
        if old_key in old_to_new:
            new_key = old_to_new[old_key]
            article_dict[new_key] = article_dict.pop(old_key)
        
def get_articles(symbol, num_articles=500):
    """Fetch news articles for a ticker symbol from Google Finance.

    Queries the ``company_news`` endpoint with ``output=json``, decodes the
    response, and flattens every cluster's ``"a"`` list into a single list of
    article dicts. Each article has its abbreviated keys renamed by
    ``_make_keys_verbose`` and its ``'date'`` entry replaced by the
    ``datetime`` returned from ``parse_date``.

    Args:
        symbol: value sent as the ``q`` query parameter.
        num_articles: value sent as the ``num`` query parameter.

    Returns:
        A list of article dicts. The list is empty when the HTTP request
        fails (the error is printed), so callers can always iterate over the
        result.

    Raises:
        ValueError: propagated from ``parse_date`` when an article's date
            string cannot be parsed.
        KeyError: if the decoded response has no ``'clusters'`` entry or an
            article has no date.
    """
    payload = {
        "output": "json", "q": symbol, "num": num_articles, "start": 0}
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection. A timeout surfaces as a RequestException, which is
        # handled below.
        resp = requests.get(
            "http://www.google.com/finance/company_news?",
            params=payload,
            timeout=30,
        )
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        # Return an empty list rather than None: callers iterate the result.
        return []
    # need demjson's decode, json data is invalid for pythons native decoder
    clusters = demjson.decode(resp.text)['clusters']
    articles = []
    for cluster in clusters:
        if "a" in cluster:
            for article in cluster["a"]:
                _make_keys_verbose(article)
                article['date'] = parse_date(article['date'])
                articles.append(article)
    return articles

def tokenize_words(text):
    """Split ``text`` into tokens with ``word_tokenize`` and tidy each one.

    Every token has its surrounding whitespace stripped and any remaining
    newline characters removed. Tokens are returned in their original order.
    """
    cleaned = []
    for token in word_tokenize(text):
        cleaned.append(token.strip().replace('\n', ''))
    return cleaned

In [99]:
# Map each ticker symbol to the result of get_articles for it, requesting
# 200 articles per ticker.
best_dict = dict(
    (symbol, get_articles(symbol, 200)) for symbol in best_performers
)

worst_dict = dict(
    (symbol, get_articles(symbol, 200)) for symbol in worst_performers
)

In [100]:
def download_articles(urls):
    """Download each article URL and return the extracted body texts.

    URLs containing ``'wsj.com'`` are skipped, as are articles whose download
    did not succeed.

    Args:
        urls: iterable of article URL strings.

    Returns:
        A list with the text of every article that was kept, in input order.
    """
    article_texts = []
    for url in urls:
        # Filter before touching the network: there is no point downloading
        # a page whose result would be discarded anyway.
        if 'wsj.com' in url:
            continue
        article = Article(url)
        article.download()
        if not article.is_downloaded:
            continue
        article.parse()
        article_texts.append(article.text)
    return article_texts

def word_freqs(articles):
    """Count token occurrences across a collection of article texts.

    Each text is tokenized with ``tokenize_words`` and the per-token counts
    are accumulated into a single ``Counter``, which is returned.
    """
    totals = Counter()
    for text in articles:
        # Feeding the tokens to update() adds one count per occurrence.
        totals.update(tokenize_words(text))
    return totals

In [105]:
# Only the worst-performer pipeline is active in this cell; the equivalent
# best-performer steps are kept below as disabled lines.
# `or []` tolerates a ticker whose entry is None (e.g. its fetch failed)
# instead of raising TypeError while flattening.
# best_performer_urls = [d['url'] for ticker in best_dict for d in (best_dict[ticker] or [])]
worst_performer_urls = [d['url'] for ticker in worst_dict for d in (worst_dict[ticker] or [])]

# best_performer_articles = download_articles(best_performer_urls)
worst_performer_articles = download_articles(worst_performer_urls)

# best_performer_word_freqs = word_freqs(best_performer_articles)
worst_performer_word_freqs = word_freqs(worst_performer_articles)

#%store best_performer_articles
%store worst_performer_articles

Stored 'worst_performer_articles' (list)


In [121]:
# Rebuild the worst-performer token counts from the downloaded article texts.
worst_performer_word_freqs = word_freqs(worst_performer_articles)
# NOTE(review): `all_articles` is not defined in any cell visible in this
# notebook -- presumably it holds the best-performer article texts from an
# earlier session; confirm before re-running on a fresh kernel.
# NOTE(review): "perfomer" in the name below looks like a typo for "performer".
best_perfomer_word_freqs = word_freqs(all_articles)

In [123]:
import copy

# POS tags that are treated as adjectives when filtering the vocabulary.
adjTags = ['JJ', 'JJR', 'JJS']

# Tag every distinct token, then collect the ones carrying an adjective tag.
wordsAndTags = nltk.pos_tag(worst_performer_word_freqs.keys())
adjs = {token for token, tag in wordsAndTags if tag in adjTags}

# Work on a copy so the full frequency table stays intact, and drop every
# token that was not tagged as an adjective.
worst_performer_adjs = copy.deepcopy(worst_performer_word_freqs)
for word in tuple(worst_performer_adjs):
    if word in adjs:
        continue
    del worst_performer_adjs[word]
worst_performer_adjs.most_common()

[('average', 990),
 ('last', 888),
 ('high', 460),
 ('low', 407),
 ('current', 384),
 ('more', 346),
 ('past', 318),
 ('other', 318),
 ('additional', 287),
 ('new', 265),
 ('52-week', 262),
 ('total', 230),
 ('worth', 225),
 ('most', 218),
 ('mean', 218),
 ('recent', 215),
 ('same', 201),
 ('due', 197),
 ('second', 194),
 ('quarterly', 186),
 ('financial', 186),
 ('daily', 181),
 ('net', 178),
 ('stock’s', 167),
 ('2016-12-31', 165),
 ('previous', 157),
 ('next', 148),
 ('hold', 148),
 ('200-day', 145),
 ('lower', 143),
 ('first', 126),
 ('latest', 125),
 ('close', 125),
 ('potential', 107),
 ('closed', 104),
 ('such', 100),
 ('institutional', 99),
 ('50-day', 98),
 ('long', 98),
 ('strong', 93),
 ('less', 93),
 ('future', 91),
 ('biopharmaceutical', 91),
 ('general', 90),
 ('Other', 86),
 ('assigned', 85),
 ('short', 82),
 ('positive', 82),
 ('much', 82),
 ('own', 79),
 ('pharmaceutical', 76),
 ('generic', 75),
 ('prior', 71),
 ('fourth', 71),
 ('upside', 71),
 ('common', 70),
 ('adve

In [96]:
# Show which tickers are currently present in worst_dict.
# NOTE(review): the recorded output lists 'GOPRO', which is not in
# worst_performers as defined in this notebook -- presumably left over from
# an earlier run; re-run on a fresh kernel to confirm.
worst_dict.keys()

dict_keys(['BMY', 'ENDP', 'SIG', 'GOPRO', 'GPRO', 'CYH'])

In [87]:
# NOTE(review): `freqs` is not defined in any cell visible in this notebook;
# presumably a Counter from an earlier session (the recorded output mentions
# CSX, NRG and Netflix) -- confirm its origin before relying on this cell.
freqs.most_common()

[('.', 12912),
 (',', 12065),
 ('the', 11727),
 ('of', 8832),
 ('a', 6349),
 ('and', 5879),
 ('$', 5680),
 ('to', 4694),
 ('in', 4311),
 ('is', 3555),
 ('%', 3525),
 ('The', 3454),
 ('on', 3152),
 (')', 2780),
 ('(', 2769),
 ('stock', 2749),
 ('for', 2674),
 ('at', 2326),
 (':', 2258),
 ('shares', 2152),
 ('price', 1897),
 ('its', 1895),
 ('has', 1697),
 ('company', 1534),
 ('by', 1504),
 ('that', 1420),
 ('Inc.', 1402),
 ('with', 1399),
 ('quarter', 1325),
 ('CSX', 1293),
 ('NRG', 1282),
 ('from', 1248),
 ('was', 1240),
 ('an', 1229),
 ('rating', 1217),
 ('as', 1170),
 ('last', 1081),
 ('have', 982),
 ('average', 952),
 ('analysts', 925),
 ('earnings', 917),
 ('year', 894),
 ('Netflix', 876),
 ('this', 826),
 ('it', 824),
 ('are', 772),
 ('United', 770),
 ('Corporation', 767),
 ('share', 738),
 ('be', 722),
 ('company’s', 713),
 ('Energy', 708),
 ('Rentals', 704),
 ('Skyworks', 702),
 ('target', 702),
 ('NYSE', 696),
 ('trading', 665),
 ('EPS', 650),
 ('NASDAQ', 638),
 ('Solutions', 6

In [74]:
# Scratch check that a Counter can be accumulated in place with `+=`.
c1 = Counter('aabcd')
c2 = Counter('abbbbcd')
c3 = Counter('zz')

c = Counter()
c += c1
c

Counter({'a': 2, 'b': 1, 'c': 1, 'd': 1})

In [75]:
# Continue the scratch check: add c2's counts onto the running total `c`.
# Depends on `c` and `c2` from the previous cell and mutates `c` in place,
# so running this cell more than once keeps adding c2 again.
c += c2
c

Counter({'a': 3, 'b': 5, 'c': 2, 'd': 2})