In [1]:
import re
import unicodedata

import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from html.entities import name2codepoint

In [2]:
# Download the NLTK data packages this notebook relies on. Each call is a
# no-op when the package is already present locally (see the log below).
# The return value of the final call is what the cell displays.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/druce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/druce/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/druce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/druce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load one headline per line from raw.txt, trimming surrounding whitespace
# (including the trailing newline) from every line.
with open('raw.txt', 'r') as hfile:
    headlines = list(map(str.strip, hfile))

# Preview the first 20 headlines as the cell output.
headlines[:20]


["'It disturbs me to my core': Fox News staffers express outrage over Hannity's rally appearance",
 'Voting in Georgia is even more of a nightmare than you thought',
 'Trump Begins Midterm Election Day Bracing for Grim Political News, Aides Say',
 "Steve King bars 'leftist propaganda' outlet Des Moines Register from election night event",
 'Le president de lAS Monaco perquisitionne et place en garde a vue',
 'In full: The notes of apparent plan to sell Brexit deal',
 'Blockchain-based elections would be a disaster for democracy',
 'Bitcoin volatility sinks to lowest in nearly two years',
 'American Meritocracy Is Killing Youth Sports',
 'On election morning, Border Patrol holds public crowd control demon...',
 'Long lines and machines down at multiple polling places across Houston',
 'Why Long Voting Lines Could Have Long-Term Consequences',
 'Blame Fox, not Facebook, for fake news',
 'Russian business trio told to stay away from Davos',
 'The McRib Effect',
 'Amazon has been looking o

In [16]:
# Small hand-picked sample used to exercise the cleaning pipeline below.
# NOTE: this rebinds `headlines`, replacing the list read from raw.txt earlier.
headlines = ["MGM Casino exploring Caesars merger: sources",
             "China Seeks Allies as Trump’s Trade War Mounts. It Won’t Be Easy.",
             "China’s Xi Jinping hits out at ‘law of the jungle’ trade policies",
             "UK business leaders call for ‘people’s vote’ on Brexit deal",
             "Theresa May to warn pro-Brexit ministers time is running out",
             "SEC Adopts Rules That Increase Information Brokers Must Provide to Investors on Order Handling",
             "Wealthy Americans Assure Populace That Heavily Armed Floating City Being Built Above Nation Has Nothing To Do With Anything",
             "Fewer Stars to Rise at Goldman Sachs as Partnership Class Shrinks",
             "S&P 500 Earnings Season Update: November 2, 2018",
             "What We Have to Fear",
            ]

In [17]:
# preclean

# Things to replace before the tokenizer splits them apart.
# Each entry is (regex pattern, replacement). Patterns are applied in list
# order to already lower-cased text, so a longer pattern must come before any
# shorter pattern that is a prefix of it (e.g. 's&p 500' before 's&p',
# 'n.y.c.' before 'n.y.'); otherwise the longer one can never match.
# Raw strings are used so that '\.' is a regex escape rather than an invalid
# Python string escape.
pre_dict = [
    (r's & p 500', 's_p_500'),  # & is a word boundary
    (r's & p', 's_p'),
    (r'm & a', 'm_a'),
    (r's&p 500', 's_p_500'),  # & is a word boundary
    (r's&p', 's_p'),
    (r'm&a', 'm_a'),
    (r'j\.p\.', 'jp'),
    (r'j\. p\.', 'jp'),
    (r'u\.s\.', 'us'),
    (r'u\.k\.', 'uk'),
    (r'e\.u\.', 'eu'),
    (r'n\.y\.c\.', 'nyc'),  # must precede 'n\.y\.' or it is unreachable
    (r'n\.y\.', 'ny'),
    (r'g\.o\.p\.', 'gop'),
    (r'd\.c\.', 'dc'),
    (r'u\.n\.', 'un'),
]

# pattern string -> (compiled regex, replacement); insertion order follows
# pre_dict, and preclean_text applies the rewrites in that order.
pre_rewrites = {}
for re_pat, replace_str in pre_dict:
    matchRe = re.compile(re_pat)
    pre_rewrites[re_pat] = (matchRe, replace_str)

def unescape(text):
    """Replace HTML/XML character references and named entities in a string.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;`` / ``&#X41;``) and named entities known to
    ``html.entities.name2codepoint`` (``&amp;``). Anything that cannot be
    decoded (unknown name, malformed or out-of-range number) is left as is.

    Adapted from http://effbot.org/zone/re-sub.htm#unescape-html

    @param text The HTML (or XML) source text.
    @return The text with every decodable reference replaced by its character.
    """
    def fixup(m):
        # re.sub callback: m.group(0) is the complete "&...;" token
        ref = m.group(0)
        if ref[:2] == "&#":
            # numeric character reference
            try:
                if ref[:3].lower() == "&#x":
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return chr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)

def preclean_text(dirty_str):
    """Normalise a raw headline before it is tokenized.

    Steps, in order: decode HTML entities via unescape(), apply Unicode NFKD
    normalisation (this decomposes characters; it does not drop non-ASCII
    ones), lower-case, then run every substitution in pre_rewrites in its
    insertion order.

    @param dirty_str The raw headline text.
    @return The pre-cleaned, lower-cased text.
    """
    text = unicodedata.normalize('NFKD', unescape(dirty_str)).lower()
    for pattern, replacement in pre_rewrites.values():
        text = pattern.sub(replacement, text)
    return text



In [18]:
# Single shared lemmatizer instance, used by tokenize_text below.
lmtzr = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    The comparison is case-insensitive: j -> ADJ, s -> ADJ_SAT, v -> VERB,
    n -> NOUN, r -> ADV.

    @param treebank_tag The tag string produced by the POS tagger.
    @return The matching wordnet constant, or '' when no mapping applies.
    """
    first_letter_to_pos = {
        'j': wordnet.ADJ,
        's': wordnet.ADJ_SAT,
        'v': wordnet.VERB,
        'n': wordnet.NOUN,
        'r': wordnet.ADV,
    }
    # [:1] rather than [0] so an empty tag falls through to '' instead of raising
    return first_letter_to_pos.get(treebank_tag.lower()[:1], '')
    
def tokenize_text(text):
    """Tokenize, POS-tag and lemmatize a string.

    Each token is lemmatized with the WordNet POS derived from its tag by
    get_wordnet_pos(); when no POS mapping exists the lemmatizer's default
    POS is used.

    @param text The (pre-cleaned) text to process.
    @return The lemmas joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wn_pos = get_wordnet_pos(treebank_tag)
        # only pass a POS when we have one; '' is not a valid WordNet POS
        lemmas.append(lmtzr.lemmatize(token, wn_pos) if wn_pos else lmtzr.lemmatize(token))
    return " ".join(lemmas)

In [19]:
# (pattern, replacement) rewrites applied to the tokenized text at the start
# of postclean_text; each pattern is wrapped in \b word boundaries when it is
# compiled further down in this cell.
# maybe some are redundant
post_dict = [
    ('s & p 500' , 's_p_500'), # & is a word boundary
    ('s & p' , 's_p'),
    ('m & a' , 'm_a'),
    ('panamapapers' , 'panama_papers'),
]

# popular ngrams
# Each entry is (tuple of tokens, integer). The loop later in this cell turns
# every tuple into a rewrite that replaces the space-separated tokens with the
# same tokens joined by '_'. The integer is unpacked there but not used;
# presumably it is a corpus frequency -- TODO confirm.
# List order is preserved and is the order in which rewrites are applied.
# NOTE(review): the ('m', '&', 'a') and ('s', '&', 'p') entries look
# unreachable, because postclean_text blanks out '&' before applying the
# n-gram rewrites -- confirm and consider removing.
ngrams = [
    (('tabbforum', 'where', 'capital', 'market', 'speak'), 1554),
    (('daily', 'weekly', 'spr', 'plus'), 183),
    (('gif', 'find', 'share', 'giphy'), 212),
    (('hamzei', 'analytics', 'financial', 'network'), 1287),
    (('hedge', 'fund', 'news', 'from', 'hedgeco', 'net'), 1287),
    (('news', 'from', 'associate', 'press'), 922),
    (('news', 'from', 'associated', 'press'), 922),
    (('affordable', 'care', 'act'), 175),
    (('all', 'time', 'high'), 175),
    (('amazon', 'com', 'book'), 684),
    (('black', 'life', 'matter'), 101),
    (('bureau', 'labor', 'statistic'), 106),
    (('european', 'central', 'bank'), 212),
    (('fred', 'economic', 'data'), 212),
    (('health', 'care', 'bill'), 170),
    (('initial', 'public', 'offering'), 519),
    (('international', 'monetary', 'fund'), 781),
    (('joy', 'ann', 'reid'), 215),
    (('kim', 'jong', 'un'), 170),
    (('m', '&', 'a'), 429),
    (('monte', 'dei', 'paschi'), 111),
    (('national', 'security', 'adviser'), 205),
    (('national', 'security', 'council'), 106),
    (('new', 'york', 'city'), 472),
    (('new', 'york', 'time'), 1287),
    (('news', 'from', 'associate_press'), 1056),
    (('news', 'from', 'associated_press'), 1056),
    (('nikkei', 'asian', 'review'), 543),
    (('office', 'national', 'statistic'), 195),
    (('paris', 'climate', 'accord'), 114),
    (('presidential', 'tracking', 'poll'), 183),
    (('read', 'big', 'picture'), 144),
    (('s', '&', 'p'), 429),
    (('saturday', 'night', 'live'), 147),
    (('security', 'exchange', 'commission'), 510),
    (('self', 'driving', 'car'), 752),
    (('share', 'chart', 'stocktwits'), 1789),
    (('south', 'china', 'sea'), 516),
    (('trump', 'white', 'house'), 204),
    (('wall', 'street', 'journal'), 672),
    (('what', 'happens', 'when'), 149),
    (('why', 'you', 'should'), 172),
    (('wikipedia', 'free', 'encyclopedia'), 2324),
    (('you', 'need', 'know'), 714),
    (('david', 'tawil'), 953),
    (('after', 'year'), 202),
    (('alex', 'jones'), 329),
    (('all', 'time'), 374),
    (('alt', 'right'), 222),
    (('amazon', 'com'), 4627),
    (('artificial', 'intelligence'), 1391),
    (('asset', 'management'), 1115),
    (('associate', 'press'), 1966),
    (('associated', 'press'), 1966),
    (('attorney', 'general'), 431),
    (('bank', 'america'), 1014),
    (('bank', 'england'), 1594),
    (('barack', 'obama'), 253),
    (('basic', 'income'), 105),
    (('bbc', 'radio'), 264),
    (('ben', 'carson'), 1063),
    (('bernie', 'sander'), 5528),
    (('big', 'data'), 476),
    (('bill', 'gate'), 234),
    (('bond', 'market'), 1200),
    (('border', 'wall'), 112),
    (('central', 'bank'), 6407),
    (('charge', 'with'), 251),
    (('chief', 'executive'), 3278),
    (('chris', 'christie'), 1023),
    (('climate', 'change'), 2769),
    (('climate', 'deal'), 112),
    (('credit', 'card'), 1356),
    (('credit', 'suisse'), 1345),
    (('david', 'cameron'), 1475),
    (('deal', 'with'), 1238),
    (('democratic', 'party'), 1005),
    (('democratic', 'presidential'), 1083),
    (('deutsche', 'bank'), 2021),
    (('do', 'not'), 1037),
    (('do', 'you'), 332),
    (('dodd', 'frank'), 349),
    (('donald', 'trump'), 21304),
    (('economic', 'data'), 1911),
    (('economic', 'growth'), 1913),
    (('elizabeth', 'warren'), 238),
    (('elon', 'musk'), 376),
    (('emerge', 'market'), 1551),
    (('emmanuel', 'macron'), 306),
    (('european', 'union'), 2403),
    (('executive', 'order'), 711),
    (('fact', 'check'), 210),
    (('fake', 'news'), 1945),
    (('fbi', 'director'), 209),
    (('federal', 'reserve'), 6195),
    (('fiduciary', 'rule'), 245),
    (('finance', 'minister'), 1454),
    (('financial', 'crisis'), 2388),
    (('financial', 'market'), 1625),
    (('financial', 'service'), 1091),
    (('first', 'quarter'), 1409),
    (('first', 'time'), 1017),
    (('foreign', 'policy'), 1369),
    (('fourth', 'quarter'), 1409),
    (('fox', 'news'), 1959),
    (('fred', 'economic_data'), 314),
    (('free', 'speech'), 311),
    (('free', 'trade'), 259),
    (('from', 'associate_press'), 1058),
    (('from', 'associated_press'), 1058),
    (('front', 'runner'), 1449),
    (('fund', 'manager'), 2070),
    (('give', 'up'), 210),
    (('global', 'economy'), 1410),
    (('goldman', 'sachs'), 2744),
    (('health', 'bill'), 115),
    (('health', 'care'), 1530),
    (('health', 'insurance'), 264),
    (('hedge', 'fund'), 13144),
    (('here', 'how'), 1109),
    (('here', 'what'), 1029),
    (('here', 'why'), 527),
    (('high', 'school'), 244),
    (('hillary', 'clinton'), 9139),
    (('hong', 'kong'), 2011),
    (('housing', 'market'), 1732),
    (('how', 'do'), 388),
    (('how', 'much'), 339),
    (('human', 'right'), 1089),
    (('immigration', 'ban'), 158),
    (('implode', 'meter'), 138),
    (('insider', 'trading'), 242),
    (('instagram', 'photo'), 1101),
    (('instagram', 'post'), 2118),
    (('interest', 'rate'), 6931),
    (('investment', 'bank'), 1219),
    (('islamic', 'state'), 3666),
    (('ivanka', 'trump'), 752),
    (('james', 'comey'), 360),
    (('jar', 'kushner'), 408),
    (('jeb', 'bush'), 1625),
    (('jeff', 'session'), 532),
    (('jeremy', 'corbyn'), 273),
    (('jp', 'morgan'), 1322),
    (('justice', 'department'), 264),
    (('kellyanne', 'conway'), 421),
    (('know', 'about'), 1191),
    (('last', 'year'), 221),
    (('learn', 'from'), 324),
    (('lesson', 'from'), 258),
    (('long', 'term'), 1966),
    (('look', 'like'), 442),
    (('los', 'angeles'), 2102),
    (('machine', 'learn'), 263),
    (('machine', 'learning'), 324),
    (('mar', 'lago'), 310),
    (('marco', 'rubio'), 1361),
    (('marine', 'pen'), 324),
    (('mark', 'zuckerberg'), 223),
    (('may', 'have'), 606),
    (('meet', 'with'), 318),
    (('melania', 'trump'), 262),
    (('michael', 'flynn'), 322),
    (('middle', 'east'), 1658),
    (('minimum', 'wage'), 1750),
    (('monetary', 'policy'), 2418),
    (('more', 'than'), 2493),
    (('morgan', 'stanley'), 1322),
    (('muslim', 'ban'), 236),
    (('mutual', 'fund'), 1017),
    (('national', 'review'), 516),
    (('national', 'security'), 616),
    (('neil', 'gorsuch'), 236),
    (('net', 'neutrality'), 309),
    (('new', 'jersey'), 1170),
    (('new', 'york'), 18440),
    (('north', 'korea'), 2043),
    (('north', 'korean'), 282),
    (('nuclear', 'deal'), 1846),
    (('official', 'say'), 207),
    (('oil', 'price'), 317),
    (('panama', 'paper'), 1114),
    (('paul', 'krugman'), 1029),
    (('paul', 'ryan'), 1168),
    (('pension', 'fund'), 222),    
    (('per', 'cent'), 1817),
    (('plan', 'parenthood'), 221),
    (('police', 'officer'), 1039),
    (('pope', 'francis'), 1272),
    (('president', 'trump'), 1195),
    (('presidential', 'campaign'), 1227),
    (('press', 'conference'), 330),
    (('press', 'release'), 434),
    (('prime', 'minister'), 3781),
    (('private', 'equity'), 1446),
    (('puerto', 'rico'), 18440),
    (('rate', 'hike'), 1710),
    (('real', 'estate'), 485),
    (('republican', 'party'), 1005),
    (('reserve', 'bank'), 1076),
    (('rex', 'tillerson'), 368),
    (('roy', 'moore'), 268),
    (('russia', 'investigation'), 286),
    (('russia', 'probe'), 363),
    (('s_p', '500'), 319),
    (('san', 'francisco'), 2345),
    (('saudi', 'arabia'), 1069),
    (('sean', 'spicer'), 508),
    (('second', 'quarter'), 1409),
    (('secretary', 'state'), 1224),
    (('self', 'driving'), 1205),
    (('sexual', 'harassment'), 222),
    (('short', 'term'), 1075),
    (('shut', 'down'), 334),
    (('silicon', 'valley'), 2229),
    (('small', 'business'), 1308),
    (('social', 'medium'), 1338),
    (('social', 'security'), 1092),
    (('south', 'korea'), 428),
    (('stand', 'up'), 203),
    (('state', 'department'), 327),
    (('step', 'down'), 208),
    (('steve', 'bannon'), 362),
    (('stock', 'exchange'), 1484),
    (('stock', 'market'), 4768),
    (('study', 'find'), 211),
    (('super', 'bowl'), 1027),
    (('supreme', 'court'), 3248),
    (('tax', 'bill'), 261),
    (('tax', 'cut'), 480),
    (('tax', 'plan'), 377),
    (('tax', 'reform'), 499),
    (('ted', 'cruz'), 4319),
    (('tell', 'us'), 200),
    (('terror', 'attack'), 325),
    (('theresa', 'may'), 1917),
    (('third', 'quarter'), 1409),
    (('this', 'week'), 514),
    (('this', 'year'), 1238),
    (('town', 'hall'), 231),
    (('trade', 'deal'), 287),
    (('trade', 'war'), 249),
    (('travel', 'ban'), 1240),
    (('trump', 'administration'), 884),
    (('trump', 'budget'), 242),
    (('trump', 'campaign'), 223),
    (('twitter', 'video'), 1282),
    (('unite', 'state'), 5428),
    (('united', 'state'), 579),
    (('us', 'economy'), 1435),
    (('venture', 'capital'), 234),
    (('vice', 'president'), 1324),
    (('vladimir', 'putin'), 1021),
    (('wall', 'street'), 8408),
    (('warren', 'buffett'), 1042),
    (('washington', 'dc'), 1256),
    (('washington', 'post'), 3308),
    (('well', 'fargo'), 1381),
    (('what', 'do'), 1261),
    (('what', 'happen'), 429),
    (('white', 'house'), 3758),
    (('white', 'supremacist'), 221),
    (('whole', 'food'), 344),
    (('why', 'do'), 1000),
    (('with', 'vitweet'), 130),
    (('year', 'ago'), 228),
    (('year', 'high'), 1140),
    (('year', 'low'), 1140),
    (('year', 'old'), 1530),
    (('you', 'can'), 1105),
    (('you', 'need'), 1125),
    (('you', 'think'), 296),
    (('you_need_know', 'about'), 201)
]

# Pre-compile every post-tokenization rewrite, anchored on word boundaries.
# Maps the bare pattern string to (compiled regex, replacement).
post_rewrites = {
    pattern: (re.compile('\\b' + pattern + '\\b'), replacement)
    for pattern, replacement in post_dict
}

# One (source pattern, joined target, compiled regex) triple per n-gram.
# A list is used so the rewrites keep the order of `ngrams`.
ngram_rewrites = []

for position, (words, _count) in enumerate(ngrams):
    joined = '_'.join(words)
    boundary_pattern = '\\b' + ' '.join(words) + '\\b' # match only on word boundary
    print ("%d -> %s" % (position, joined))
    # precompile all re.subs
    ngram_rewrites.append( (boundary_pattern, joined, re.compile(boundary_pattern)) )


0 -> tabbforum_where_capital_market_speak
1 -> daily_weekly_spr_plus
2 -> gif_find_share_giphy
3 -> hamzei_analytics_financial_network
4 -> hedge_fund_news_from_hedgeco_net
5 -> news_from_associate_press
6 -> news_from_associated_press
7 -> affordable_care_act
8 -> all_time_high
9 -> amazon_com_book
10 -> black_life_matter
11 -> bureau_labor_statistic
12 -> european_central_bank
13 -> fred_economic_data
14 -> health_care_bill
15 -> initial_public_offering
16 -> international_monetary_fund
17 -> joy_ann_reid
18 -> kim_jong_un
19 -> m_&_a
20 -> monte_dei_paschi
21 -> national_security_adviser
22 -> national_security_council
23 -> new_york_city
24 -> new_york_time
25 -> news_from_associate_press
26 -> news_from_associated_press
27 -> nikkei_asian_review
28 -> office_national_statistic
29 -> paris_climate_accord
30 -> presidential_tracking_poll
31 -> read_big_picture
32 -> s_&_p
33 -> saturday_night_live
34 -> security_exchange_commission
35 -> self_driving_car
36 -> share_chart_stocktwits

In [20]:
# Punctuation that postclean_text replaces with a space. '_' is not in the
# set, so joined tokens such as 's_p_500' pass through intact.
# The backslash is written as '\\' because '\]' is not a valid Python string
# escape; the resulting string value is unchanged.
mystring = '!"#$%&\'()*+,-./:;<>=?@[\\]^`{|}~'
# Regex character class matching any single character of mystring.
myrestring = "[%s]" % re.escape(mystring)

# NLTK English stop-word list; tokens found here are dropped.
skipwords = stopwords.words("english")
# Tokens of length <= 2 are dropped by postclean_text unless listed here.
noskip = set(['uk', 'eu', 'ny', 'nyc', 'dc', 'un', 'nz', 'pm', 'up', 'us', 'go', 'do', 'xi', 'ad', 'ai', 'ex',])

def postclean_text(text):
    """Clean tokenized, lemmatized text and join known n-grams.

    Steps, in order: apply the post_rewrites substitutions, replace every
    character matched by myrestring with a space, drop tokens that are in
    skipwords or that have at most two characters and are not in noskip, then
    apply the ngram_rewrites in list order.

    @param text The space-separated lemmas produced by tokenize_text.
    @return The cleaned text, tokens separated by single spaces.
    """
    result = text
    for pattern, replacement in post_rewrites.values():
        result = pattern.sub(replacement, result)

    # remove special characters
    result = re.sub(myrestring, " ", result)

    # remove skipwords and very short tokens (unless whitelisted in noskip)
    kept_tokens = [
        token
        for token in result.split()
        if token not in skipwords and (len(token) > 2 or token in noskip)
    ]
    result = " ".join(kept_tokens)

    for _source, joined, pattern in ngram_rewrites:
        result = pattern.sub(joined, result)

    return result



In [22]:
# Print every sample headline followed by its cleaned form after the three
# stages: preclean -> tokenize/lemmatize -> postclean.
for h in headlines:
    print(h)
    cleaned_text = postclean_text(tokenize_text(preclean_text(h)))

    print(cleaned_text)


MGM Casino exploring Caesars merger: sources
mgm casino explore caesar merger source
China Seeks Allies as Trump’s Trade War Mounts. It Won’t Be Easy.
china seek ally trump trade_war mount win easy
China’s Xi Jinping hits out at ‘law of the jungle’ trade policies
china xi jinping hit law jungle trade policy
UK business leaders call for ‘people’s vote’ on Brexit deal
uk business leader call people vote brexit deal
Theresa May to warn pro-Brexit ministers time is running out
theresa_may warn pro brexit minister time run
SEC Adopts Rules That Increase Information Brokers Must Provide to Investors on Order Handling
sec adopts rule increase information broker must provide investor order handling
Wealthy Americans Assure Populace That Heavily Armed Floating City Being Built Above Nation Has Nothing To Do With Anything
wealthy american assure populace heavily arm float city build nation nothing anything
Fewer Stars to Rise at Goldman Sachs as Partnership Class Shrinks
star rise goldman_sachs 