In [1]:
# load packages
import os
import sys
import re
import pandas as pd
import numpy as np
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# load data
data = pd.read_csv("data/news-headlines.csv")

In [3]:
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
data.iloc[0][1], data.iloc[100][1], data.iloc[200][1]

('aba decides against community broadcasting licence',
 'more women urged to become councillors',
 'academic upbeat about higher education review')

In [5]:
en_stop_words = stopwords.words("english")

def clean_text(sentence):
    sentence = re.sub("[^a-zA-Z ]", "", sentence)
    words = [w.strip().lower() for w in sentence.split()]
    words = [w for w in words if len(w) > 2 and w not in en_stop_words]
    words = [w for w in words]
    return words

data["words"] = data.headline_text.apply(lambda x: clean_text(x))

In [6]:
data.head()

Unnamed: 0,publish_date,headline_text,words
0,20030219,aba decides against community broadcasting lic...,"[aba, decides, community, broadcasting, licence]"
1,20030219,act fire witnesses must be aware of defamation,"[act, fire, witnesses, must, aware, defamation]"
2,20030219,a g calls for infrastructure protection summit,"[calls, infrastructure, protection, summit]"
3,20030219,air nz staff in aust strike for pay rise,"[air, staff, aust, strike, pay, rise]"
4,20030219,air nz strike to affect australian travellers,"[air, strike, affect, australian, travellers]"


In [7]:
def lemmatize_text(words):
    lem = WordNetLemmatizer()
    new_words = list()
    for w in words:
        new_words.append(lem.lemmatize(w))
    return new_words
    
data["lemmatized_words"] = data.words.apply(lambda x: lemmatize_text(x))

In [8]:
data.head()

Unnamed: 0,publish_date,headline_text,words,lemmatized_words
0,20030219,aba decides against community broadcasting lic...,"[aba, decides, community, broadcasting, licence]","[aba, decides, community, broadcasting, licence]"
1,20030219,act fire witnesses must be aware of defamation,"[act, fire, witnesses, must, aware, defamation]","[act, fire, witness, must, aware, defamation]"
2,20030219,a g calls for infrastructure protection summit,"[calls, infrastructure, protection, summit]","[call, infrastructure, protection, summit]"
3,20030219,air nz staff in aust strike for pay rise,"[air, staff, aust, strike, pay, rise]","[air, staff, aust, strike, pay, rise]"
4,20030219,air nz strike to affect australian travellers,"[air, strike, affect, australian, travellers]","[air, strike, affect, australian, traveller]"


In [9]:
bow = dict()
def create_bow(words):
    for w in words:
        if w not in bow:
            bow[w] = 1
        else:
            bow[w] += 1
    return len(words)
data["len_of_headlines"] = data.lemmatized_words.apply(lambda x: create_bow(x))
"number of words in all the documents = ", len(bow)

('number of words in all the documents = ', 82064)

In [15]:
bow

{'ramdev': 1,
 'acquire': 35,
 'moe': 61,
 'guulty': 1,
 'afghani': 26,
 'bunkering': 1,
 'bosque': 9,
 'eardrum': 4,
 'hustling': 1,
 'lelia': 1,
 'deduction': 25,
 'anphilippines': 2,
 'keysborough': 9,
 'loka': 1,
 'quadbike': 8,
 'reebok': 4,
 'saiki': 4,
 'ogilvy': 172,
 'dieter': 18,
 'spcs': 2,
 'appease': 25,
 'mance': 1,
 'suspicon': 1,
 'waterski': 8,
 'minuscule': 1,
 'rucking': 1,
 'nexens': 1,
 'hoxton': 4,
 'wootton': 1,
 'chepchumba': 3,
 'divya': 1,
 'millroy': 2,
 'bushcare': 1,
 'kunawarritji': 1,
 'ravaillion': 2,
 'menopausal': 6,
 'elaborate': 16,
 'diatreme': 2,
 'pilley': 1,
 'obession': 1,
 'martindale': 8,
 'worton': 1,
 'cootes': 17,
 'willies': 1,
 'sunshine': 807,
 'colanya': 1,
 'theileria': 11,
 'milnehockey': 1,
 'hannity': 2,
 'enclave': 2,
 'yearsl': 1,
 'caira': 13,
 'undara': 1,
 'medihotel': 2,
 'pinewood': 1,
 'afghanistannew': 1,
 'armstrong': 560,
 'meteorologist': 22,
 'herdshare': 1,
 'manner': 18,
 'expunged': 1,
 'overmars': 2,
 'suspicious': 

In [10]:
data.head()

Unnamed: 0,publish_date,headline_text,words,lemmatized_words,len_of_headlines
0,20030219,aba decides against community broadcasting lic...,"[aba, decides, community, broadcasting, licence]","[aba, decides, community, broadcasting, licence]",5
1,20030219,act fire witnesses must be aware of defamation,"[act, fire, witnesses, must, aware, defamation]","[act, fire, witness, must, aware, defamation]",6
2,20030219,a g calls for infrastructure protection summit,"[calls, infrastructure, protection, summit]","[call, infrastructure, protection, summit]",4
3,20030219,air nz staff in aust strike for pay rise,"[air, staff, aust, strike, pay, rise]","[air, staff, aust, strike, pay, rise]",6
4,20030219,air nz strike to affect australian travellers,"[air, strike, affect, australian, travellers]","[air, strike, affect, australian, traveller]",5


In [11]:
"number of rows = ", data.shape[0]

('number of rows = ', 1103665)

In [13]:
# filter bow: 
filterd_bow = dict()
for key in bow.keys():
    if bow[key] >= 10:
        filterd_bow[key] = bow[key]
    else:
        continue
"number of filtered words = ", len(filterd_bow)

('number of filtered words = ', 22167)

In [14]:
filterd_bow

{'pitt': 144,
 'acquire': 35,
 'moe': 61,
 'afghani': 26,
 'tainted': 75,
 'indianapolis': 33,
 'behalf': 16,
 'bosca': 84,
 'oxley': 39,
 'ogilvy': 172,
 'dieter': 18,
 'nalder': 61,
 'appease': 25,
 'sewing': 22,
 'ashtray': 17,
 'mismanaged': 11,
 'abyss': 12,
 'betraying': 12,
 'pseudoephedrine': 24,
 'offence': 1021,
 'crowley': 36,
 'plaque': 77,
 'passion': 163,
 'elaborate': 16,
 'gsk': 10,
 'fairfax': 415,
 'brennan': 78,
 'face': 11267,
 'cootes': 17,
 'moo': 25,
 'sunshine': 807,
 'peck': 10,
 'beneficial': 19,
 'meteorologist': 22,
 'given': 1523,
 'manner': 18,
 'suspicious': 1058,
 'catastrophe': 52,
 'morris': 255,
 'hamstring': 102,
 'senior': 1100,
 'pluto': 81,
 'pep': 15,
 'recharge': 13,
 'lebanon': 636,
 'canine': 39,
 'boyanup': 10,
 'redgum': 14,
 'mcleish': 20,
 'harrop': 15,
 'collaborative': 12,
 'electoral': 569,
 'fox': 642,
 'unemployed': 85,
 'fishway': 10,
 'foe': 72,
 'centrebet': 11,
 'shut': 610,
 'crusade': 31,
 'unfolds': 17,
 'liberian': 101,
 'psyc

In [19]:
# sort filtered bow by values
sorted_bow = sorted(filterd_bow.items(), key=lambda k: k[1])

In [20]:
sorted_bow

[('gsk', 10),
 ('peck', 10),
 ('boyanup', 10),
 ('fishway', 10),
 ('rasheed', 10),
 ('northparkes', 10),
 ('exorcise', 10),
 ('exercising', 10),
 ('overheated', 10),
 ('melrose', 10),
 ('guido', 10),
 ('poore', 10),
 ('faye', 10),
 ('evocca', 10),
 ('supersub', 10),
 ('kalgoorlies', 10),
 ('cipo', 10),
 ('concussed', 10),
 ('serpent', 10),
 ('oneal', 10),
 ('thein', 10),
 ('deegan', 10),
 ('hasleby', 10),
 ('populism', 10),
 ('pneumococcal', 10),
 ('meatworker', 10),
 ('whisker', 10),
 ('jeffers', 10),
 ('fesus', 10),
 ('corinthian', 10),
 ('bok', 10),
 ('reassessment', 10),
 ('diarrhoea', 10),
 ('balding', 10),
 ('banishes', 10),
 ('spineless', 10),
 ('mammography', 10),
 ('birnie', 10),
 ('tae', 10),
 ('holed', 10),
 ('limping', 10),
 ('liked', 10),
 ('melaleuca', 10),
 ('wotif', 10),
 ('nil', 10),
 ('hooliganism', 10),
 ('pardoo', 10),
 ('curlew', 10),
 ('nymboida', 10),
 ('wark', 10),
 ('ramanauskas', 10),
 ('waver', 10),
 ('intensifying', 10),
 ('lan', 10),
 ('epstein', 10),
 ('mi

In [21]:
"number of words = ", len(sorted_bow)

('number of words = ', 22167)

In [29]:
#select first 15000 words
selected_bow = dict()
def select_top_words(word_dict):
    count = 0
    for tup in word_dict:
        if count < 15000:
            selected_bow[tup[0]] = tup[1]
            count += 1
        else:
            break
select_top_words(sorted_bow)

In [30]:
len(selected_bow)

15000

In [31]:
selected_bow

{'acquire': 35,
 'moe': 61,
 'tiered': 10,
 'afghani': 26,
 'hammam': 11,
 'walton': 31,
 'moo': 25,
 'frederik': 13,
 'penis': 48,
 'gateway': 71,
 'culina': 80,
 'behalf': 16,
 'pep': 15,
 'spearhead': 51,
 'vege': 39,
 'oxley': 39,
 'forklift': 51,
 'dieter': 18,
 'nalder': 61,
 'appease': 25,
 'sewing': 22,
 'ashtray': 17,
 'mismanaged': 11,
 'molinari': 10,
 'enforces': 14,
 'smiling': 36,
 'madame': 10,
 'toorale': 22,
 'betraying': 12,
 'geoffrey': 72,
 'pseudoephedrine': 24,
 'dizzy': 36,
 'trespass': 57,
 'crowley': 36,
 'elaborate': 16,
 'gsk': 10,
 'grandson': 53,
 'cootes': 17,
 'superintendent': 39,
 'warkworth': 23,
 'donetsk': 11,
 'schipper': 52,
 'kewells': 12,
 'mercedes': 54,
 'peck': 10,
 'beneficial': 19,
 'meteorologist': 22,
 'hardware': 74,
 'magna': 17,
 'unleashes': 50,
 'catastrophe': 52,
 'traction': 15,
 'walshs': 14,
 'connie': 10,
 'exceeding': 15,
 'chirac': 50,
 'pluto': 81,
 'simone': 23,
 'hanna': 31,
 'stallion': 15,
 'ochoa': 49,
 'recharge': 13,
 '