# Select out daily popular topics
**Objective**: For each day, select out daily popular topic by analyzing high frequency terms in news titles of that day.

# Roadmap
1. Build news title docs for each day
2. Find out high frequency word in each news title doc
3. Write out to csv files for manually inspection

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import collections
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd
import nltk


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
nb_name = '20171002-daheng-select_daily_popular_topics'

news_period_title_docs_pkl = os.path.join(config.TMP_DIR, '{}-{}'.format(nb_name, 'news-period-title_docs.sr.pkl'))
news_title_docs_high_freq_words_df_pkl = os.path.join(config.TMP_DIR, '{}-{}'.format(nb_name, 'news-title_docs-high_freq_words.df.pkl'))

## Build news title docs for each day

In [2]:
if 1 == 1:
    '''
    Load in pickle for news data over selected period.
    '''
    news_period_df = pd.read_pickle(config.NEWS_PERIOD_DF_PKL)

In [3]:
"""
Print any single news title
"""
news_period_df.loc[3, 'news_title']

'At least 4 dead in attack in Kabul, official says'

In [4]:
"""
Print complete news titles
"""
with pd.option_context('display.max_colwidth', 100):
    display(news_period_df[['news_collected_time', 'news_title']])

Unnamed: 0,news_collected_time,news_title
0,2014-11-18,Missouri's Nixon Declares State of Emergency Awaiting Grand Jury
1,2014-11-18,"PEOPLE: Bill Cosby. Charles Manson, Solange Knowles and more!"
2,2014-11-18,Ebola patient who died had received ZMapp late in his treatment
3,2014-11-18,"At least 4 dead in attack in Kabul, official says"
4,2014-11-18,Australia will not be at periphery of India's vision: Modi
5,2014-11-18,FBI: Violence could follow Ferguson indictment decision
6,2014-11-18,Four Killed in Palestinian Attack at Jerusalem Synagogue
7,2014-11-18,"Mass murderer Charles Manson issued marriage license, may get hitched next ..."
8,2014-11-18,News Guide: Texas' latest history textbook tussle
9,2014-11-18,Abdul-Rahman Kassig's parents mourn 'beloved son'


In [11]:
"""
Group news by day of news_collected_time and concatenate news_titles
"""
news_titles_sr = news_period_df.resample('D', on='news_collected_time')['news_title'].apply(lambda x: '\n'.join(x))

In [12]:
"""
Print any single news title doc
"""
print(news_titles_sr.iloc[0])
# news_titles_sr.loc['2015-01-01']

Missouri's Nixon Declares State of Emergency Awaiting Grand Jury
PEOPLE: Bill Cosby. Charles Manson, Solange Knowles and more!
Ebola patient who died had received ZMapp late in his treatment
At least 4 dead in attack in Kabul, official says
Australia will not be at periphery of India's vision: Modi
FBI: Violence could follow Ferguson indictment decision
Four Killed in Palestinian Attack at Jerusalem Synagogue
Mass murderer Charles Manson issued marriage license, may get hitched next ...
News Guide: Texas' latest history textbook tussle
Abdul-Rahman Kassig's parents mourn 'beloved son'
Obama orders full review of US hostage policy
Homeless Children in US: A parent-to-parent approach to help kids (+video)
Alleged Bill Cosby victim has connection to Colorado
Church of England approves women bishops
Uber executive wants to dig into personal lives, discredit journalists who cover ...
Suicide blast kills two at Kabul's foreign compound
Answers to questions about the Ferguson grand jury
Four 

In [13]:
"""
Print all news title docs
"""
with pd.option_context('display.max_colwidth', 130):
    print(news_titles_sr)

news_collected_time
2014-11-18    Missouri's Nixon Declares State of Emergency Awaiting Grand Jury\nPEOPLE: Bill Cosby. Charles Manson, Solange Knowles and more...
2014-11-19    Early winter pummels much of country, strands motorists, emergency vehicles\nAt the site of Jerusalem terror attack, no calls ...
2014-11-20    Americans brace for more icy temperatures and snow as ferocious storms linger\nREFILE-UPDATE 5-NBC, Netflix cancel Bill Cosby'...
2014-11-21    Obama unveils actions to spare some illegal immigrants\nTears, smiles in Nevada over US immigration reform some call bitterswe...
2014-11-22    Activists Rush to Help People Use Obama Immigration Plan\nOfficial: Ferguson grand jury still meeting\nFamily of NYC man kille...
2014-11-23    Mike Brown's Mom Urges Ferguson Protesters To Remain Peaceful\n6 things to watch for this holiday shopping season\nWork begins...
2014-11-24    Obama: Americans want 'new car smell' in 2016\nFormer DC Mayor Marion Barry Dies At 78 « CBS Baltimore

In [14]:
"""
Make tmp sr pickle
"""
if 0 == 1:
    news_titles_sr.to_pickle(news_period_title_docs_pkl)

## Find out high frequency word in each news title doc

In [15]:
"""
Load tmp sr pickle for news title docs
"""
if 1 == 1:
    news_titles_sr = pd.read_pickle(news_period_title_docs_pkl)

In [16]:
test_str = news_titles_sr.iloc[0]

In [19]:
test_str

'Missouri\'s Nixon Declares State of Emergency Awaiting Grand Jury\nPEOPLE: Bill Cosby. Charles Manson, Solange Knowles and more!\nEbola patient who died had received ZMapp late in his treatment\nAt least 4 dead in attack in Kabul, official says\nAustralia will not be at periphery of India\'s vision: Modi\nFBI: Violence could follow Ferguson indictment decision\nFour Killed in Palestinian Attack at Jerusalem Synagogue\nMass murderer Charles Manson issued marriage license, may get hitched next ...\nNews Guide: Texas\' latest history textbook tussle\nAbdul-Rahman Kassig\'s parents mourn \'beloved son\'\nObama orders full review of US hostage policy\nHomeless Children in US: A parent-to-parent approach to help kids (+video)\nAlleged Bill Cosby victim has connection to Colorado\nChurch of England approves women bishops\nUber executive wants to dig into personal lives, discredit journalists who cover ...\nSuicide blast kills two at Kabul\'s foreign compound\nAnswers to questions about the F

In [20]:
"""
Check stop-words
"""
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)
print('Count: {}'.format(len(stop_words)))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [21]:
"""
Pre-processing steps
"""
if 1 == 1:
    tokens = nltk.word_tokenize(test_str)
    
    '''
    Regonize NE in tokens and preserve
    
    Note: performance of NE recognition depends on the pre-trained model provided in nltk package
    '''
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Each element of chunks is either a (word, pos) tuple or a Tree() containing the parts of the chunk
    tokens = [chunk[0] if isinstance(chunk, tuple) else ' '.join(node[0] for node in chunk) for chunk in chunks]
    
    '''
    Remove non-alphabetical tokens
    '''
    # tokens = [token for token in tokens if token.isalpha()]
    
    '''
    Remove stop-words
    '''
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    '''
    Remove misc single punctuation tokens
    '''
    misc_punc_lst = [",", ":", "'", "'s", ";", ".", "?", "(", ")", "..."]
    tokens = [token for token in tokens if token not in misc_punc_lst]
    
    print(tokens)

['Missouri', 'Nixon Declares State', 'Emergency Awaiting Grand Jury', 'PEOPLE', 'Bill Cosby', 'Charles Manson', 'Solange Knowles', '!', 'Ebola', 'patient', 'died', 'received', 'ZMapp', 'late', 'treatment', 'least', '4', 'dead', 'attack', 'Kabul', 'official', 'says', 'Australia', 'periphery', 'India', 'vision', 'Modi FBI', 'Violence', 'could', 'follow', 'Ferguson', 'indictment', 'decision', 'Four', 'Killed', 'Palestinian', 'Attack', 'Jerusalem Synagogue Mass', 'murderer', 'Charles Manson', 'issued', 'marriage', 'license', 'may', 'get', 'hitched', 'next', 'News', 'Guide', 'Texas', 'latest', 'history', 'textbook', 'tussle', 'Abdul-Rahman', 'Kassig', 'parents', 'mourn', "'beloved", "son'", 'Obama', 'orders', 'full', 'review', 'US', 'hostage', 'policy', 'Homeless Children', 'US', 'parent-to-parent', 'approach', 'help', 'kids', '+video', 'Alleged Bill Cosby', 'victim', 'connection', 'Colorado Church', 'England', 'approves', 'women', 'bishops', 'Uber', 'executive', 'wants', 'dig', 'personal',

In [22]:
"""
Count token frequency and print
"""
if 1 == 1:
    token_counter = collections.Counter(tokens)
    print(str(token_counter.most_common(50)))

[('Uber', 20), ('attack', 15), ('Charles Manson', 13), ('Obama', 13), ('US', 13), ('Ferguson', 12), ('Bill Cosby', 11), ('Jerusalem', 10), ('synagogue', 10), ('2014', 10), ('orders', 9), ('review', 9), ('policy', 9), ('marriage', 8), ('hostage', 8), ('Chicago', 8), ('-', 8), ('Senate', 8), ('says', 7), ('license', 7), ('home', 7), ('4', 6), ('grand', 6), ('jury', 6), ('near', 6), ('crashes', 6), ('storm', 6), ('NFL', 6), ('Adrian Peterson', 6), ('``', 6), ("''", 6), ('Missouri', 5), ('Ebola', 5), ('Kabul', 5), ('decision', 5), ('kills', 5), ('killed', 5), ('police', 5), ('gets', 5), ('Exec', 5), ('Police', 5), ('Keystone', 5), ('plane', 5), ('1', 5), ('House', 5), ('Word', 5), ('student', 5), ('Tracy Morgan', 5), ('!', 4), ('least', 4)]


In [23]:
%%time
"""
Warp up previous pre-process steps in function and apply to all news title docs.
Put resluts in df.
"""
def count_high_freq_words(news_title_doc):
    """
    param new_title_doc: a string of news title doc
    return: a string of high frequency words in doc
    """
    tokens = nltk.word_tokenize(news_title_doc)
    
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Each element of chunks is either a (word, pos) tuple or a Tree() containing the parts of the chunk
    tokens = [chunk[0] if isinstance(chunk, tuple) else ' '.join(node[0] for node in chunk) for chunk in chunks]
    
    stop_words = nltk.corpus.stopwords.words('english')
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    misc_punc_lst = [",", ":", "'", "'s", ";", ".", "?", "(", ")", "..."]
    tokens = [token for token in tokens if token not in misc_punc_lst]
    
    token_counter = collections.Counter(tokens)
    return str(token_counter.most_common(50))

if 1 == 1:
    results_dict = {}
    
    for ind_val, sr_val in news_titles_sr.iteritems():
        results_dict[ind_val] = count_high_freq_words(sr_val)
        
    high_freq_words_sr = pd.Series(results_dict)

CPU times: user 4min 18s, sys: 28 ms, total: 4min 18s
Wall time: 4min 18s


In [24]:
with pd.option_context('display.max_rows', 150, 'display.max_colwidth', 130):
    print(high_freq_words_sr)

2014-11-18    [('Uber', 20), ('attack', 15), ('Charles Manson', 13), ('Obama', 13), ('US', 13), ('Ferguson', 12), ('Bill Cosby', 11), ('Jeru...
2014-11-19    [('Man', 19), ('Obama', 19), ('Bill Cosby', 17), ('immigration', 15), ("'Sexiest", 12), ('Missouri', 10), ('GOP', 10), ('video...
2014-11-20    [('Obama', 23), ('Buffalo', 14), ('Mike Nichols', 14), ('immigration', 13), ('Iran', 12), ('FSU', 12), ('Bill Cosby', 11), ('3...
2014-11-21    [('Obama', 41), ('immigration', 20), ('Iran', 17), ('nuclear', 13), ('Ferguson', 13), ('Buffalo', 12), ('police', 10), ('kille...
2014-11-22    [('Obama', 34), ('Afghanistan', 17), ('immigration', 16), ('US', 15), ('28', 14), ('Japan', 13), ('role', 12), ('bus', 11), ('...
2014-11-23    [('Obama', 20), ('Iran', 15), ('Ferguson', 14), ('Marion Barry', 14), ('45', 13), ('immigration', 12), ('grand', 12), ('jury',...
2014-11-24    [('Iran', 22), ('police', 18), ('2016', 17), ('nuclear', 16), ('talks', 16), ('Cleveland', 15), ('Ferguson', 14), ('boy', 

In [25]:
"""
Make tmp df pickle
"""
if 0 == 1:
    news_title_docs_high_freq_words_df = pd.concat([news_titles_sr, high_freq_words_sr], axis=1)
    news_title_docs_high_freq_words_df.columns = ['news_title_doc', 'high_freq_words']

    news_title_docs_high_freq_words_df.to_pickle(news_title_docs_high_freq_words_df_pkl)

In [26]:
"""
Check results
"""
news_title_docs_high_freq_words_df = pd.read_pickle(news_title_docs_high_freq_words_df_pkl)
with pd.option_context('display.max_colwidth', 100):
    display(news_title_docs_high_freq_words_df)

Unnamed: 0_level_0,news_title_doc,high_freq_words
news_collected_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-11-18,Missouri's Nixon Declares State of Emergency Awaiting Grand Jury\nPEOPLE: Bill Cosby. Charles Ma...,"[('Uber', 20), ('attack', 15), ('Charles Manson', 13), ('Obama', 13), ('US', 13), ('Ferguson', 1..."
2014-11-19,"Early winter pummels much of country, strands motorists, emergency vehicles\nAt the site of Jeru...","[('Man', 19), ('Obama', 19), ('Bill Cosby', 17), ('immigration', 15), (""'Sexiest"", 12), ('Missou..."
2014-11-20,Americans brace for more icy temperatures and snow as ferocious storms linger\nREFILE-UPDATE 5-N...,"[('Obama', 23), ('Buffalo', 14), ('Mike Nichols', 14), ('immigration', 13), ('Iran', 12), ('FSU'..."
2014-11-21,"Obama unveils actions to spare some illegal immigrants\nTears, smiles in Nevada over US immigrat...","[('Obama', 41), ('immigration', 20), ('Iran', 17), ('nuclear', 13), ('Ferguson', 13), ('Buffalo'..."
2014-11-22,Activists Rush to Help People Use Obama Immigration Plan\nOfficial: Ferguson grand jury still me...,"[('Obama', 34), ('Afghanistan', 17), ('immigration', 16), ('US', 15), ('28', 14), ('Japan', 13),..."
2014-11-23,Mike Brown's Mom Urges Ferguson Protesters To Remain Peaceful\n6 things to watch for this holida...,"[('Obama', 20), ('Iran', 15), ('Ferguson', 14), ('Marion Barry', 14), ('45', 13), ('immigration'..."
2014-11-24,Obama: Americans want 'new car smell' in 2016\nFormer DC Mayor Marion Barry Dies At 78 « CBS Bal...,"[('Iran', 22), ('police', 18), ('2016', 17), ('nuclear', 16), ('talks', 16), ('Cleveland', 15), ..."
2014-11-25,Could Obama choose a woman as next Defense secretary? One name tops list. (+video)\nWith No Imme...,"[('US', 18), ('Iran', 17), ('FDA', 16), ('Thanksgiving', 16), ('Ferguson', 16), ('Hong Kong', 16..."
2014-11-26,"Mississippi same-sex marriage ban overturned\nRain, snow could mess up plans for Thanksgiving tr...","[('Thanksgiving', 46), ('Hong Kong', 19), ('Obama', 18), ('Ferguson', 16), ('Police', 15), ('US'..."
2014-11-27,Ferguson shooting: Governor 'rejects calls for second jury'\nSpecial forces free eight hostages ...,"[('Thanksgiving', 44), ('British', 16), ('Ferguson', 15), ('shooting', 13), ('OPEC', 12), ('lead..."


In [27]:
with pd.option_context('display.max_colwidth', 130):
    print(news_title_docs_high_freq_words_df['high_freq_words'])

news_collected_time
2014-11-18    [('Uber', 20), ('attack', 15), ('Charles Manson', 13), ('Obama', 13), ('US', 13), ('Ferguson', 12), ('Bill Cosby', 11), ('Jeru...
2014-11-19    [('Man', 19), ('Obama', 19), ('Bill Cosby', 17), ('immigration', 15), ("'Sexiest", 12), ('Missouri', 10), ('GOP', 10), ('video...
2014-11-20    [('Obama', 23), ('Buffalo', 14), ('Mike Nichols', 14), ('immigration', 13), ('Iran', 12), ('FSU', 12), ('Bill Cosby', 11), ('3...
2014-11-21    [('Obama', 41), ('immigration', 20), ('Iran', 17), ('nuclear', 13), ('Ferguson', 13), ('Buffalo', 12), ('police', 10), ('kille...
2014-11-22    [('Obama', 34), ('Afghanistan', 17), ('immigration', 16), ('US', 15), ('28', 14), ('Japan', 13), ('role', 12), ('bus', 11), ('...
2014-11-23    [('Obama', 20), ('Iran', 15), ('Ferguson', 14), ('Marion Barry', 14), ('45', 13), ('immigration', 12), ('grand', 12), ('jury',...
2014-11-24    [('Iran', 22), ('police', 18), ('2016', 17), ('nuclear', 16), ('talks', 16), ('Cleveland', 15), ('Ferg

## Write out to csv files for manually inspection

In [2]:
"""
Write out all news titles to csv file
"""
if 0 == 1:
    news_titles_csv_file = os.path.join(config.HR_DIR, 'news_titles.csv')
    news_period_df = pd.read_pickle(config.NEWS_PERIOD_DF_PKL)
    
    news_period_df.to_csv(path_or_buf=news_titles_csv_file, columns=['news_collected_time', 'news_title'], sep='\t', header=True, index=True)

In [4]:
"""
Write out news title high freq words to csv file
"""
if 0 == 1:
    news_title_high_freq_words_csv_file = os.path.join(config.HR_DIR, 'news_title_high_freq_words.csv')
    news_title_docs_high_freq_words_df = pd.read_pickle(news_title_docs_high_freq_words_df_pkl)
    
    news_title_docs_high_freq_words_df.to_csv(path_or_buf=news_title_high_freq_words_csv_file, columns=['high_freq_words'], sep='\t', header=True, index=True)