# Select out daily popular topics
**Objective**: For each day, select the daily popular topics by analyzing high-frequency terms in the news titles of that day.

# Roadmap
1. Build news title docs for each day
2. Find the high-frequency words in each news title doc

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import collections
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd
import nltk


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Notebook identifier; it prefixes the name of every temporary pickle below.
nb_name = '20171002-daheng-select_daily_popular_topics'


def _tmp_pkl_path(suffix):
    """Return the path under config.TMP_DIR of the file named '<nb_name>-<suffix>'."""
    return os.path.join(config.TMP_DIR, '{}-{}'.format(nb_name, suffix))


# Pickle of the per-day news title docs (a pandas Series).
news_period_title_docs_pkl = _tmp_pkl_path('news-period-title_docs.sr.pkl')
# Pickle of the DataFrame pairing each title doc with its high-frequency words.
news_title_docs_high_freq_words_df_pkl = _tmp_pkl_path('news-title_docs-high_freq_words.df.pkl')

## Build news title docs for each day

In [4]:
# Load the pickled news data for the selected period.
# The guard is a constant-false manual switch, so the load is skipped as
# written. Later cells read news_period_df, so change the guard to 1 == 1
# when the frame is not already in memory.
if 0 == 1:
    news_period_df = pd.read_pickle(config.NEWS_PERIOD_DF_PKL)

In [3]:
"""
Print any single news title
"""
# Spot check: .loc is label-based, so this shows the title of the row labelled 3.
news_period_df.loc[3, 'news_title']

'Jeb Bush quits board posts ahead of possible White House run reports'

In [4]:
"""
Print complete news titles
"""
# Raise the column display width to 100 characters for this cell only;
# option_context restores the previous setting when the block exits.
with pd.option_context('display.max_colwidth', 100):
    display(news_period_df[['post_time', 'news_title']])

Unnamed: 0,post_time,news_title
0,2015-01-01 00:03:09,"Jeb Bush takes ""natural next step"" toward 2016 bid"
1,2015-01-01 00:03:26,"Fireworks, parties and prayers usher in 2015"
2,2015-01-01 00:04:41,2 Killed in Helicopter Crash in Southern Arizona
3,2015-01-01 00:04:41,Jeb Bush quits board posts ahead of possible White House run reports
4,2015-01-01 00:06:16,North Korea's Kim Jong Un to South Korean leader: Let's meet
5,2015-01-01 00:08:39,Western states get brutal blast of winter
6,2015-01-01 00:09:14,Abbas paves way to join International Criminal Court
7,2015-01-01 00:12:44,A look at Egypt's Al-Jazeera English trial
8,2015-01-01 00:16:27,"Storm Brings Snow, Cold to West for New Year's"
9,2015-01-01 00:17:17,"Storm brings snow, cold to West for New Year's"


In [5]:
"""
Group news by day of post_time and concatenate news_titles
"""
news_titles_sr = (
    news_period_df
    .resample('D', on='post_time')['news_title']  # one bucket per day of post_time
    .apply(' '.join)  # join each day's titles with single spaces
)

In [6]:
"""
Print any single news title doc
"""
# Spot check: show the first daily title doc by position.
news_titles_sr.iloc[0]
# Label-based alternative, selecting the doc by its date:
# news_titles_sr.loc['2015-01-01']

'Jeb Bush takes "natural next step" toward 2016 bid Fireworks, parties and prayers usher in 2015 2 Killed in Helicopter Crash in Southern Arizona Jeb Bush quits board posts ahead of possible White House run reports North Korea\'s Kim Jong Un to South Korean leader: Let\'s meet Western states get brutal blast of winter Abbas paves way to join International Criminal Court A look at Egypt\'s Al-Jazeera English trial Storm Brings Snow, Cold to West for New Year\'s Storm brings snow, cold to West for New Year\'s A look at the trial of 3 Al-Jazeera English journalists imprisoned in Egypt over a year Storm brings snow, cold to West for New Year\'s Storm brings snow, cold to West for New Year\'s Mohamed Fahmy, Canadian imprisoned in Egypt, to get retrial North Korean leader open to summit with South 2 killed in helicopter crash in southern Arizona Egypt court orders retrial in Al-Jazeera case North Korean leader open to summit with South New Year\'s Stampede in Shanghai Prompts Anxious Wait Fo

In [7]:
"""
Print all news title docs
"""
# Widen the column display to 130 characters for this cell only so more of
# each daily doc is visible in the printed Series.
with pd.option_context('display.max_colwidth', 130):
    print(news_titles_sr)

post_time
2015-01-01    Jeb Bush takes "natural next step" toward 2016 bid Fireworks, parties and prayers usher in 2015 2 Killed in Helicopter Crash i...
2015-01-02    West Virginia Police Shooting: 2 Officers Injured, 2 Dead Bodies Found Inside ... Polar Bear Plunge draws big crowds despite 2...
2015-01-03    7-year-old survives plane crash that kills 4 in Kentucky The Life and Times of Mario Cuomo People: Harry Reid injured in exerc...
2015-01-04    Pakistan Strikes Kill 31 Militants, Drone Kills 7 Republicans take control in House, Senate Tuesday Pakistan Strikes Kill 31 M...
2015-01-05    Nest's thermostat gets smarter with support for more third-party devices national football league playoffs: first round Stuart...
2015-01-06    German Anti-Islam Protests Hit Record Numbers UPDATE 2-Sony CEO praises employees, partners for standing up to hackers Three t...
2015-01-07    Tail of AirAsia plane located in Java Sea, Indonesian official says California breaks ground on bullet train as 

In [8]:
"""
Make tmp sr pickle
"""
# Constant-false manual switch: the pickle write is skipped as written.
if 0 == 1:
    news_titles_sr.to_pickle(news_period_title_docs_pkl)

## Find the high-frequency words in each news title doc

In [2]:
"""
Load tmp sr pickle for news title docs
"""
# Constant-true manual switch: the daily title docs are always reloaded from
# the temporary pickle as written.
if 1 == 1:
    news_titles_sr = pd.read_pickle(news_period_title_docs_pkl)

In [3]:
# Take the first daily title doc as the sample string for the pre-processing experiments.
test_str = news_titles_sr.iloc[0]

In [4]:
# Echo the sample doc as the cell output.
test_str

'Jeb Bush takes "natural next step" toward 2016 bid Fireworks, parties and prayers usher in 2015 2 Killed in Helicopter Crash in Southern Arizona Jeb Bush quits board posts ahead of possible White House run reports North Korea\'s Kim Jong Un to South Korean leader: Let\'s meet Western states get brutal blast of winter Abbas paves way to join International Criminal Court A look at Egypt\'s Al-Jazeera English trial Storm Brings Snow, Cold to West for New Year\'s Storm brings snow, cold to West for New Year\'s A look at the trial of 3 Al-Jazeera English journalists imprisoned in Egypt over a year Storm brings snow, cold to West for New Year\'s Storm brings snow, cold to West for New Year\'s Mohamed Fahmy, Canadian imprisoned in Egypt, to get retrial North Korean leader open to summit with South 2 killed in helicopter crash in southern Arizona Egypt court orders retrial in Al-Jazeera case North Korean leader open to summit with South New Year\'s Stampede in Shanghai Prompts Anxious Wait Fo

In [5]:
"""
Check stop-words
"""
# English stop-word list from the NLTK stopwords corpus; the guarded
# pre-processing cell reads this module-level name when filtering tokens.
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words)
print('Count: {}'.format(len(stop_words)))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [8]:
"""
Pre-processing steps
"""
if 0 == 1:
    tokens = nltk.word_tokenize(test_str)

    # Recognize named entities (NE) in the tokens and keep each one together.
    # Note: NE recognition quality depends on the pre-trained model provided
    # in the nltk package.
    chunks = nltk.ne_chunk(nltk.pos_tag(tokens))

    # Each element of chunks is either a (word, pos) tuple or a Tree()
    # containing the parts of the chunk; a Tree's words are joined with
    # spaces into a single token.
    tokens = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            tokens.append(chunk[0])
        else:
            tokens.append(' '.join(node[0] for node in chunk))

    # Disabled option: remove non-alphabetical tokens.
    # tokens = [token for token in tokens if token.isalpha()]

    # Remove stop-words (compared in lower case) and misc single punctuation
    # tokens in one pass.
    misc_punc_lst = [",", ":", "'", "'s", ";", ".", "?", "(", ")", "..."]
    tokens = [
        token for token in tokens
        if token.lower() not in stop_words and token not in misc_punc_lst
    ]

    print(tokens)

['Jeb', 'Bush', 'takes', '``', 'natural', 'next', 'step', "''", 'toward', '2016', 'bid', 'Fireworks', 'parties', 'prayers', 'usher', '2015', '2', 'Killed', 'Helicopter', 'Crash', 'Southern Arizona', 'Jeb', 'Bush', 'quits', 'board', 'posts', 'ahead', 'possible', 'White House', 'run', 'reports', 'North Korea', 'Kim Jong Un', 'South', 'Korean', 'leader', 'Let', 'meet', 'Western', 'states', 'get', 'brutal', 'blast', 'winter', 'Abbas', 'paves', 'way', 'join', 'International Criminal', 'Court', 'look', 'Egypt', 'Al-Jazeera', 'English', 'trial', 'Storm Brings Snow', 'Cold', 'West', 'New Year', 'Storm', 'brings', 'snow', 'cold', 'West', 'New Year', 'look', 'trial', '3', 'Al-Jazeera', 'English', 'journalists', 'imprisoned', 'Egypt', 'year', 'Storm', 'brings', 'snow', 'cold', 'West', 'New Year', 'Storm', 'brings', 'snow', 'cold', 'West', 'New Year', 'Mohamed Fahmy', 'Canadian', 'imprisoned', 'Egypt', 'get', 'retrial', 'North Korean', 'leader', 'open', 'summit', 'South', '2', 'killed', 'helicopte

In [9]:
"""
Count token frequency and print
"""
if 0 == 1:
    # Tally every remaining token and show the 50 most frequent.
    # print() applies str() to the list itself.
    token_counter = collections.Counter(tokens)
    print(token_counter.most_common(50))

[('New Year', 15), ('retrial', 12), ('Year', 12), ('killed', 11), ('New', 11), ('Shanghai', 11), ('says', 11), ('police', 11), ('2015', 10), ('run', 10), ('stampede', 10), ('Police', 10), ('Bush', 9), ('2', 9), ('board', 9), ('possible', 9), ('leader', 9), ('Egypt', 9), ('year', 9), ('open', 9), ('court', 9), ('orders', 9), ('Jeb', 8), ('South', 8), ('Al-Jazeera', 8), ('journalists', 8), ('Eve', 8), ('wedding', 8), ('man', 8), ('2016', 7), ('West', 7), ('summit', 7), ('Jeb Bush', 7), ('resigns', 7), ('talks', 7), ('36', 7), ('new', 7), ('kills', 7), ('Florida', 7), ('Storm', 6), ('snow', 6), ('case', 6), ('mother', 6), ('Syria', 6), ('GM', 6), ('brings', 5), ('cold', 5), ('3', 5), ('crash', 5), ('20', 5)]


In [3]:
"""
Wrap up the previous pre-processing steps in a function and apply it to all news title docs.
Put the results in a Series.
"""
def count_high_freq_words(news_title_doc, top_n=50):
    """
    Count the most frequent tokens in one news title doc.

    Steps: tokenize the doc, merge each named-entity chunk into a single
    space-joined token, drop English stop-words (compared in lower case)
    and a fixed set of punctuation tokens, then count what is left.

    param news_title_doc: a string of news title doc
    param top_n: how many of the most common tokens to keep (default 50)
    return: str() of a list of (token, count) tuples, most frequent first
    """
    tokens = nltk.word_tokenize(news_title_doc)

    chunks = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Each element of chunks is either a (word, pos) tuple or a Tree() containing the parts of the chunk
    tokens = [chunk[0] if isinstance(chunk, tuple) else ' '.join(node[0] for node in chunk) for chunk in chunks]

    # Sets make each membership test O(1) instead of a scan over a list,
    # which matters because every token of every doc is checked.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    misc_punc = {",", ":", "'", "'s", ";", ".", "?", "(", ")", "..."}

    # Single pass; token order is preserved, so ties in most_common() come
    # out in the same order as with two separate filters.
    tokens = [
        token for token in tokens
        if token.lower() not in stop_words and token not in misc_punc
    ]

    token_counter = collections.Counter(tokens)
    return str(token_counter.most_common(top_n))

if 0 == 1:
    # Series.iteritems() was removed in pandas 2.0; Series.items() yields the
    # same (index label, value) pairs and also exists in older releases.
    results_dict = {
        ind_val: count_high_freq_words(sr_val)
        for ind_val, sr_val in news_titles_sr.items()
    }

    # One entry per day: index label -> str of that day's high-frequency words.
    high_freq_words_sr = pd.Series(results_dict)

In [4]:
# high_freq_words_sr only exists if the guarded cell that builds it has been run.
# Widen the column display to 130 characters for this cell only.
with pd.option_context('display.max_colwidth', 130):
    print(high_freq_words_sr)

2015-01-01    [('New Year', 15), ('retrial', 12), ('Year', 12), ('killed', 11), ('New', 11), ('Shanghai', 11), ('says', 11), ('police', 11),...
2015-01-02    [('AirAsia', 14), ('found', 13), ('New Year', 12), ('man', 12), ('new', 11), ('bodies', 11), ('Mario Cuomo', 10), ('2', 9), ('...
2015-01-03    [('US', 29), ('dies', 21), ('AirAsia', 15), ('Israel', 13), ('crash', 12), ('4', 12), ('plane', 11), ('Girl', 11), ('Palestini...
2015-01-04    [('US', 15), ('new', 13), ('crash', 12), ('Prince', 11), ('Andrew', 11), ('funeral', 11), ('sanctions', 11), ('plane', 11), ('...
2015-01-05    [('dead', 12), ('police', 11), ('$', 11), ('US', 10), ('crash', 10), ('says', 9), ('2', 8), ('5', 7), ('fund', 7), ('home', 7)...
2015-01-06    [('Gov', 17), ('2', 14), ('SpaceX', 12), ('McDonnell', 11), ('Hall', 10), ('shot', 8), ('rocket', 8), ('officers', 8), ('Congr...
2015-01-07    [('FBI', 27), ('NAACP', 16), ('AirAsia', 14), ('plane', 14), ('shooting', 14), ('Yemen', 13), ('Police', 13), ('brother', 

In [6]:
"""
Make tmp df pickle
"""
# Constant-false manual switch: the frame is neither built nor written as written.
if 0 == 1:
    # Place the daily title docs and their high-frequency words side by side,
    # aligned on the shared index.
    news_title_docs_high_freq_words_df = pd.concat([news_titles_sr, high_freq_words_sr], axis=1)
    news_title_docs_high_freq_words_df.columns = ['news_title_doc', 'high_freq_words']

    news_title_docs_high_freq_words_df.to_pickle(news_title_docs_high_freq_words_df_pkl)

In [10]:
"""
Check results
"""
# Reload the combined frame from the temporary pickle, so this cell works
# without re-running the guarded cells that build it.
news_title_docs_high_freq_words_df = pd.read_pickle(news_title_docs_high_freq_words_df_pkl)
# Show up to 100 characters per column for this cell only.
with pd.option_context('display.max_colwidth', 100):
    display(news_title_docs_high_freq_words_df)

Unnamed: 0_level_0,news_title_doc,high_freq_words
post_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,"Jeb Bush takes ""natural next step"" toward 2016 bid Fireworks, parties and prayers usher in 2015 ...","[('New Year', 15), ('retrial', 12), ('Year', 12), ('killed', 11), ('New', 11), ('Shanghai', 11),..."
2015-01-02,"West Virginia Police Shooting: 2 Officers Injured, 2 Dead Bodies Found Inside ... Polar Bear Plu...","[('AirAsia', 14), ('found', 13), ('New Year', 12), ('man', 12), ('new', 11), ('bodies', 11), ('M..."
2015-01-03,7-year-old survives plane crash that kills 4 in Kentucky The Life and Times of Mario Cuomo Peopl...,"[('US', 29), ('dies', 21), ('AirAsia', 15), ('Israel', 13), ('crash', 12), ('4', 12), ('plane', ..."
2015-01-04,"Pakistan Strikes Kill 31 Militants, Drone Kills 7 Republicans take control in House, Senate Tues...","[('US', 15), ('new', 13), ('crash', 12), ('Prince', 11), ('Andrew', 11), ('funeral', 11), ('sanc..."
2015-01-05,Nest's thermostat gets smarter with support for more third-party devices national football leagu...,"[('dead', 12), ('police', 11), ('$', 11), ('US', 10), ('crash', 10), ('says', 9), ('2', 8), ('5'..."
2015-01-06,"German Anti-Islam Protests Hit Record Numbers UPDATE 2-Sony CEO praises employees, partners for ...","[('Gov', 17), ('2', 14), ('SpaceX', 12), ('McDonnell', 11), ('Hall', 10), ('shot', 8), ('rocket'..."
2015-01-07,"Tail of AirAsia plane located in Java Sea, Indonesian official says California breaks ground on ...","[('FBI', 27), ('NAACP', 16), ('AirAsia', 14), ('plane', 14), ('shooting', 14), ('Yemen', 13), ('..."
2015-01-08,Protesters shout at fans as Cosby returns to stage in Canada Charlie Hebdo attack: The hunt for ...,"[('California', 17), ('$', 15), ('People', 14), ('attack', 13), ('bridge', 12), ('says', 11), ('..."
2015-01-09,Police say man threw daughter off bridge Honda fined $70M Boxer won't seek reelection to Senate ...,"[('Obama', 13), ('community', 13), ('2024', 13), ('college', 12), ('bid', 12), ('Olympics', 12),..."
2015-01-10,Mitt Romney ponders another White House run Neighborhood college that is free is backed by Obama...,"[('Obama', 30), ('college', 14), ('AirAsia', 14), ('George Zimmerman', 13), ('SpaceX', 12), ('ar..."
