# Check basic statistics of manually selected topics
**Objective**: make sure manually selected topics have high quality.  
 - Characteristic keywords: easy to recognize associated news.
 - Amount of discussion: reasonable size of associated news and tweets.
 - Consistent in meaning: no drift/dispersion in content.
 - Evolution of event: reasonable time-span of associated news.

# Roadmap
1. Manually compile a list of topics with keywords
2. Check number of associated news and tweets for each topic
3. Check news titles and sample tweets of each topic
4. Check time-span of each topic

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import sqlite3
import time
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Name string for this notebook; none of the cells visible here read it.
nb_name = '20171011-daheng-check_topics_basic_statistics'

## Manually compile a list of topics with keywords
Topic information (category, name, keywords_lst) is manually compiled into config.MANUALLY_SELECTED_TOPICS_LST.

In [2]:
"""
Show every manually selected topic dict, one per output line
"""
selected_topics_lst = config.MANUALLY_SELECTED_TOPICS_LST
for topic in selected_topics_lst:
    print(topic)

{'category': 'politics', 'name': 'Hillary_Clinton_email_controversy', 'keywords_lst': [('email', 'e-mail'), ('Hillary', 'Clinton')]}
{'category': 'politics', 'name': 'Iran_nuclear_deal', 'keywords_lst': ['Iran', 'nuclear']}
{'category': 'politics', 'name': 'Jihadi_John_identity_reveled', 'keywords_lst': ['Jihadi John']}
{'category': 'social', 'name': 'Ferguson_unrest', 'keywords_lst': ['Ferguson']}
{'category': 'social', 'name': 'Hong_Kong_protests', 'keywords_lst': ['Hong Kong']}
{'category': 'social', 'name': 'Sony_cyberattack', 'keywords_lst': ['Sony']}
{'category': 'social', 'name': 'Bill_Cosby_sexual_assault_allegations ', 'keywords_lst': ['Bill Cosby']}
{'category': 'social', 'name': 'SpaceX_fails_rocket_landing', 'keywords_lst': ['SpaceX']}
{'category': 'social', 'name': 'Brian_Williams_fake_war_story  ', 'keywords_lst': ['Brian Williams']}
{'category': 'entertainment', 'name': 'Oscar', 'keywords_lst': ['Oscar']}
{'category': 'entertainment', 'name': 'Super_Bowl', 'keywords_lst'

## Check number of associated news and tweets for each topic

### Build pickle for news and tweets native_id associated with each topic

In [4]:
%%time
"""
Build the topics pickle: attach to every manually selected topic the
native ids of its associated news and tweets, then dump the list.

Register
    TOPICS_LST_PKL = os.path.join(DATA_DIR, 'topics.lst.pkl')
in config.
"""
# Manual on/off switch: the body only runs after changing the guard to `1 == 1`.
if 0 == 1:
    supplement_topics_lst = []

    '''
    Load in pickle for news data over selected period.
    '''
    news_period_df = pd.read_pickle(config.NEWS_PERIOD_DF_PKL)

    # Pull the two needed columns out once as plain Python lists, so the
    # per-topic scan below does not rebuild a Series per row via iterrows().
    news_titles_lst = news_period_df['news_title'].tolist()
    news_native_ids_lst = news_period_df['news_native_id'].tolist()

    query_news_tweets = '''
        select tweet_native_id from tweets
        where news_native_id = :news_native_id
        order by tweet_native_id asc;'''

    topics_num = len(config.MANUALLY_SELECTED_TOPICS_LST)

    # One connection serves all topics. `with sqlite3.connect(...)` would only
    # scope a transaction and leave the connection open, so close explicitly.
    conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
    try:
        cursor = conn.cursor()
        for topic_ind, selected_topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
            print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                               topics_num,
                                                               selected_topic['name'],
                                                               time.asctime()))

            # Work on a shallow copy so the id lists are not written back
            # into the shared config.MANUALLY_SELECTED_TOPICS_LST entries.
            topic = dict(selected_topic)

            '''
            Match out associated news titles.
            '''
            topic['news_native_ids_lst'] = [
                news_native_id
                for news_title, news_native_id in zip(news_titles_lst, news_native_ids_lst)
                if utilities.news_title_match(news_title, topic['keywords_lst'], verbose=False)
            ]

            '''
            Query associated tweets
            '''
            asso_tweets_native_ids_lst = []
            for news_native_id in topic['news_native_ids_lst']:
                cursor.execute(query_news_tweets, {'news_native_id': news_native_id})
                # Each result row is a 1-tuple holding tweet_native_id.
                asso_tweets_native_ids_lst.extend(item[0] for item in cursor)
            topic['tweets_native_ids_lst'] = asso_tweets_native_ids_lst

            supplement_topics_lst.append(topic)
    finally:
        conn.close()

    '''
    Make pickle
    '''
    with open(config.TOPICS_LST_PKL, 'wb') as f:
        pickle.dump(supplement_topics_lst, f)

(1/23) processing topic: Hillary_Clinton_email_controversy ... Thu Oct 12 12:56:00 2017
(2/23) processing topic: Iran_nuclear_deal ... Thu Oct 12 12:56:10 2017
(3/23) processing topic: Jihadi_John_identity_reveled ... Thu Oct 12 12:56:38 2017
(4/23) processing topic: Ferguson_unrest ... Thu Oct 12 12:56:50 2017
(5/23) processing topic: Hong_Kong_protests ... Thu Oct 12 12:57:25 2017
(6/23) processing topic: Sony_cyberattack ... Thu Oct 12 12:57:38 2017
(7/23) processing topic: Bill_Cosby_sexual_assault_allegations  ... Thu Oct 12 12:58:01 2017
(8/23) processing topic: SpaceX_fails_rocket_landing ... Thu Oct 12 12:58:15 2017
(9/23) processing topic: Brian_Williams_fake_war_story   ... Thu Oct 12 12:58:26 2017
(10/23) processing topic: Oscar ... Thu Oct 12 12:58:38 2017
(11/23) processing topic: Super_Bowl ... Thu Oct 12 12:58:58 2017
(12/23) processing topic: Grammy ... Thu Oct 12 12:59:16 2017
(13/23) processing topic: Golden_Globe ... Thu Oct 12 12:59:28 2017
(14/23) processing topic:

### Recover pickle and print number of news and tweets for each topic

In [6]:
"""
Sanity-check the topics pickle: reload it and report, for each topic,
how many news and tweets native ids it holds
"""
if 0 == 1:
    with open(config.TOPICS_LST_PKL, 'rb') as f:
        topics_lst = pickle.load(f)

    summary_template = '{} Topic_name: {}; news_num: {}; tweets_num: {}'
    for topic_ind, topic in enumerate(topics_lst):
        news_num = len(topic['news_native_ids_lst'])
        tweets_num = len(topic['tweets_native_ids_lst'])
        print(summary_template.format(topic_ind, topic['name'], news_num, tweets_num))

0 Topic_name: Hillary_Clinton_email_controversy; news_num: 228; tweets_num: 860564
1 Topic_name: Iran_nuclear_deal; news_num: 412; tweets_num: 2468540
2 Topic_name: Jihadi_John_identity_reveled; news_num: 101; tweets_num: 620121
3 Topic_name: Ferguson_unrest; news_num: 630; tweets_num: 3507025
4 Topic_name: Hong_Kong_protests; news_num: 161; tweets_num: 572997
5 Topic_name: Sony_cyberattack; news_num: 279; tweets_num: 1954894
6 Topic_name: Bill_Cosby_sexual_assault_allegations ; news_num: 172; tweets_num: 681065
7 Topic_name: SpaceX_fails_rocket_landing; news_num: 89; tweets_num: 397246
8 Topic_name: Brian_Williams_fake_war_story  ; news_num: 69; tweets_num: 475319
9 Topic_name: Oscar; news_num: 249; tweets_num: 1951701
10 Topic_name: Super_Bowl; news_num: 213; tweets_num: 1633463
11 Topic_name: Grammy; news_num: 101; tweets_num: 686425
12 Topic_name: Golden_Globe; news_num: 81; tweets_num: 859308
13 Topic_name: 500_million_Powerball; news_num: 80; tweets_num: 532111
14 Topic_name: Ebo

## Check news titles and sample tweets of each topic

In [3]:
"""
Load the topics pickle and select one topic by its position in the list
"""
if 1 == 1:
    # Position in topics_lst of the topic to select.
    selected_topic_ind = 1

    with open(config.TOPICS_LST_PKL, 'rb') as f:
        topics_lst = pickle.load(f)

    topic = topics_lst[selected_topic_ind]

In [4]:
'''
Print associated news titles

Prints the selected topic's name and keywords, then one
"<news_collected_time>: <news_title>" line for every news row found for
the first 100 associated news native ids.
'''
if 1 == 1:
    print('TOPIC: {}; KEYWORDS: {}'.format(topic['name'], topic['keywords_lst']))
    
    # limit to first 100 news
    news_native_ids_lst = topic['news_native_ids_lst'][:100]
    
    query_news = '''
    select news_title, news_collected_time from news
    where news_native_id = :news_native_id
    order by news_native_id asc;'''

    # `with sqlite3.connect(...)` only scopes a transaction and leaves the
    # connection open, so the connection is closed explicitly instead.
    conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
    try:
        # sqlite3.Row lets the columns be read by name below.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        for news_native_id in news_native_ids_lst:
            cursor.execute(query_news, {'news_native_id': news_native_id})
            for row in cursor:
                print('{}: {}'.format(row['news_collected_time'], row['news_title']))
    finally:
        conn.close()

TOPIC: Iran_nuclear_deal; KEYWORDS: ['Iran', 'nuclear']
2014-11-19: Britain 'not optimistic' on Iran nuclear talks
2014-11-19: Britain says not optimistic about Iran nuclear deal by deadline
2014-11-20: Mideast Updates / Kerry in Paris to continue Iran nuclear meetings - Middle East Updates Israel News
2014-11-20: UK 'not optimistic' about Iran nuclear deal by deadline
2014-11-20: Kerry to Join Iran Nuclear Talks in Vienna
2014-11-20: Kerry to travel to Vienna for Iran nuclear talks
2014-11-20: Iran not providing explanations on nuclear claims: watchdog
2014-11-20: For many Iranians, nuclear talks aren't about the sanctions
2014-11-20: Iranians Count Sanctions Cost as Nuclear Accord Deadline Nears
2014-11-20: Iran nuclear talks deadline may be extended to March: officials
2014-11-20: Iran nuclear talks stuck, deadline may be extended officials
2014-11-20: Kerry: Iran nuclear talks seek agreements not extension in upcoming deadline
2014-11-20: Kerry Meets French Counterpart Ahead of Ira

In [5]:
'''
Print associated tweets

Prints the selected topic's name and keywords, then one
"<tweet_collected_time>: <tweet_text>" line for every tweet row found for
the first 100 associated tweet native ids.
'''
if 1 == 1:
    print('TOPIC: {}; KEYWORDS: {}'.format(topic['name'], topic['keywords_lst']))
    
    # limit to first 100 tweets
    tweets_native_ids_lst = topic['tweets_native_ids_lst'][:100]
    
    query_tweets = '''
    select tweet_text, tweet_collected_time from tweets
    where tweet_native_id = :tweet_native_id
    order by tweet_native_id asc;'''

    # `with sqlite3.connect(...)` only scopes a transaction and leaves the
    # connection open, so the connection is closed explicitly instead.
    conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
    try:
        # sqlite3.Row lets the columns be read by name below.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        for tweet_native_id in tweets_native_ids_lst:
            cursor.execute(query_tweets, {'tweet_native_id': tweet_native_id})
            for row in cursor:
                print('{}: {}'.format(row['tweet_collected_time'], row['tweet_text']))
    finally:
        conn.close()

TOPIC: Iran_nuclear_deal; KEYWORDS: ['Iran', 'nuclear']
2014-11-19: Us congress,JudgeTedPoe to convene hrg entitled Iranian Nuclear Talks: Negotiating a Bad Deal? http://t.co/WPkUx1jGLk  #Iran #NoNuclearIran
2014-11-19: #Iran U.S. Nov. 24 Nuclear Deal:  Love Me, Love me not, Love Meeeeeeee http://t.co/sz4BubPbCs http://t.co/RQ95bU3xx1
2014-11-19: #Event Today at2PM ET US. #Congress suspicion about Iran Nuke deal!  #IranTalksVienna http://t.co/WPkUx1jGLk  #Iran #NoNuclearIran
2014-11-19: U.K. Official Says Deadline for Iran Talks Unlikely to Be Met (WSJ) http://t.co/LEeTKDHSRW
2014-11-19: Talks between US and Iran haunted by difficult history http://t.co/VXbr4i3O6w
2014-11-19: Doubts about a nuclear deal with #Iran as talks near end #irantalks #IranTalksvienna http://t.co/z37eZmfwiB
2014-11-19: #UN Rebukes #Iran on Human Rights as #Nuclear Talks Near Deadline http://t.co/IxeWoD1L08
2014-11-19: #UN Rebukes #Iran on Human Rights as #Nuclear Talks Near Deadline http://t.co/IxeWoD1L08
2014-