# Build selected Hedonometer words frequency dicts for topic_news and topic_tweets docs

Last modified: 2017-10-23

# Roadmap
1. Check shed words pattern-matching requirements
2. Build shed words freq dicts for topic docs

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Identifier of this notebook. None of the cells shown in this section read it;
# presumably kept for bookkeeping -- TODO confirm before removing.
nb_name = '20171019-daheng-build_shed_words_freq_dicts'

## Check shed words pattern-matching requirements

__Ref__:
 - Dodds, P. S., Harris, K. D., Kloumann, I. M., Bliss, C. A., & Danforth, C. M. (2011). Temporal patterns of happiness and information in a global social network: Hedonometrics and Twitter. PloS one, 6(12), e26752.

__Notes__:
 - See _2.1 Algorithm for Hedonometer_ P3
 - See _Methods_ P23

## Build shed words freq dicts for topic docs

In [2]:
"""
Check all shed words
"""
if 1 == 1:
    ind_shed_word_dict = pd.read_pickle(config.IND_SHED_WORD_DICT_PKL)
    print(ind_shed_word_dict.values())



### Build single shed words freq dict for topic_news docs

#### Result single dict format (for all topic_news docs)

```
{topic_ind_0: {
    news_native_id_0_0: {shed_word_0_ind: shed_word_0_freq,
                         shed_word_1_ind: shed_word_1_freq,
                         ...},
    news_native_id_0_1: {shed_word_0_ind: shed_word_0_freq,
                         shed_word_1_ind: shed_word_1_freq,
                         ...},
    ...},
topic_ind_1: {
    news_native_id_1_0: {shed_word_0_ind: shed_word_0_freq,
                         shed_word_1_ind: shed_word_1_freq,
                         ...},
    news_native_id_1_1: {shed_word_0_ind: shed_word_0_freq,
                         shed_word_1_ind: shed_word_1_freq,
                         ...},
    ...},
...}
```

#### Build single shed words freq dict for all topic_news docs

In [4]:
%%time
"""
Build single shed words freq dict for all topic_news docs

Register
    TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL = os.path.join(DATA_DIR, 'topics_news_shed_words_freq.dict.pkl')
in config
"""
if 0 == 1:
    topics_news_shed_words_freq_dict = {}
    
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           len(config.MANUALLY_SELECTED_TOPICS_LST),
                                                           topic['name'],
                                                           localtime))
        
        topic_shed_words_freq_dict = {}
        
        '''
        Load shed_word and shed_word_ind mapping pkls
        '''
        ind_shed_word_dict = pd.read_pickle(config.IND_SHED_WORD_DICT_PKL)
        shed_word_ind_dict = pd.read_pickle(config.SHED_WORD_IND_DICT_PKL)
        shed_words_set = set(ind_shed_word_dict.values())
        
        '''
        Load topic_news doc
        '''
        csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)
        topic_news_csv_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.news.csv'.format(topic_ind, topic['name']))
        with open(topic_news_csv_file, 'r') as f:
            reader = csv.DictReader(f, dialect='topics_docs_line')
            '''
            Count shed words freq for each tweet
            '''
            # lazy load
            for row in reader:
                news_native_id = int(row['news_native_id'])
                news_doc = row['news_doc']
                
                news_doc_shed_words_freq_dict = utilities.count_news_doc_shed_words_freq(news_doc, ind_shed_word_dict, shed_word_ind_dict, shed_words_set)
                
                topic_shed_words_freq_dict[news_native_id] = news_doc_shed_words_freq_dict
        
        topics_news_shed_words_freq_dict[topic_ind] = topic_shed_words_freq_dict
    
    '''
    Make pkl for result single dict
    '''
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'wb') as f:
        pickle.dump(topics_news_shed_words_freq_dict, f)

(1/51) processing topic: Hillary_Clinton_email_controversy ... Sun Oct 22 18:49:53 2017
(2/51) processing topic: Iran_nuclear_deal ... Sun Oct 22 18:49:53 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Sun Oct 22 18:49:53 2017
(4/51) processing topic: Ukraine_cease_fire ... Sun Oct 22 18:49:54 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Sun Oct 22 18:49:54 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Sun Oct 22 18:49:54 2017
(7/51) processing topic: CIA_Torture_Report ... Sun Oct 22 18:49:54 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Sun Oct 22 18:49:54 2017
(9/51) processing topic: DHS_funding_issue ... Sun Oct 22 18:49:54 2017
(10/51) processing topic: US_Cuba_relationship ... Sun Oct 22 18:49:54 2017
(11/51) processing topic: 2015_CPAC ... Sun Oct 22 18:49:54 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Sun Oct 22 18:49:54 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Sun Oct 22 

#### Check basic statistics

In [6]:
"""
Print out sample news shed_words_freq_dicts inside single topic
"""
if 0 == 1:
    target_topic_ind = 0
    
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'rb') as f:
        topics_news_shed_words_freq_dict = pickle.load(f)
        
    count = 0
    for news_native_id, news_doc_shed_words_freq_dict in topics_news_shed_words_freq_dict[target_topic_ind].items():
        print('news_native_id: {}'.format(news_native_id))
        print('\t{}'.format(news_doc_shed_words_freq_dict))
        news_doc_shed_words_len = sum(news_doc_shed_words_freq_dict.values())
        print('\tLEN: {}'.format(news_doc_shed_words_len))
        count += 1
        if count >= 5:
            break


news_native_id: 30275
	{2228: 1, 1221: 15, 2773: 10, 2575: 9, 2111: 3, 704: 12, 2290: 1, 2451: 7, 1644: 1, 1497: 1, 1341: 3, 788: 1, 791: 1, 2387: 2, 2106: 1, 1361: 1, 1504: 1, 416: 1, 1656: 6, 3718: 1, 1294: 1, 3586: 1, 1986: 1, 2856: 2, 733: 1, 1884: 1, 1849: 1, 1326: 1, 1577: 2, 1767: 1, 2922: 2, 2324: 1, 3555: 1, 3685: 1, 1008: 1, 460: 1, 1198: 1, 1260: 1, 1678: 1, 3393: 1, 1949: 3, 52: 1, 2592: 1, 2416: 1, 2204: 1, 3069: 1, 1916: 1, 1828: 4, 996: 1, 2901: 1, 2637: 1, 2299: 1, 2330: 1, 758: 1, 3674: 1, 779: 1, 3196: 3, 2653: 1, 3032: 1, 1492: 1, 2597: 2, 2908: 1, 849: 1, 1116: 2, 2439: 1, 3122: 1, 3374: 1, 383: 1, 2308: 1, 2718: 1, 1991: 1, 1934: 1, 1195: 1, 2248: 1}
	LEN: 144
news_native_id: 30282
	{1690: 1, 2751: 1, 2451: 2, 704: 6, 791: 1, 3317: 1, 1341: 1, 539: 1, 1257: 1, 1283: 1, 2278: 1, 3217: 1, 3174: 1, 1607: 1, 2139: 1, 3069: 1, 2597: 1, 2575: 3, 2111: 2, 1127: 1, 3110: 1, 1504: 2, 1656: 1, 3570: 1, 1664: 1, 1221: 3, 301: 1, 788: 1, 1118: 1, 1849: 1, 2908: 1, 3134: 1, 277

In [9]:
%%time
"""
Check total shed words length of this topic_news doc
"""  
if 0 == 1:
    topic_news_shed_words_len = sum([sum(news_doc_shed_words_freq_dict.values()) for news_doc_shed_words_freq_dict in topics_news_shed_words_freq_dict[target_topic_ind].values()])
    print('Total shed words length of this topic_news doc: {}'.format(topic_news_shed_words_len))

Total shed words length of this topic_news doc: 22325
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.06 ms


### Build shed words freq dicts for each topic_tweets doc separately

#### Result dict format (for each given topic_tweets doc)

```
{tweet_id_0_0: {shed_word_0_ind: shed_word_0_freq,
                shed_word_1_ind: shed_word_1_freq,
                ...},
tweet_id_0_1: {shed_word_0_ind: shed_word_0_freq,
               shed_word_1_ind: shed_word_1_freq,
               ...},
...}
```

#### Build shed words freq dict for each topic separately

In [2]:
%%time
"""
Build shed words freq dict for each topic separately

Register
    TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR = os.path.join(DATA_DIR, 'topics_tweets_shed_words_freq_dict_pkls')
in config

Note:
 - Number of tweets is large. Process each topic_tweets doc individually to avoid crash
 - Execute second time for updated topic_tweets docs
"""
if 0 == 1:
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           len(config.MANUALLY_SELECTED_TOPICS_LST),
                                                           topic['name'],
                                                           localtime))
        
        topic_shed_words_freq_dict = {}
        
        '''
        Load shed_word and shed_word_ind mapping pkls
        '''
        ind_shed_word_dict = pd.read_pickle(config.IND_SHED_WORD_DICT_PKL)
        shed_word_ind_dict = pd.read_pickle(config.SHED_WORD_IND_DICT_PKL)
        shed_words_set = set(ind_shed_word_dict.values())
        
        '''
        Load topic_tweets doc
        '''
        csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)
        topic_tweets_csv_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.updated.tweets.csv'.format(topic_ind, topic['name']))
        with open(topic_tweets_csv_file, 'r') as f:
            reader = csv.DictReader(f, dialect='topics_docs_line')
            
            '''
            Count shed words freq for each tweet
            '''
            # lazy load
            for row in reader:
                tweet_id = int(row['tweet_id'])
                tweet_text = row['tweet_text']
                
                tweet_shed_words_freq_dict = utilities.count_tweet_shed_words_freq(tweet_text, ind_shed_word_dict, shed_word_ind_dict, shed_words_set)
                
                topic_shed_words_freq_dict[tweet_id] = tweet_shed_words_freq_dict
        
        '''
        Make pkl for result dict file
        '''
        topic_tweets_shed_words_freq_dict_pkl_file = os.path.join(config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR,
                                                                  '{}.updated.dict.pkl'.format(topic_ind))
        with open(topic_tweets_shed_words_freq_dict_pkl_file, 'wb') as f:
            pickle.dump(topic_shed_words_freq_dict, f)

(1/51) processing topic: Hillary_Clinton_email_controversy ... Mon Oct 23 16:00:01 2017
(2/51) processing topic: Iran_nuclear_deal ... Mon Oct 23 16:00:23 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Mon Oct 23 16:01:36 2017
(4/51) processing topic: Ukraine_cease_fire ... Mon Oct 23 16:01:54 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Mon Oct 23 16:02:08 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Mon Oct 23 16:02:10 2017
(7/51) processing topic: CIA_Torture_Report ... Mon Oct 23 16:02:12 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Mon Oct 23 16:02:17 2017
(9/51) processing topic: DHS_funding_issue ... Mon Oct 23 16:02:31 2017
(10/51) processing topic: US_Cuba_relationship ... Mon Oct 23 16:02:35 2017
(11/51) processing topic: 2015_CPAC ... Mon Oct 23 16:03:19 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Mon Oct 23 16:03:26 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Mon Oct 23 

#### Check basic statistics

In [2]:
%%time
"""
Print out sample tweet shed_words_freq_dicts inside single topic
"""
if 0 == 1:
    target_topic_ind = 0
    
    topic_tweets_shed_words_freq_dict_pkl_file = os.path.join(config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR, '{}.updated.dict.pkl'.format(target_topic_ind))
    with open(topic_tweets_shed_words_freq_dict_pkl_file, 'rb') as f:
        topic_tweets_shed_words_freq_dict_tmp = pickle.load(f)
        
    count = 0
    for tweet_id, tweet_shed_words_freq_dict in topic_tweets_shed_words_freq_dict_tmp.items():
        print('tweet_id: {}'.format(tweet_id))
        print('\t{}'.format(tweet_shed_words_freq_dict))
        tweet_shed_words_len = sum(tweet_shed_words_freq_dict.values())
        print('\tLEN: {}'.format(tweet_shed_words_len))
        count += 1
        if count >= 20:
            break

tweet_id: 128954438
	{704: 1}
	LEN: 1
tweet_id: 128954439
	{1916: 1, 677: 1, 2451: 1, 704: 1}
	LEN: 4
tweet_id: 128954440
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954441
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954442
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954443
	{1940: 1, 2884: 1}
	LEN: 2
tweet_id: 128954444
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954445
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954446
	{680: 1, 2387: 1, 3366: 1, 2451: 1, 704: 1}
	LEN: 5
tweet_id: 128954448
	{1577: 1, 2451: 1, 704: 1}
	LEN: 3
tweet_id: 128954451
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954452
	{232: 1, 2451: 1, 704: 1}
	LEN: 3
tweet_id: 128954454
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954459
	{704: 1}
	LEN: 1
tweet_id: 128954460
	{704: 1}
	LEN: 1
tweet_id: 128954461
	{3437: 1, 2451: 1, 704: 1}
	LEN: 3
tweet_id: 128954462
	{2451: 1, 704: 1}
	LEN: 2
tweet_id: 128954463
	{704: 1}
	LEN: 1
tweet_id: 128954464
	{}
	LEN: 0
tweet_id: 128954465
	{2111: 2, 1219: 1, 704: 1}
	LEN: 4
CPU times: user 408 ms, sys: 172

In [3]:
%%time
"""
Check total shed words length of a topic_tweets doc
"""  
if 0 == 1:
    topic_tweets_shed_words_len = sum([sum(tweet_shed_words_freq_dict.values()) for tweet_shed_words_freq_dict in topic_tweets_shed_words_freq_dict_tmp.values()])
    print('Total shed words length of this topic_tweets_doc: {}'.format(topic_tweets_shed_words_len))

Total shed words length of this topic_tweets_doc: 1105282
CPU times: user 216 ms, sys: 8 ms, total: 224 ms
Wall time: 224 ms


# Notes

 - Do NOT try to merge all topic_tweets shed words freq dicts into a single huge dict. This is extremely time-consuming and would leave the VM unresponsive.