# Exploratory data analysis on happiness scores of topic_news and topic_tweets docs
Last modified: 2017-10-22

# Roadmap
1. Check happiness scores of topic_news and topic_tweets docs
2. Check word coverage of shed words on topic_news and topic_tweets docs
3. Check any special topics

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Notebook identifier string. It is not referenced in the cells visible here;
# presumably used elsewhere to label output artifacts -- TODO confirm.
nb_name = '20171021-daheng-eda_topic_news_tweets_happiness'

## Check happiness scores of topic_news and topic_tweets docs

### Avg. happiness scores

In [2]:
%%time
"""
Build the per-topic table of h_avg scores for topic_news and topic_tweets docs
and write it out as a tab-separated csv file
"""

topics_h_avg_csv_file = os.path.join(config.FIG_DIR, 'topics_h_avg.csv')

# The guard below is always False, so the body is skipped when the cell runs;
# switch it to a true condition to recompute and rewrite the csv file.
if 0 == 1:
    '''
    Load pickled inputs
    '''
    with open(config.TOPICS_NEWS_SHED_WORDS_FREQ_DICT_PKL, 'rb') as news_pkl:
        topics_news_shed_words_freq_dict = pickle.load(news_pkl)

    with open(config.IND_HAPPS_DICT_PKL, 'rb') as happs_pkl:
        ind_happs_dict = pickle.load(happs_pkl)

    topics_h_avg_lst = []
    topics_total = len(config.MANUALLY_SELECTED_TOPICS_LST)

    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        # Progress line: 1-based position, topic name, current local time
        print('({}/{}) processing topic: {} ... {}'.format(
            topic_ind+1, topics_total, topic['name'], time.asctime()))

        '''
        h_avg score for topic_news doc (entries are looked up by topic_ind)
        '''
        topic_news_shed_words_freq_dict = topics_news_shed_words_freq_dict[topic_ind]
        topic_news_h_avg = utilities.compute_h_score(
            utilities.merge_shed_words_freq_dicts(topic_news_shed_words_freq_dict.values()),
            ind_happs_dict)

        '''
        h_avg score for topic_tweets doc (one pkl file per topic_ind)
        '''
        topic_tweets_shed_words_freq_dict_pkl_file = os.path.join(
            config.TOPICS_TWEETS_SHED_WORDS_FREQ_DICT_PKLS_DIR,
            '{}.dict.pkl'.format(topic_ind))
        with open(topic_tweets_shed_words_freq_dict_pkl_file, 'rb') as tweets_pkl:
            topic_tweets_shed_words_freq_dict = pickle.load(tweets_pkl)

        topic_tweets_h_avg = utilities.compute_h_score(
            utilities.merge_shed_words_freq_dicts(topic_tweets_shed_words_freq_dict.values()),
            ind_happs_dict)

        '''
        One row per topic; *_num is the number of keys in the matching freq dict
        '''
        topics_h_avg_lst.append({
            'topic_ind': topic_ind,
            'topic_name': topic['name'],
            'news_num': len(topic_news_shed_words_freq_dict),
            'news_h_avg': topic_news_h_avg,
            'tweets_num': len(topic_tweets_shed_words_freq_dict),
            'tweets_h_avg': topic_tweets_h_avg,
        })

    '''
    Dump rows to csv with an explicit column order
    '''
    csv_columns = ['topic_ind', 'topic_name', 'news_num', 'news_h_avg', 'tweets_num', 'tweets_h_avg']
    topics_h_avg_df = pd.DataFrame(topics_h_avg_lst)
    topics_h_avg_df.to_csv(path_or_buf=topics_h_avg_csv_file,
                           sep='\t',
                           columns=csv_columns,
                           header=True,
                           index=False,
                           quoting=csv.QUOTE_MINIMAL)
    print('Done')

(1/51) processing topic: Hillary_Clinton_email_controversy ... Sun Oct 22 22:10:50 2017
(2/51) processing topic: Iran_nuclear_deal ... Sun Oct 22 22:11:09 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Sun Oct 22 22:12:01 2017
(4/51) processing topic: Ukraine_cease_fire ... Sun Oct 22 22:12:15 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Sun Oct 22 22:12:28 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Sun Oct 22 22:12:31 2017
(7/51) processing topic: CIA_Torture_Report ... Sun Oct 22 22:12:33 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Sun Oct 22 22:12:37 2017
(9/51) processing topic: DHS_funding_issue ... Sun Oct 22 22:12:47 2017
(10/51) processing topic: US_Cuba_relationship ... Sun Oct 22 22:12:49 2017
(11/51) processing topic: 2015_CPAC ... Sun Oct 22 22:13:15 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Sun Oct 22 22:13:21 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Sun Oct 22 