# Build topic_news and topic_tweets docs
**Objective**: aggregate associated news and tweets for each topic into a doc

Last modified: 2017-10-23

# Roadmap
1. Build topic_news and topic_tweets docs.
2. Re-build topic_tweets docs. Remove tweets with the same or similar tweet_text field.

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import sqlite3
import time
import codecs
import csv
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Identifier string for this notebook (presumably its file name without extension);
# it is not referenced by any of the cells visible below.
nb_name = '20171012-daheng-build_topic_news_tweets_docs'

## Build topic_news and topic_tweets docs

In [2]:
%%time
'''
Load the pickled topics list stored at config.TOPICS_LST_PKL into topics_lst.
The always-true guard acts as the on/off switch for this cell.
'''
if True:
    with open(config.TOPICS_LST_PKL, mode='rb') as topics_pkl_file:
        topics_lst = pickle.load(topics_pkl_file)

CPU times: user 2.65 s, sys: 3.28 s, total: 5.93 s
Wall time: 5.93 s


In [3]:
%%time
"""
For each topic, query the db and
 - write news_doc into topic_news doc
 - write tweet_text into topic_tweets doc

All docs are placed inside config.TOPICS_DOCS_DIR
 - topic_news doc follows name convention: [topic_ind]-[topic_name].news.csv
 - topic_tweets doc follows name convention: [topic_ind]-[topic_name].tweets.csv
"""

if 0 == 1:  # cell switch: currently disabled; change to 1 == 1 to regenerate the docs
    # Doc format shared by topic_news and topic_tweets docs:
    # tab-delimited, every field quoted, embedded quotes doubled.
    csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)

    # Column order of the two kinds of docs.
    news_fieldnames = ('news_native_id', 'news_collected_time', 'news_title', 'news_doc')
    tweets_fieldnames = ('tweet_id', 'tweet_collected_time', 'tweet_text', 'news_native_id')

    # One parameterized lookup per news item (news_num per topic is small).
    query_news = '''
    select news_title, news_collected_time, news_native_id, news_doc from news
    where news_native_id = :news_native_id;'''

    # NOTE: tweets_num for each topic is much larger, so tweets are queried in
    # batches of this many ids; one query per tweet would take hours.
    batch_size = 500

    for topic_ind, topic in enumerate(topics_lst):
        topic_name = topic['name']
        news_native_ids_lst = topic['news_native_ids_lst']
        tweets_ids_lst = topic['tweets_ids_lst']

        print('{} Topic_name: {}; news_num: {}; tweets_num: {}'.format(topic_ind, topic_name, len(news_native_ids_lst), len(tweets_ids_lst)))

        # "with sqlite3.connect(...)" only commits/rolls back and leaves the
        # connection open, so close it explicitly once the topic is done.
        conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
        try:
            conn.row_factory = sqlite3.Row  # rows addressable by column name
            cursor = conn.cursor()

            '''
            Write topic_news doc
            '''
            output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.news.csv'.format(topic_ind, topic_name))
            # newline='' lets the csv module control line endings itself.
            with open(output_file, 'w', newline='') as f:
                print('\tWriting topic_news doc ...')
                writer = csv.DictWriter(f, fieldnames=news_fieldnames, dialect='topics_docs_line')
                writer.writeheader()

                for news_native_id in news_native_ids_lst:
                    cursor.execute(query_news, {'news_native_id': news_native_id})
                    for row in cursor:
                        writer.writerow({name: row[name] for name in news_fieldnames})

            '''
            Write topic_tweets doc
            '''
            output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.tweets.csv'.format(topic_ind, topic_name))
            with open(output_file, 'w', newline='') as f:
                print('\tWriting topic_tweets doc ...')
                writer = csv.DictWriter(f, fieldnames=tweets_fieldnames, dialect='topics_docs_line')
                writer.writeheader()

                # Walk the ids in consecutive slices of batch_size; the last
                # slice may be shorter, and an empty id list yields no query.
                for start_ind in range(0, len(tweets_ids_lst), batch_size):
                    # build tuple argument containing tweet ids in this batch
                    batch_tweets_ids_tpl = tuple(tweets_ids_lst[start_ind:start_ind + batch_size])

                    # Only the "?" placeholders are formatted into the SQL text;
                    # the ids themselves are passed as bound parameters.
                    query_tweet = '''
                    select tweet_id, tweet_collected_time, tweet_text, news_native_id from tweets
                    where tweet_id in ({place_holder});'''.format(place_holder=','.join(['?'] * len(batch_tweets_ids_tpl)))

                    cursor.execute(query_tweet, batch_tweets_ids_tpl)
                    for row in cursor:
                        writer.writerow({'tweet_id': row['tweet_id'],
                                         'tweet_collected_time': row['tweet_collected_time'],
                                         'tweet_text': row['tweet_text'].replace('\n', ' '),  # keep each tweet in one line
                                         'news_native_id': row['news_native_id']})
        finally:
            conn.close()

0 Topic_name: Hillary_Clinton_email_controversy; news_num: 228; tweets_num: 860564
	Writing topic_news doc ...
	Writing topic_tweets doc ...
1 Topic_name: Iran_nuclear_deal; news_num: 406; tweets_num: 2412264
	Writing topic_news doc ...
	Writing topic_tweets doc ...
2 Topic_name: ISIS_Jihadi_John_identity_reveal; news_num: 101; tweets_num: 620121
	Writing topic_news doc ...
	Writing topic_tweets doc ...
3 Topic_name: Ukraine_cease_fire; news_num: 84; tweets_num: 603709
	Writing topic_news doc ...
	Writing topic_tweets doc ...
4 Topic_name: Egypt_free_Al_Jazeera_journalist; news_num: 50; tweets_num: 129120
	Writing topic_news doc ...
	Writing topic_tweets doc ...
5 Topic_name: Keystone_XL_Pipeline_bill; news_num: 55; tweets_num: 117692
	Writing topic_news doc ...
	Writing topic_tweets doc ...
6 Topic_name: CIA_Torture_Report; news_num: 41; tweets_num: 167362
	Writing topic_news doc ...
	Writing topic_tweets doc ...
7 Topic_name: Obama_cybersecurity_plan; news_num: 73; tweets_num: 495576

## Re-build topic_tweets docs. Remove tweets with the same or similar tweet_text field

In [2]:
%%time
"""
Re-build topic_tweets docs. Remove tweets with the same or similar tweet_text field

For each topic
 - read in original topic_tweets doc
 - remove tweets with the same or similar tweet_text field
 - write filtered tweets into updated topic_tweets doc

All docs are placed inside config.TOPICS_DOCS_DIR
 - updated topic_tweets doc follows name convention: [topic_ind]-[topic_name].updated.tweets.csv
"""

if 0 == 1:  # cell switch: currently disabled; change to 1 == 1 to re-build the docs
    # Doc format: tab-delimited, every field quoted, embedded quotes doubled.
    # Registered once up front instead of once per topic.
    csv.register_dialect('topics_docs_line', delimiter='\t', doublequote=True, quoting=csv.QUOTE_ALL)

    fieldnames = ('tweet_id', 'tweet_collected_time', 'tweet_text', 'news_native_id')
    topics_num = len(config.MANUALLY_SELECTED_TOPICS_LST)

    # NOTE(review): file names are built from the position of each topic in
    # config.MANUALLY_SELECTED_TOPICS_LST; this presumably has to match the
    # topic_ind used when the original docs were written - confirm.
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        # time.asctime() with no argument formats the current local time.
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           topics_num,
                                                           topic['name'],
                                                           time.asctime()))
        filtered_tweets_lst = []

        # set for unique tweet_text field, as returned by utilities.clean_tweet_text
        # (presumably with URL and hashtag entities removed - confirm in utilities)
        unique_tweet_text_set = set()

        '''
        Read in original topic_tweets doc
        '''
        topic_tweets_csv_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.tweets.csv'.format(topic_ind, topic['name']))
        with open(topic_tweets_csv_file, 'r') as f:
            reader = csv.DictReader(f, dialect='topics_docs_line')

            '''
            Remove tweets with the same or similar tweet_text field:
            only the first tweet seen for each cleaned text is kept
            '''
            # lazy load: rows are read from the file one at a time
            for row in reader:
                cleaned_tweet_text = utilities.clean_tweet_text(row['tweet_text'])

                if cleaned_tweet_text in unique_tweet_text_set:
                    continue
                unique_tweet_text_set.add(cleaned_tweet_text)
                filtered_tweets_lst.append(row)

        """
        Write filtered tweets into updated topic_tweets doc
        """
        output_file = os.path.join(config.TOPICS_DOCS_DIR, '{}-{}.updated.tweets.csv'.format(topic_ind, topic['name']))
        # newline='' lets the csv module control line endings itself.
        with open(output_file, 'w', newline='') as f:
            print('\tWriting updated topic_tweets doc ...')
            writer = csv.DictWriter(f, fieldnames=fieldnames, dialect='topics_docs_line')
            writer.writeheader()

            for tweet in filtered_tweets_lst:
                writer.writerow({'tweet_id': tweet['tweet_id'],
                                 'tweet_collected_time': tweet['tweet_collected_time'],
                                 'tweet_text': tweet['tweet_text'].replace('\n', ' '),  # keep each tweet in one line
                                 'news_native_id': tweet['news_native_id']})

(1/51) processing topic: Hillary_Clinton_email_controversy ... Mon Oct 23 15:27:39 2017
	Writing updated topic_tweets doc ...
(2/51) processing topic: Iran_nuclear_deal ... Mon Oct 23 15:28:14 2017
	Writing updated topic_tweets doc ...
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Mon Oct 23 15:29:59 2017
	Writing updated topic_tweets doc ...
(4/51) processing topic: Ukraine_cease_fire ... Mon Oct 23 15:30:26 2017
	Writing updated topic_tweets doc ...
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Mon Oct 23 15:30:50 2017
	Writing updated topic_tweets doc ...
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Mon Oct 23 15:30:55 2017
	Writing updated topic_tweets doc ...
(7/51) processing topic: CIA_Torture_Report ... Mon Oct 23 15:31:00 2017
	Writing updated topic_tweets doc ...
(8/51) processing topic: Obama_cybersecurity_plan ... Mon Oct 23 15:31:07 2017
	Writing updated topic_tweets doc ...
(9/51) processing topic: DHS_funding_issue ... Mon Oct 23