# Prepare tweets and news data for IBM topic
Last modified: 2017-10-24

# Roadmap
1. Prepare multiprocessing and MongoDB scripts available in ibm_tweets_analysis project
2. Filter out tweets with keyword 'ibm' in tweet_text field
3. Select out all tweets with URL link to external news article
4. Collect external news articles manually
5. Check basic statistics

# Steps

In [1]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import csv
import time
from pprint import pprint
import json
import pymongo
import multiprocessing
import logging
import collections

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities
import mongodb
import multiprocessing_workers

'''
Misc
'''
# Name of this notebook (date-author-topic); presumably used to label files
# produced by the notebook -- TODO confirm, it is not referenced in the cells shown here
nb_name = '20171024-daheng-prepare_ibm_tweets_news_data'

## Prepare multiprocessing and MongoDB scripts available in ibm_tweets_analysis project

Copy the `mongodb.py` and `multiprocessing_workers.py` files to the project root dir.  
 - `mongodb.py` can be used to get a connection to the local MongoDB database.
 - `multiprocessing_workers.py` can be used to query the MongoDB database in multiple processes to save time (it needs modifications for this project)

Native tweets are stored in `tweets_ek-2` db and `tw_nt` table.

## Filter out tweets with keyword 'ibm' in tweet_text field

In [2]:
%%time
"""
Register
    IBM_TWEETS_NEWS_DIR = os.path.join(DATA_DIR, 'ibm_tweets_news')
in config
"""
DB_NAME  = 'tweets_ek-2'
COL_NAME = 'tw_nt'

if 0 == 1:
    multiprocessing.log_to_stderr(logging.DEBUG)
    '''
    Use multiprocessing to parse tweet_text field for "ibm" keyword
    '''
    procedure_name = 'tag_native_tweets_text_ibm'
    
    # set processes number to CPU numbers minus 1
    process_num = multiprocessing.cpu_count() - 1
    process_file_names_lst = ['{}-{}.json'.format(process_ind, procedure_name)
                              for process_ind in range(process_num)]
    process_files_lst = [os.path.join(config.IBM_TWEETS_NEWS_DIR, process_file_name) 
                         for process_file_name in process_file_names_lst]
    
    jobs = []
    for process_ind in range(process_num):
        p = multiprocessing.Process(target=multiprocessing_workers.find_keywords_in_tweet_text,
                                    args=(DB_NAME, COL_NAME, process_ind, process_num, process_files_lst[process_ind], ['ibm']),
                                    name='Process-{}/{}'.format(process_ind, process_num))
        jobs.append(p)
    
    for job in jobs:
        job.start()
        
    for job in jobs:
        job.join()

[INFO/Process-0/11] child process calling self.run()
[INFO/Process-1/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!


[INFO/Process-2/11] child process calling self.run()
[INFO/Process-4/11] child process calling self.run()
[INFO/Process-5/11] child process calling self.run()
[INFO/Process-3/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!


[INFO/Process-6/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
Process-0/11 handling documents 0 to 528437...


[INFO/Process-7/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!


[INFO/Process-8/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!


[INFO/Process-9/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!


[INFO/Process-10/11] child process calling self.run()


MongoDB on localhost:27017/tweets_ek-2.tw_nt connected successfully!
Process-1/11 handling documents 528438 to 1056875...
Process-2/11 handling documents 1056876 to 1585313...
Process-3/11 handling documents 1585314 to 2113751...
Process-4/11 handling documents 2113752 to 2642189...
Process-5/11 handling documents 2642190 to 3170627...
Process-6/11 handling documents 3170628 to 3699065...
Process-7/11 handling documents 3699066 to 4227503...
Process-8/11 handling documents 4227504 to 4755941...
Process-9/11 handling documents 4755942 to 5284379...
Process-10/11 handling documents 5284380 to 5812824...


[INFO/Process-0/11] process shutting down
[DEBUG/Process-0/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-0/11] running the remaining "atexit" finalizers
[INFO/Process-0/11] process exiting with exitcode 0
[INFO/Process-1/11] process shutting down
[DEBUG/Process-1/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-1/11] running the remaining "atexit" finalizers
[INFO/Process-1/11] process exiting with exitcode 0
[INFO/Process-2/11] process shutting down
[DEBUG/Process-2/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-2/11] running the remaining "atexit" finalizers
[INFO/Process-2/11] process exiting with exitcode 0
[INFO/Process-3/11] process shutting down
[DEBUG/Process-3/11] running all "atexit" finalizers with priority >= 0
[DEBUG/Process-3/11] running the remaining "atexit" finalizers
[INFO/Process-3/11] process exiting with exitcode 0
[INFO/Process-4/11] process shutting down
[DEBUG/Process-4/11] running all "atexit" 

CPU times: user 136 ms, sys: 152 ms, total: 288 ms
Wall time: 14min 45s


## Select out all tweets with URL link to external news article

In [5]:
%%time
"""
Check number of filtered tweets and number of tweets with URL
"""
if 1 == 1:
    procedure_name = 'tag_native_tweets_text_ibm'
    
    # set processes number to CPU numbers minus 1
    process_num = multiprocessing.cpu_count() - 1
    process_file_names_lst = ['{}-{}.json'.format(process_ind, procedure_name)
                              for process_ind in range(process_num)]
    process_files_lst = [os.path.join(config.IBM_TWEETS_NEWS_DIR, process_file_name) 
                         for process_file_name in process_file_names_lst]
    
#     process_files_lst = ['data/ibm_tweets_news/3-tag_native_tweets_text_ibm.json']
    
    tweets_num = 0
    url_tweets_num = 0
    shortened_urls_num = 0
    
    valid_urls_set = set()
    
    valid_urls_counter = collections.Counter()
    valid_url_domain_names_counter = collections.Counter()
    
    for process_file in process_files_lst:
        with open(process_file) as f:
            for line in f:
                tweets_num += 1
                tweet_json = json.loads(line)
                entities_urls = tweet_json['entities']['urls']
                if entities_urls:
                    url_tweets_num += 1
                    for entities_url in entities_urls:
                        expanded_url = entities_url['expanded_url']
                        
                        misc_websites_lst = ['twitter', 'youtube', 'youtu.be', 'amazon', 'paper.li', 'lnkd.in']
                        if not any(word in expanded_url for word in misc_websites_lst):
                            
                            shortened_url_identifiers_lst = ['bit.ly', 'ift.tt', 'dlvr.it', 'ow.ly', 'buff.ly', 'oal.lu', 'goo.gl', 'ln.is', 'gag.gl', 'fb.me', 'trap.it',
                                                            'ibm.co', 'ibm.biz', 'shar.es', 'crwd.fr', 'klou.tt', 'tek.io', 'owler.us', 'upflow.co', 'hubs.ly', 'zd.net', 
                                                            'spr.ly', 'flip.it']
                            if not any(word in expanded_url for word in shortened_url_identifiers_lst):
                                # discard all shortened URLs
                                if len(expanded_url.split('/')) > 1:
                                    valid_urls_set.add(expanded_url)
                                    valid_urls_counter.update([expanded_url])
                                    valid_url_domain_name = expanded_url.split('/')[2]
                                    valid_url_domain_names_counter.update([valid_url_domain_name])
                                    if len(expanded_url.split('/')) == 4:
                                        shortened_urls_num += 1
    
    print('Number of tweets: {}'.format(tweets_num))
    print('Number of tweets with URL: {}'.format(url_tweets_num))
    print('Number of shortened URL: {}'.format(shortened_urls_num))
    print('Number of valid URLs: {}'.format(len(valid_urls_set)))
    pprint(valid_url_domain_names_counter.most_common(50))

Number of tweets: 152526
Number of tweets with URL: 130747
Number of shortened URL: 10297
Number of valid URLs: 15793
[('www.optimalalgotrading.com', 1375),
 ('www.forbes.com', 776),
 ('tweetedtimes.com', 516),
 ('www.lemonde.fr', 414),
 ('venturebeat.com', 403),
 ('www.ironbcg.com', 397),
 ('finance.yahoo.com', 373),
 ('www.engadget.com', 368),
 ('www.ibmchefwatson.com', 363),
 ('tinyurl.com', 306),
 ('tcrn.ch', 296),
 ('sco.lt', 295),
 ('www.linkedin.com', 283),
 ('dld.bz', 269),
 ('adweek.it', 265),
 ('xing.com', 252),
 ('www.instagram.com', 238),
 ('cnb.cx', 236),
 ('alltheinternetofthings.com', 224),
 ('www.zdnet.com', 191),
 ('medium.com', 179),
 ('mklnd.com', 178),
 ('www.wsj.com', 174),
 ('ctt.ec', 173),
 ('www.cnbc.com', 169),
 ('engt.co', 169),
 ('www.journaldunet.com', 166),
 ('www.techrepublic.com', 160),
 ('binaryloom.com', 159),
 ('www.fastcompany.com', 159),
 ('eventbrite.com', 158),
 ('twib.in', 154),
 ('www.fool.com', 152),
 ('zpr.io', 152),
 ('japan.zdnet.com', 151),


In [4]:
# Show the 50 most frequent valid URLs with their occurrence counts
# (valid_urls_counter is filled by the URL statistics cell, which has to run first)
pprint(valid_urls_counter.most_common(50))

[('http://www.optimalalgotrading.com/', 1375),
 ('http://www.ironbcg.com', 397),
 ('http://www.lemonde.fr/pixels/article/2017/03/12/a-la-rencontre-de-watson-l-intelligence-artificielle-star-d-ibm_5093342_4408996.html',
  371),
 ('https://www.engadget.com/2017/04/07/ibm-watson-tech-support-round-the-clock/',
  131),
 ('http://worldnewslatestsummary.blogspot.com/2017/05/move-over-watson-ibm-unveils-its-most.html',
  99),
 ('https://www.engadget.com/2017/04/25/ibm-watson-cognitive-photo-booth/', 93),
 ('https://www.fastcompany.com/3068600/ibms-new-cmo-michelle-peluso-talks-watson-the-cloud-and-ethics-of-ai',
  92),
 ('http://binaryloom.com/bmw-invites-watson-onboard-to-make-it-smarter/', 90),
 ('http://fb.com/FashionWeekNews', 81),
 ('http://adweek.it/2omXqEs', 79),
 ('http://www.journaldunet.com/economie/finance/1194465-ibm-watson-et-les-banques/',
  78),
 ('http://www.cnbc.com/2017/05/08/ibms-watson-is-a-joke-says-social-capital-ceo-palihapitiya.html',
  77),
 ('http://tcrn.ch/2n5h0aO',

In [None]:
"""
Test expand shortened URLs
"""
if 1 == 1:
    # TODO: implement the test for expanding shortened URLs.
    # The `pass` placeholder keeps this unfinished cell syntactically valid,
    # so the notebook can still be run top to bottom.
    pass

In [None]:
"""
Check the number of tweets that contain a URL
"""
