In [2]:
### Title: Twitter_Handler_Filtered.py

### Description: This program extracts various searches and fields
### from twitter data and compiles them into csv files.

# !pip install pandas
# !pip install time
# !pip install datetime
# !pip install numpy
# !pip install nltk
# !pip install pickle

### Extras to jupyter/datascience-notebook ???
# Comment out as necessary
# !pip install iso-639
# !pip install xlsxwriter
# !pip install wordcloud
# !pip install emoji==1.7.0
# !pip install regex

### Used for top_followers // memory efficient
import heapq
### json - Parsing the json file as an object - makes filtering easier
import json
### numpy - data processing functionality
import numpy as np
### os - used for directory and processing
import os
### pickle - used to load the classifier object for bot ID
import pickle
### datetime - processing date time objects
from datetime import datetime
### urlparse - extracting the domain (netloc) from expanded URLs
from urllib.parse import urlparse

from lib import tools
from _write import *
from _return import *

import nltk
### fetch the NLTK WordNet corpus
nltk.download('wordnet')

### positive word list, one entry per line; combined with the negative list
### when searching user bios for collocations
with open('../nltk_data/pstv.txt', 'r') as positive_file:
    positive_words = positive_file.read().splitlines()

### negative word list, one entry per line
with open('../nltk_data/ngtv.txt', 'r') as negative_file:
    negative_words = negative_file.read().splitlines()

def add_user_to_dict(analytics, user_id, name, screen_name, verified):
    """Record a user in analytics['user_info'] the first time it is seen.

    An existing entry for ``user_id`` is left untouched, so the first
    name / screen name / verified flag recorded for a user wins.
    """
    known_users = analytics['user_info']

    ### already recorded: keep the first entry
    if user_id in known_users:
        return

    entry = {}
    entry['user'] = user_id
    entry['name'] = name
    entry['screen_name'] = screen_name
    entry['verified'] = verified
    entry['link_to_profile'] = 'https://twitter.com/' + screen_name
    known_users[user_id] = entry

def insert_retweet_analytics(rt_id, rt_text, user, analytics):
    """Count one retweet of ``rt_id`` and remember its text and author.

    ``user`` is the user object of the retweeted tweet; its 'id_str' is
    counted on every call. The text, the tweet-to-user mapping and the
    user lookup entry are only stored the first time ``rt_id`` is seen.
    """
    retweets = analytics['tweet_retweet_frequencies']

    retweets['retweet_count'].update([rt_id])
    author_id = user['id_str']
    retweets['retweet_user_count'].update([author_id])

    ### text and author are only stored once per retweeted tweet
    if rt_id in retweets['retweet_text']:
        return

    retweets['retweet_text'][rt_id] = rt_text
    retweets['tweet_to_user_mapping'][rt_id] = author_id
    add_user_to_dict(analytics, author_id, user['name'], user['screen_name'], user['verified'])

def insert_quote_analytics(qt_id, analytics):
    """Count one quote of the tweet identified by ``qt_id``."""
    quote_counter = analytics['tweet_quote_frequencies']['quote_count']
    quote_counter.update([qt_id])

def insert_edge(source_id, source_screen_name, target_screen_name, group, analytics_edges):
    """Append a directed edge to the network and count its target.

    Parameters:
        source_id: id of the tweet the edge links to.
        source_screen_name: screen name the edge starts from.
        target_screen_name: screen name the edge points to.
        group: edge label (callers in this file pass 'retweet' or 'quote').
        analytics_edges: the analytics dict; its 'network' entry must hold
            an updatable 'vals' counter and an 'edges' list.
    """
    obj = {
        'source': source_screen_name,
        'target': target_screen_name,
        'group': group,
        'link': 'https://twitter.com/i/web/status/' + str(source_id)
    }
    ### use the dict that was passed in; the previous version ignored the
    ### parameter and silently depended on a global named `analytics`
    network = analytics_edges['network']
    network['vals'].update([target_screen_name])
    network['edges'].append(obj)

def insert_node(screen_name, user_id, analytics):
    """Store (or overwrite) the network node keyed by ``screen_name``.

    Nothing is stored when ``user_id`` is the placeholder 'N/A'.
    Always returns None.
    """
    if user_id != 'N/A':
        analytics['network']['nodes'][screen_name] = dict(
            id=int(user_id),
            screen_name=str(screen_name),
            val=1,
            link_to_profile='https://twitter.com/' + screen_name,
        )
    return None

def populate_prerequisite_data(tweet_data):
    """Classify a tweet and collect the texts and hashtags used downstream.

    Returns the tuple
    (original_tweet, retweet, quote_retweet, analysis_text, main_text,
     secondary_text, primary_text, hashtags).

    For anything other than a quote, analysis_text and main_text are both
    the primary text and secondary_text is ''. For a quote, those three
    come from return_quote_data and the quoted tweet's hashtags (if any)
    are appended to the tweet's own.
    """
    ### define tweet type
    original_tweet, retweet, quote_retweet = return_tweet_type(tweet_data)

    primary_text = return_tweet_text(tweet_data, retweet)
    hashtags = return_hashtag_text(tweet_data, retweet)

    ### no quoted tweet: the primary text is used everywhere
    if not quote_retweet:
        return (original_tweet, retweet, quote_retweet,
                primary_text, primary_text, '', primary_text, hashtags)

    ### quoted tweet: pull in the additional text and hashtags
    quote_data = return_quote_data(tweet_data, primary_text)
    analysis_text, main_text, secondary_text, quoted_hashtags = quote_data
    if quoted_hashtags:
        hashtags = hashtags + quoted_hashtags

    return (original_tweet, retweet, quote_retweet,
            analysis_text, main_text, secondary_text, primary_text, hashtags)

def populate_object(obj, i, main_text, analysis_text, hashtags):
    """Fill the output row ``obj`` in place from the raw tweet ``i``.

    Reads the module-level settings ``bot_prediction``, ``search_terms``,
    ``positive_words`` / ``negative_words`` and, when bot prediction is
    enabled, the loaded classifier ``clf``.

    Parameters:
        obj: row dict to fill.
        i: raw tweet dict; 'user', 'timestamp_ms', 'place', 'id_str' and
            'lang' are read from it.
        main_text: text stored in the row.
        analysis_text: text searched for collocations of the search terms.
        hashtags: hashtags stored in the row.
    """
    user = i['user']
    timestamp_ms = int(i['timestamp_ms'])

    ### prefer the tweet's place name, fall back to the profile location
    place = i['place']
    location = user['location'] if place is None else str(place['name'])
    if location is None:
        location = 'N/A'

    ### share of followers among followers + friends; 0 when both are zero
    followers = user["followers_count"]
    friends = user["friends_count"]
    try:
        ratio_to_followers = followers / (followers + friends)
    except ZeroDivisionError:
        ratio_to_followers = 0

    if bot_prediction:
        screen_name = user['screen_name']
        ### the order of the values is kept exactly as it was originally
        ### NOTE(review): presumably it must match the order clf was trained with - confirm
        feature_values = [
            followers,
            friends,
            user['listed_count'],
            user['statuses_count'],
            len(screen_name),
            sum(c.isdigit() for c in screen_name),
            ratio_to_followers,
        ]
        features = [np.array(feature_values).astype(float)]

    ### timestamp_ms is in milliseconds; fromtimestamp() converts to local time
    created_at = datetime.fromtimestamp(timestamp_ms/1000.0)
    all_sentiment_words = positive_words + negative_words

    obj['user_id'] = user['id_str']
    obj['tweet_id'] = i['id_str']
    obj['name'] = user['name']
    obj['screen name'] = user['screen_name']
    obj['timestamp'] = created_at.strftime('%Y-%m-%d %H:%M:%S')
    obj['text'] = main_text
    obj['bio'] = user['description']
    obj['hashtags'] = hashtags
    obj['link to tweet'] = tools.return_tweet_link(obj['tweet_id'])
    obj['location'] = location
    obj['follower count'] = followers
    obj['friends count'] = friends
    obj['listed count'] = user['listed_count']
    obj['language_twitter'] = i['lang']
    obj['status count'] = user['statuses_count']
    obj['collocations_text'] = return_collocation_list(analysis_text, search_terms)
    obj['collocations_bio'] = return_collocation_list(obj['bio'], all_sentiment_words)
    obj['verified'] = user['verified']
    if bot_prediction:
        ### second column of predict_proba
        obj['bot_prediction'] = clf.predict_proba(features)[:,1]
    else:
        obj['bot_prediction'] = 0

def twitter_parser_new(file, analytics, dataframe):
    """Parse one JSON file of tweets into output rows and running analytics.

    For every tweet a row is appended to ``dataframe`` and the counters,
    frequency tables, network nodes/edges and top-followers heap inside
    ``analytics`` are updated in place.

    Reads the module-level settings ``priority_tweets_in_timestamps``,
    ``top_users_size``, ``buffer_size``, ``to_dir`` and ``save_file_name``
    (plus those used by populate_object).

    Parameters:
        file: path to a JSON file whose top level is iterated entry by entry.
        analytics: analytics dict as created by tools.return_empty_analytics().
        dataframe: list used as the row buffer.

    Fixes relative to the previous version:
        * priority tweets are now actually appended; before, ``.append`` was
          only referenced and the dict on the following line was a separate,
          discarded expression.
        * quote nodes/edges are skipped for the placeholder screen names
          'N/A' and 'undefined'; before, the ``or`` test was always true.
    """
    ### the file is only needed for loading, so close it before processing
    with open(file, 'rb') as tweet_file:
        json_file = json.load(tweet_file)

    ### for every json object in the json
    for tweet in json_file:

        ### if not a valid tweet break
        ### NOTE(review): this abandons the rest of the file, not just this
        ### entry - confirm that `break` (rather than `continue`) is intended
        if (len(tweet) <= 1): break

        ### -------------------------------
        ### -------- Prerequisite ---------
        ### -------------------------------

        ##### create empty dict
        obj = tools.return_empty_JSON()
        original_tweet, retweet, quote_retweet, analysis_text, main_text, secondary_text, primary_text, hashtags = populate_prerequisite_data(tweet)

        ### -------------------------------
        ### --------- Update JSON ---------
        ### -------------------------------

        populate_object(obj, tweet, main_text, analysis_text, hashtags)
        dataframe.append(obj)

        tweet_user = tweet['user']
        timestamp_ms = int(tweet['timestamp_ms'])

        ### -------------------------------
        ### -------- Preprocessing --------
        ### -------------------------------

        tweet_frequencies = return_clean_text(analysis_text)
        tweet_emoji_frequencies = return_emoji_list(analysis_text)
        bio_frequencies = return_clean_text(obj['bio'])
        bio_emoji_frequencies = return_emoji_list(obj['bio'])
        name_emoji_frequencies = return_emoji_list(tweet_user['name'])
        ### keep elements 0 and 2 of every collocation, dropping element 1
        collocations_text_without_search_term = [[c[0], c[2]] for c in obj['collocations_text']]
        collocations_bio_without_search_term = [[c[0], c[2]] for c in obj['collocations_bio']]

        ### -------------------------------
        ### ------ Update Analytics -------
        ### -------------------------------

        analytics['counter']['words'] += len(return_clean_text(analysis_text, False))
        analytics['counter']['tweets'] += 1
        analytics['hashtag_frequencies'].update( obj['hashtags'] )
        analytics['location_frequencies'].update( [obj['location']] )
        analytics['language_frequencies'].update( [obj['language_twitter']] )
        analytics['timeline_frequencies'].append(timestamp_ms)
        analytics['coloc_text_frequency']['contextual'].update(obj['collocations_text'])
        analytics['coloc_bio_frequency']['contextual'].update(obj['collocations_bio'])
        analytics['coloc_text_frequency']['unique'].update(np.unique(collocations_text_without_search_term))
        analytics['coloc_bio_frequency']['unique'].update(np.unique(collocations_bio_without_search_term))
        analytics['emoji_frequency']['text_total'].update(tweet_emoji_frequencies)
        analytics['emoji_frequency']['text_unique'].update(np.unique(tweet_emoji_frequencies))
        analytics['emoji_frequency']['bio_total'].update(bio_emoji_frequencies)
        analytics['emoji_frequency']['bio_unique'].update(np.unique(bio_emoji_frequencies))
        analytics['emoji_frequency']['name_total'].update(name_emoji_frequencies)
        analytics['emoji_frequency']['name_unique'].update(np.unique(name_emoji_frequencies))
        analytics['word_frequency']['total'].update(tweet_frequencies)
        analytics['word_frequency']['unique'].update(np.unique(tweet_frequencies))
        analytics['bio_word_frequency'].update(np.unique(bio_frequencies))

        ### add users to the lookup dict used at later analysis stages
        add_user_to_dict(analytics, obj['user_id'], obj['name'], obj['screen name'], obj['verified'])

        ### check if Tweet contains video or photos
        if 'media' in tweet['entities']:
            analytics['counter']['with_media'] += 1
            for media in tweet['entities']['media']:
                analytics['media']['link'][media['media_url_https']] = media['expanded_url']
                analytics['media']['count'].update( [media['media_url_https']] )

        ### does the tweet contain a URL
        if 'urls' in tweet['entities']:
            if len(tweet['entities']['urls']) != 0:
                analytics['counter']['with_url'] += 1
                for url in tweet['entities']['urls']:
                    ### avoid other Twitter replies URLs
                    if 'twitter.com' not in url['expanded_url']:
                        ### the key 'unqiue' (sic) is what the analytics dict uses
                        analytics['URLS']['unqiue'].update([url['expanded_url']])
                        analytics['URLS']['domain'].update([urlparse(url['expanded_url']).netloc])

        ### select node generation
        insert_node(tweet_user['screen_name'], tweet_user['id_str'], analytics)
        if retweet:
            retweeted = tweet['retweeted_status']
            analytics['counter']['retweets'] += 1
            insert_node(retweeted['user']['screen_name'], retweeted['user']['id_str'], analytics)
            insert_edge(retweeted['id'], tweet_user['screen_name'], retweeted['user']['screen_name'], 'retweet', analytics)
            insert_retweet_analytics(retweeted['id'], obj['text'], retweeted['user'], analytics)
        elif quote_retweet:
            analytics['counter']['quote_retweets'] += 1
            ### the quoted screen name is the second whitespace-separated token
            ### of main_text with its first and last characters removed
            qt_screen_name = main_text.split()[1][1:-1]
            ### difference between how API 1.1 and 2.0 handles quotes
            try:
                quoted_id = tweet['quoted_status']['id']
            except KeyError:
                quoted_id = tweet['quoted_status_id']
            ### skip placeholder names
            if qt_screen_name not in ('N/A', 'undefined'):
                ### TO-DO: Figure out how quote users IDs can be added here
                insert_node(qt_screen_name, 0, analytics)
                insert_edge(tweet['id'], tweet_user['screen_name'], qt_screen_name, 'quote', analytics)
            ### TO-DO: Figure out how quote users can be added here
            insert_quote_analytics(quoted_id, analytics)

        ### increment count
        ### 'N/A' is reserved for tweets that had no user data
        if obj['user_id'] != 'N/A':
            analytics['user_frequencies']['user_unique_tweet_count'].update( [ obj['user_id'] ] )

        ### add original tweets that fall inside a priority window to a timeline
        if original_tweet:
            analytics['counter']['original_tweets'] += 1
            for times in priority_tweets_in_timestamps:
                ### window is [start, end) in milliseconds
                if times[0] <= timestamp_ms < times[1]:
                    analytics['priority_tweets_from_timestamps'].append(
                        {
                            'tweet id': obj['tweet_id'],
                            'text': obj['text'],
                            'user_id': obj['user_id']
                        }
                    )

        ### top users via the amount of followers they have using a heap
        ### the heap has the drawback that it may miss a few followers
        ### gained / lost during data collection
        top_users_via_followers = analytics['user_frequencies']['top_users_via_followers']
        if len(top_users_via_followers) < top_users_size:
            if return_duplicate_check(top_users_via_followers, obj['user_id']) == False:
                heapq.heappush(top_users_via_followers, (obj['follower count'], obj['user_id']))
        else:
            ### heap is full: only replace its smallest entry with a larger one
            smallest_follower_count = heapq.nsmallest(1, top_users_via_followers)[-1][0]
            if smallest_follower_count < obj['follower count']:
                if return_duplicate_check(top_users_via_followers, obj['user_id']) == False:
                    heapq.heappushpop(top_users_via_followers, (obj['follower count'], obj['user_id']))

        ### -------------------------------
        ### --------- Empty Buffer --------
        ### -------------------------------

        ### if buffer is beyond buffer_size; write to file
        ### NOTE(review): presumably write_to_file also empties `dataframe` - verify
        if (len(dataframe) >= buffer_size):
            write_to_file(dataframe, to_dir, save_file_name)

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
    ### configuration files to process; each one triggers a full analysis run
    all_files = [
        '../analysis_config/event/example.json',
    ]

    for file in all_files:
        ### all user variables that require changing for each run
        ### (opened with a context manager so the handle is always closed)
        with open(file) as config_file:
            variables = json.load(config_file)
        ### -------------------------------------
        ### directory that is scanned for input .json files
        from_dir = variables['from_dir']
        ### output location handed to the write_* helpers
        to_dir = variables['to_dir']
        ### output file name handed to write_to_file
        save_file_name = variables['save_file_name']
        ### the amount of CSV rows files each file output
        buffer_size = variables['buffer_size']
        ### the amount of retweets to write to file
        retweet_size = variables['retweet_size']
        ### maximum size of the top-users-by-followers heap
        top_users_size = variables['top_users_size']
        ### whether to score each user with the bot classifier
        bot_prediction = variables['bot_prediction']
        ### in milliseconds
        priority_tweets_in_timestamps = variables['priority_tweets_in_timestamps']
        ### terms searched for when building text collocations
        search_terms = variables['search_terms']
        ### -------------------------------------

        ### do not recommend this feature in the current state
        ### NOTE: pickle.load can execute arbitrary code; only load a
        ### clf.pickle that comes from a trusted source
        if bot_prediction:
            with open('clf.pickle', 'rb') as f:
                clf = pickle.load(f)

        analytics = tools.return_empty_analytics()
        dataframe = []

        ### os.path.join gives the same path as plain concatenation when
        ### from_dir ends with a separator, and a correct one when it does not
        json_files = [os.path.join(from_dir, pos_json) for pos_json in os.listdir(from_dir) if pos_json.endswith('.json')]
        num_of_file = len(json_files)

        for count, f in enumerate(json_files):
            # try:
            # print('{}/{} - {}'.format(count+1, num_of_file, f))
            twitter_parser_new(f, analytics, dataframe)
            # except ValueError:  # includes simplejson.decoder.JSONDecodeError:
            #     print ('JSON decode error')
            #     pass

        ### write out whatever is still in the buffer, then the analytics
        write_to_file(dataframe, to_dir, save_file_name)
        write_analytics_files(analytics, to_dir, retweet_size, top_users_size)

  df = pd.read_csv(to_dir + file, header=0, index_col=0, parse_dates=True)
  df = pd.read_csv(to_dir + file, header=0, index_col=0, parse_dates=True)
  df = pd.read_csv(to_dir + file, header=0, index_col=0, parse_dates=True)
