## Filter Twitter data to add new tweets.

This notebook reads a Google Sheet containing the tweets that have already been annotated, and compares it with a dump of tweets in JSON format to find which tweets should be added to the sheet.



In [1]:
import io
import json
from typing import List
from urllib.parse import quote

import pandas as pd

from ReadTweetsFromJson import get_tweets_from_json_file

Details of the sheet used for annotating tweets.

In [2]:
# ID of the Google spreadsheet; it is inserted into the CSV export URL that
# get_already_processed() builds.
KEY = '1qKYUxq_UrUFJy45r5KGuXp6gq-2ZF_VDb6-e7tWagwA'
# Name of the sheet (tab) within that spreadsheet to read.
SHEET_NAME = 'Analysis'

Helper functions for reading the existing tweets in the sheet, and filtering new ones that have not yet been included.

In [46]:
def get_already_processed(key: str, sheet_name: str) -> set:
    '''Returns the IDs of tweets that are already in the analysis.

    Downloads one sheet of a Google spreadsheet as CSV and collects the
    values found in its 'Tweet ID' column.

    Args:
        key: ID of the spreadsheet, as used in its URL after '/d/'.
        sheet_name: Name of the sheet (tab) to read. It is URL-encoded, so
            names containing spaces or other special characters work.

    Returns:
        A set of tweet IDs as Python ints. Blank cells are skipped.

    Raises:
        KeyError: If the sheet has no 'Tweet ID' column.
    '''
    csv_url = (f'https://docs.google.com/spreadsheets/d/{key}'
               f'/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe="")}')
    # Read the IDs as text. Tweet IDs are larger than 2**53, so if pandas
    # falls back to float64 (which happens as soon as one cell is blank) they
    # are rounded and no longer match the IDs in the JSON dump.
    df = pd.read_csv(csv_url, dtype={'Tweet ID': str})

    def to_int(id_text: str) -> int:
        try:
            return int(id_text)
        except ValueError:
            # Cell was exported in float/scientific notation; best effort.
            return int(float(id_text))

    already_processed = set()
    for id_text in df['Tweet ID'].dropna():
        id_text = id_text.strip()
        if id_text:
            already_processed.add(to_int(id_text))
    return already_processed

def is_interesting(tweet: dict) -> bool:
    '''Applies simple filtering criteria to a tweet.

    A tweet is interesting when it has non-empty text ('full_text' or
    'text'), is not a retweet (has no 'retweeted_status' key) and contains
    no URLs.

    Args:
        tweet: A tweet as a dict.

    Returns:
        True if the tweet passes all criteria, False otherwise.
    '''
    text = tweet.get('full_text') or tweet.get('text')
    if not text:
        return False
    is_retweet = 'retweeted_status' in tweet
    # 'entities' or its 'urls' list may be missing or null; treat that as
    # "no URLs" rather than failing on the whole dump.
    urls = (tweet.get('entities') or {}).get('urls') or []
    contains_url = len(urls) > 0
    return not (is_retweet or contains_url)
    
def get_new_tweets(tweets: List[dict], already_processed: set) -> pd.DataFrame:
    '''Returns the tweets which are interesting and not yet included.

    Args:
        tweets: Tweets as dicts with 'id', 'created_at' and 'full_text' or
            'text'.
        already_processed: IDs of tweets to leave out. The set is only read;
            a private copy is used to drop duplicates within `tweets`, so the
            function gives the same result when called again.

    Returns:
        A DataFrame with columns 'id' (int64), 'text' and 'time', one row per
        new tweet in the order encountered. It is empty, with the same
        columns, when there are no new tweets.
    '''
    seen = set(already_processed)
    records = []
    for tweet in tweets:
        if not is_interesting(tweet) or tweet['id'] in seen:
            continue
        records.append(
            {'id': int(tweet['id']),
             'text': tweet.get('full_text') or tweet.get('text'),
             'time': tweet['created_at']})
        seen.add(tweet['id'])
    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    new_tweets = pd.DataFrame(records, columns=['id', 'text', 'time'])
    return new_tweets.astype({'id': 'int64'})
            
def get_excluded_tweets(tweets: List[dict]) -> pd.DataFrame:
    '''Returns all tweets that were excluded.

    Args:
        tweets: Tweets as dicts with 'id' and 'created_at', and optionally
            'full_text' or 'text'.

    Returns:
        A DataFrame with columns 'id' (int64), 'text' and 'time', one row per
        tweet rejected by is_interesting(), in the order encountered. It is
        empty, with the same columns, when nothing was excluded.
    '''
    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    records = [
        {'id': int(tweet['id']),
         'text': tweet.get('full_text') or tweet.get('text'),
         'time': tweet['created_at']}
        for tweet in tweets
        if not is_interesting(tweet)
    ]
    excluded_tweets = pd.DataFrame(records, columns=['id', 'text', 'time'])
    return excluded_tweets.astype({'id': 'int64'})

Process a dump of tweets to find the new tweets to be added, and those which were discarded as not interesting. The resulting tables are written to CSV files in the final cell.

In [47]:
# Path of the JSON dump of tweets to compare against the spreadsheet.
JSON_PATH = 'mask_tweets_v4.json'

# Load the dump, fetch the IDs already present in the sheet, then split the
# dump into tweets still to be added and tweets rejected by the filter.
tweets = get_tweets_from_json_file(JSON_PATH)
already_processed = get_already_processed(KEY, SHEET_NAME)
new_tweets = get_new_tweets(tweets, already_processed)
excluded_tweets = get_excluded_tweets(tweets)
# The CSV files are written in the last cell, after the columns have been
# re-ordered to match the spreadsheet.

In [48]:
# Preview the tweets selected for addition to the spreadsheet.
new_tweets

Unnamed: 0,id,text,time
0,1265235187094433793,Put that mask on. \nLet's save ourselves and t...,Tue May 26 10:55:50 +0000 2020
1,1265234680439242752,@KagutaMuseveni has arrived in Mukono for the ...,Tue May 26 10:53:49 +0000 2020
2,1265233429655891971,Among the things we should avoid even as we be...,Tue May 26 10:48:51 +0000 2020
3,1265229010361028616,All is set for @KagutaMuseveni to launch LIDA ...,Tue May 26 10:31:17 +0000 2020
4,1265227531256508417,Schools quietly planning to add face mask fees...,Tue May 26 10:25:24 +0000 2020
...,...,...,...
102,1265266482797215746,"“No mask, No Entry” at Kabale Regional Referra...",Tue May 26 13:00:11 +0000 2020
103,1265263818898313221,Face Masks - The Don’ts \n\n-Do NOT pick masks...,Tue May 26 12:49:36 +0000 2020
104,1265262140149153793,"Stay healthy, stay positive and keep going but...",Tue May 26 12:42:56 +0000 2020
105,1265244250880557056,@MinofHealthUG issued guidelines for the use o...,Tue May 26 11:31:51 +0000 2020


In [51]:
# Re-order columns to match order in spreadsheet
cols = ['id', 'time', 'text']
new_tweets = new_tweets[cols]
# Select from excluded_tweets itself; taking new_tweets[cols] here would write
# the new tweets into excluded.csv.
excluded_tweets = excluded_tweets[cols]

# Create csv files without the index column for easy merging with the spreadsheet
new_tweets.to_csv('new.csv', index=False)
excluded_tweets.to_csv('excluded.csv', index=False)