In [1]:
# !pip install git+https://github.com/deamonpog/gsdmm.git
# !pip install wordcloud
# !pip install --upgrade gensim
# !pip install --upgrade s3fs
# !pip install --upgrade boto3
# !pip install --upgrade numexpr
# !pip install --upgrade pandas

In [2]:
import string
import re
import html
import json
import gzip
import datetime
import itertools
import glob
import numpy as np
import pandas as pd

from wordcloud import WordCloud

import matplotlib.pyplot as plt

In [3]:
import gensim
from gsdmm import MovieGroupProcess

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources in the same order as before:
# WordNet data (wordnet, omw-1.4), the punkt resource, then the
# POS-tagging resources (averaged_perceptron_tagger, tagsets).
for nltk_resource in ("wordnet", "omw-1.4", "punkt", "averaged_perceptron_tagger", "tagsets"):
    nltk.download(nltk_resource)

# Stop-word lists; kept as the final expression so the cell still echoes
# the return value of this last download call.
nltk.download("stopwords")

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/ec2-user/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# S3-specific libraries
import boto3
from s3fs import S3FileSystem

s3 = boto3.resource("s3")

# Import the class directly instead of `import s3fs` followed by
# `s3fs = s3fs.S3FileSystem(...)`: the old form rebound the module name to the
# instance, so running this cell a second time raised AttributeError.
# The name `s3fs` still ends up bound to the filesystem object, as before.
s3fs = S3FileSystem(anon=False)

# data is at s3://mips-main-tests/chathura_tests/cache2/*cache*.json.gzip

In [6]:
from time_keeper import TimeKeeper

In [7]:
# Unique English stop words from the NLTK stopwords corpus, held in a set.
english_stop_words = {*stopwords.words("english")}

In [8]:
# Local cache files matching the pattern, in sorted path order.
data_files = sorted(glob.glob("./cache2_data/*cache*.json.gzip"))
len(data_files)

29317

In [9]:
class Maybe:
    """Null-safe navigator over decoded JSON (nested dicts and lists).

    Every navigation step returns another ``Maybe``; a missing key, an
    out-of-range index or a value of the wrong container type yields
    ``Maybe(None)`` instead of raising, so lookups can be chained freely and
    unwrapped once with ``value()``.
    """

    def __init__(self, json_object):
        # The wrapped value; may be None, a dict, a list or any scalar.
        self.json_object = json_object

    def field(self, field):
        """Return ``Maybe(obj[field])`` if the wrapped value is a dict containing ``field``, else ``Maybe(None)``."""
        obj = self.json_object
        if isinstance(obj, dict) and field in obj:
            return Maybe(obj[field])
        return Maybe(None)

    def index(self, index):
        """Return ``Maybe(obj[index])`` if the wrapped value is a list and ``index`` is in range, else ``Maybe(None)``.

        Negative indices are accepted with the usual Python meaning; an
        out-of-range negative index yields ``Maybe(None)`` rather than raising
        IndexError.
        """
        obj = self.json_object
        if isinstance(obj, list) and -len(obj) <= index < len(obj):
            return Maybe(obj[index])
        return Maybe(None)

    def array(self, func=lambda m: m, as_type=list):
        """Map ``func`` over the wrapped list and convert the result with ``as_type``.

        When the wrapped value is not a list, the result is ``as_type([])`` so
        the return type is the same on both paths (e.g. ``"[]"`` for
        ``as_type=str``, ``[]`` for the default ``list``).
        """
        if isinstance(self.json_object, list):
            return as_type([func(obj) for obj in self.json_object])
        return as_type([])

    def value(self):
        """Return the wrapped value unchanged (possibly None)."""
        return self.json_object


In [10]:
def read_local_text_data_raw(file_path):
    """Load a gzip-compressed, UTF-8 encoded JSON file and return the decoded object."""
    with gzip.open(file_path, "rt", encoding="utf-8") as json_stream:
        return json.load(json_stream)


def read_local_text_data_filtered(file_path):
    """Read one gzip-compressed JSON cache file and pull out the two parts used downstream.

    Returns a tuple ``(data, places)`` where ``data`` is the value under the
    top-level ``"data"`` key and ``places`` is the value under
    ``"includes" -> "places"``. Either element is None when its key path is
    absent from the file.
    """
    # Reuse the raw loader instead of duplicating the gzip/JSON decoding here.
    payload = Maybe(read_local_text_data_raw(file_path))
    data = payload.field("data").value()
    places = payload.field("includes").field("places").value()
    return data, places

In [11]:
# Seed NumPy's legacy global random state with a fixed value.
# NOTE(review): nothing in the cells visible here draws from it — presumably
# it is meant for later randomized steps; confirm.
np.random.seed(123)

In [12]:
def read_tweets_and_places(in_data_files):
    """Read every cache file and merge the results.

    Returns ``(all_tweets, place_dict)``: a flat list of the tweet records
    from all files, and a dict mapping each place ``"id"`` to its
    ``"full_name"``. A file with no tweet data is reported on stdout and
    skipped; a file with no places is skipped silently.
    """
    tweet_records = []
    place_records = []
    for path in in_data_files:
        file_tweets, file_places = read_local_text_data_filtered(path)
        if file_tweets is None:
            print(f"Tweets is None for {path}")
        else:
            tweet_records += file_tweets
        if file_places is not None:
            place_records += file_places

    # Built after all files are read; a later occurrence of an id overwrites an earlier one.
    place_names_by_id = {}
    for place in place_records:
        place_names_by_id[place["id"]] = place["full_name"]
    return tweet_records, place_names_by_id

In [13]:
%%time
# Load everything into memory. `places` is the place-id -> full_name dict
# returned by read_tweets_and_places(); get_columns() looks it up as a
# notebook-level global, so this cell must run before the DataFrame is built.
tweets, places = read_tweets_and_places(data_files)

Tweets is None for ./cache2_data/querylist_cache_50_403.json.gzip
CPU times: user 5min 32s, sys: 18 s, total: 5min 50s
Wall time: 22min 7s


In [14]:
# Should be equal to following
expected_all_fields = {
    'attachments',
    'author_id',
    'conversation_id',
    'created_at',
    'edit_history_tweet_ids',
    'entities',
    'geo',
    'id',
    'in_reply_to_user_id',
    'public_metrics',
    'referenced_tweets',
    'text',
    'withheld',
}
# Stream the keys of every tweet into one set. The previous form built a list
# holding one set per tweet and star-unpacked it into union(), which needs
# memory proportional to the number of tweets; this gives the same result
# without materializing those intermediates.
possible_fields = set(itertools.chain.from_iterable(tweets))
print(possible_fields == expected_all_fields)
possible_fields

True


{'attachments',
 'author_id',
 'conversation_id',
 'created_at',
 'edit_history_tweet_ids',
 'entities',
 'geo',
 'id',
 'in_reply_to_user_id',
 'public_metrics',
 'referenced_tweets',
 'text',
 'withheld'}

In [15]:
expected_common_fields = {
    'author_id',
    'conversation_id',
    'created_at',
    'edit_history_tweet_ids',
    'id',
    'public_metrics',
    'text',
}
# Start from a copy of every observed field and narrow it tweet by tweet.
# Same result as possible_fields.intersection(*[set(...) for each tweet]) but
# without building one temporary set per tweet and unpacking them all at once.
common_fields = set(possible_fields)
for tweet in tweets:
    common_fields.intersection_update(tweet.keys())
print(common_fields == expected_common_fields)
common_fields

True


{'author_id',
 'conversation_id',
 'created_at',
 'edit_history_tweet_ids',
 'id',
 'public_metrics',
 'text'}

In [16]:
# Column order here must match the order of the values returned by get_columns().
column_names = ['id',
                'conversation_id',
                'edit_history_tweet_ids',
                'author_id',
                'created_at',
                'text',
                'impression_count',
                'like_count',
                'quote_count',
                'reply_count',
                'retweet_count',
                'quoted',
                'replied_to',
                'retweeted',
                'in_reply_to_user_id',
                'geo',
                'mentions']


def get_columns(tweet_json, place_names=None):
    """Flatten one tweet record into a list of values aligned with ``column_names``.

    Parameters
    ----------
    tweet_json : dict
        A tweet record. The keys ``id``, ``conversation_id``,
        ``edit_history_tweet_ids``, ``author_id``, ``created_at``, ``text``
        and ``public_metrics`` (with its five count keys) are accessed
        directly and must be present.
    place_names : dict, optional
        Mapping of place id -> place full name used to fill the ``geo``
        column. Defaults to the notebook-level ``places`` dict, which keeps
        existing single-argument calls working.

    Returns
    -------
    list
        17 values in ``column_names`` order. Optional fields that are absent
        yield ``None`` (``in_reply_to_user_id``, ``geo``) or the string
        ``"[]"`` (``quoted``, ``replied_to``, ``retweeted``, ``mentions``).

    Raises
    ------
    KeyError
        If one of the mandatory keys listed above is missing.
    """
    if place_names is None:
        # Backward-compatible default: fall back to the global built when the data was loaded.
        place_names = places

    tweet = Maybe(tweet_json)

    # Group referenced tweet ids by reference type; unknown types are ignored.
    referenced_ids = {"quoted": [], "replied_to": [], "retweeted": []}
    for ref_tweet in tweet.field("referenced_tweets").array():
        bucket = referenced_ids.get(ref_tweet["type"])
        if bucket is not None:
            bucket.append(ref_tweet["id"])

    metrics = tweet_json["public_metrics"]
    place_id = tweet.field("geo").field("place_id").value()

    columns_values = [
        # tweet always has following keys
        tweet_json["id"],
        tweet_json["conversation_id"],
        tweet_json["edit_history_tweet_ids"],  # list of tweetIds
        tweet_json["author_id"],
        tweet_json["created_at"],
        tweet_json["text"],
        metrics["impression_count"],
        metrics["like_count"],
        metrics["quote_count"],
        metrics["reply_count"],
        metrics["retweet_count"],
        # optional tweet data fields
        str(referenced_ids["quoted"]),
        str(referenced_ids["replied_to"]),
        str(referenced_ids["retweeted"]),
        tweet.field("in_reply_to_user_id").value(),
        # Full place name, or None when the tweet has no geo/place_id or the id is unknown.
        Maybe(place_names).field(place_id).value(),
        # Always a string, like the three reference columns above (it used to
        # be a bare list when the tweet had no mentions).
        str(tweet.field("entities").field("mentions").array(lambda m: m["id"])),
        # Maybe(tweet_json).field("attachments") # we dont take this field at the moment
    ]
    return columns_values


In [17]:
%%time
tdf = pd.DataFrame([get_columns(tw) for tw in tweets],  columns=column_names, dtype=str)
print(tdf.shape)
tdf.drop_duplicates(subset="id", inplace=True)
print(tdf.shape)
tdf.set_index("id", inplace=True)
tdf["created_at"] = tdf["created_at"].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f%z"))
tdf.sort_values("created_at", inplace=True)
tdf

(14060535, 17)
(14060535, 17)
CPU times: user 6min 13s, sys: 10.8 s, total: 6min 24s
Wall time: 6min 24s


Unnamed: 0_level_0,conversation_id,edit_history_tweet_ids,author_id,created_at,text,impression_count,like_count,quote_count,reply_count,retweet_count,quoted,replied_to,retweeted,in_reply_to_user_id,geo,mentions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1465908518532182018,1465908518532182018,['1465908518532182018'],14885540,2021-12-01 05:00:00+00:00,While it's too early to know how the omicron v...,0,3,0,0,1,[],[],[],,,[]
1465908518356074502,1465908518356074502,['1465908518356074502'],13719342,2021-12-01 05:00:00+00:00,"According to the Associated Press, the pop sta...",0,0,0,0,1,[],[],[],,,[]
1465908518804881408,1465908518804881408,['1465908518804881408'],3301782494,2021-12-01 05:00:00+00:00,RT @EileenParkTV: https://t.co/qaVghYz2Ek,0,0,0,0,3,[],[],['1465892768052486154'],,,[]
1465908522109992964,1465908522109992964,['1465908522109992964'],21265939,2021-12-01 05:00:01+00:00,RT @jimmytcannon: This is so fucking predatory...,0,0,0,0,10,[],[],['1465895325135253509'],,,['4705865785']
1465908523209011209,1465908523209011209,['1465908523209011209'],501397421,2021-12-01 05:00:01+00:00,RT @B52Malmet: Dr. Oz is not a “celebrity surg...,0,0,0,0,2112,[],[],['1465860012404723713'],,,['2876041031']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1509757430414000132,1509757430414000132,['1509757430414000132'],199056422,2022-04-01 04:59:56+00:00,"RT @KariLake: The crowd here in Pearce, AZ is ...",0,0,0,0,359,[],[],['1509722039074009109'],,,['1082197856']
1509757437397458948,1509757437397458948,['1509757437397458948'],1158841136439427072,2022-04-01 04:59:57+00:00,RT @sandibachom: Reminder. They finally capped...,0,0,0,0,4472,[],[],['1509743297933291520'],,,['17027632']
1509757435950510080,1509757435950510080,['1509757435950510080'],3262551006,2022-04-01 04:59:57+00:00,RT @lindyli: 193 Republicans voted NO on lower...,0,0,0,0,2177,[],[],['1509727862475083781'],,,['270132611']
1509757436462346262,1509757436462346262,['1509757436462346262'],126075164,2022-04-01 04:59:57+00:00,RT @gnauski: Australia’s first offshore wind p...,0,0,0,0,2,[],[],['1509754010567573505'],,,"['978241567239553024', '456230845']"


In [18]:
# One column fewer than at construction time: "id" was moved into the index.
tdf.shape

(14060535, 16)

In [19]:
%%time
# Persist the full frame. to_csv is called with its defaults, so the "id"
# index is written out along with the 16 data columns.
tdf.to_csv("all_data_df_v1.csv")

CPU times: user 3min 20s, sys: 3.6 s, total: 3min 23s
Wall time: 3min 56s


In [20]:
# Write only the "text" column, with no header row and no index column.
tdf["text"].to_csv(
    "all_text_df_v1.csv",
    index=False,
    header=False,
)