In [1]:
# Overview:
#
# Preprocess the Twitter COVID .json file (set of tweets) into a "data" file
# for NLP and make that data file availabe for others via google drive.
#
# To do this:
# The twitter file is .json format, resident on the 12G disk.
# This gets copied (just file expplorer) to google drive
# folder "My Drive/colab_datasets" https://drive.google.com/drive/folders/1tDPMhQoyxrW01Evcrt3f6l_zDOWhGBa8?usp=share_link
# The script here opens that local .json file,
# preprocesses it and will write out preprocessed data.txt file to that same google folder.
# 
# So - this script:
# Logs into google account
# Read/process the .json file writing to a colab-local file
# upload the preprocessed colab-local file to drive folder


In [2]:
!pip install ijson

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# python standard library imports
# from-imports follow regular imports
import re
import random
from math import sqrt
from collections import Counter
import datetime

# external libraries imports, grouped semantically
import torch
import numpy as np

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from IPython.display import clear_output, display

import ijson

In [4]:
import pandas as pd

In [5]:
# authenticate to google drive

# reference https://towardsdatascience.com/different-ways-to-connect-google-drive-to-a-google-colab-notebook-pt-1-de03433d2f7a
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [6]:
# get the twitter data from cwinsor's google drive

# ID of the raw/large .json file (original twitter data)
fid_rd = "1ABfAhA6T1F1qlRLV_f1uqbLQsQ5MrtyG"
drive_file = drive.CreateFile({'id': fid_rd})

# Download the file to a local disc
local_filename = "covid_tweets_raw.json"
drive_file.GetContentFile(local_filename)
# df  = pd.read_csv("file.csv")
# df.head()

In [7]:
# read the entire file into a python array
with open(local_filename, 'rb') as f:
    data = f.readlines()

# # remove the trailing "\n" from each line
# data = map(lambda x: x.rstrip(), data)

In [9]:
# df = pd.DataFrame.from_records(data[0:10])

In [12]:
df = pd.json_normalize(data[0:10])

In [13]:


with open(local_filename, "rb") as f:
    objects = ijson.items(f, "", multiple_values=True)
    obj_num = 0
    for object in objects:
        obj_num += 1

        print(obj_num)
        
        if (found_reply is False) and (object["in_reply_to_status_id"] is not None):
            found_reply = True
            print_item(object, "reply")
        
        if (found_retweet is False) and (object['retweeted_status'] is not None):
            found_retweet = True
            print_item(object, "retweet")

        if (found_original is False) and (object["in_reply_to_status_id"] is None) and (object['retweeted_status'] is None):
            found_original = True
            print_item(object, "original")

        if (found_reply and found_retweet):
            break

        # raw_list.append(object["full_text"] + " ")

0
1
2
3
4
5
6
7
8
9


In [23]:
df = pd.json_normalize(data[0:1])

In [24]:
df


0


In [11]:
data[0]

b'{"created_at":"Sat Feb 01 06:19:40 +0000 2020","id":1223491082425684000,"id_str":"1223491082425683968","full_text":"RT @SenTomCotton: The coronavirus has become a global pandemic, yet the Chinese Communist Party continues to play politics by blocking Taiw\xe2\x80\xa6","truncated":false,"display_text_range":[0,140],"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"SenTomCotton","name":"Tom Cotton","id":968650362,"id_str":"968650362","indices":[3,16]}],"urls":[]},"source":"<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>","in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1162034666729140200,"id_str":"1162034666729140224","name":"CKL","screen_name":"sophia_ckl","location":"Hong Kong","description":"Go with the flow #followbackhongkong","url":null,"entities":{"description":{"urls":[]}},"protected":false,"follower

In [10]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10725,10726,10727,10728,10729,10730,10731,10732,10733,10734
0,123,34,99,114,101,97,116,101,100,95,...,,,,,,,,,,
1,123,34,99,114,101,97,116,101,100,95,...,110.0,103.0,34.0,58.0,34.0,101.0,110.0,34.0,125.0,10.0
2,123,34,99,114,101,97,116,101,100,95,...,,,,,,,,,,
3,123,34,99,114,101,97,116,101,100,95,...,,,,,,,,,,
4,123,34,99,114,101,97,116,101,100,95,...,,,,,,,,,,


In [11]:
data_df = pd.read_json(data)

ValueError: ignored

In [10]:
# each element of 'data' is an individual JSON object.
# i want to convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object
# basically... add square brackets to the beginning
# and end, and have all the individual business JSON objects
# separated by a comma
data_json_str = "[" + ','.join(data) + "]"

# now, load it into pandas
data_df = pd.read_json(data_json_str)

TypeError: ignored

In [None]:
df = pd.read_json(local_filename,  lines=True)


In [25]:
def print_item(obj, msg):
    print(f"------------- {msg} -----------------")
    for k, v in obj.items():
        print(f"{k} {v}")

In [1]:
df.summary()

NameError: ignored

In [28]:
raw_list = []
found_reply = False
found_retweet = False
found_original = False


with open(local_filename, "rb") as f:
    objects = ijson.items(f, "", multiple_values=True)
    obj_num = 0
    for object in objects:
        obj_num += 1

        print(obj_num)
        
        if (found_reply is False) and (object["in_reply_to_status_id"] is not None):
            found_reply = True
            print_item(object, "reply")
        
        if (found_retweet is False) and (object['retweeted_status'] is not None):
            found_retweet = True
            print_item(object, "retweet")

        if (found_original is False) and (object["in_reply_to_status_id"] is None) and (object['retweeted_status'] is None):
            found_original = True
            print_item(object, "original")

        if (found_reply and found_retweet):
            break

        # raw_list.append(object["full_text"] + " ")

print("number of tweets: ", len(raw_list))

1
------------- retweet -----------------
created_at Sat Feb 01 06:19:40 +0000 2020
id 1223491082425684000
id_str 1223491082425683968
full_text RT @SenTomCotton: The coronavirus has become a global pandemic, yet the Chinese Communist Party continues to play politics by blocking Taiw…
truncated False
display_text_range [0, 140]
entities {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'SenTomCotton', 'name': 'Tom Cotton', 'id': 968650362, 'id_str': '968650362', 'indices': [3, 16]}], 'urls': []}
source <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
in_reply_to_status_id None
in_reply_to_status_id_str None
in_reply_to_user_id None
in_reply_to_user_id_str None
in_reply_to_screen_name None
user {'id': 1162034666729140200, 'id_str': '1162034666729140224', 'name': 'CKL', 'screen_name': 'sophia_ckl', 'location': 'Hong Kong', 'description': 'Go with the flow #followbackhongkong', 'url': None, 'entities': {'description': {'urls': []}}, 'protec

KeyError: ignored

In [None]:
for l in raw_list[:1]:
  print(l)

RT @SenTomCotton: The coronavirus has become a global pandemic, yet the Chinese Communist Party continues to play politics by blocking Taiw… 


In [None]:
t_num = 5

def chunkstring(string, length):
    return (string[0+i:length+i] for i in range(0, len(string), length))

chunks = chunkstring(raw_list[t_num], 50)
for chunk in chunks:
    print(chunk)


Normala Sister talkin of viability gap funding , P
PP for creating more hospitals , medical backbone 


But Government of India 🇮🇳 asking @adgpi &amp; s
ister orga with scarce resources to prepare to bat
tle #coronavirus 

#Budget2020 

@fayedsouza 10s o
f orgs set up for emergencies 🤣😭 


In [None]:
# take enough tweets to end up with about 50k words of filtered data...
cutoff = len(raw_list) // 40
# cutoff = len(raw_list) // 20
print("number of tweets after cutoff", cutoff)
raw_data = ''.join(raw_list[:cutoff])
print("number of characters after cutoff: ", len(raw_data))

number of tweets after cutoff 5224
number of characters after cutoff:  696141


In [None]:
local_filename = "covid_tweets_preprocessed2.txt"
f = open(local_filename, "w")
f.write(raw_data)
f.close()

In [None]:
# Upload preprocessed twitter data to google drive

# ID of the preprocessed twitter data on google drive
folder_id_wt = "1tDPMhQoyxrW01Evcrt3f6l_zDOWhGBa8"

f = drive.CreateFile({"parents": [{"kind": "drive#fileLink", "id": folder_id_wt}]})
f.SetContentFile( local_filename )
f.Upload()
print("done")

done
