In [40]:
# Overview:
#
# For UML COMP 5300 (NLP) homework 2 we want to preprocess a large
# Twitter COVID .json file (a set of tweets) into an NLP "data" file
# and make it available to others via Google Drive.
#
# To do this:
# The twitter file is .json format, resident on the 12G disk.
# This gets copied (just using the file explorer) to the google drive
# folder "My Drive/colab_datasets" https://drive.google.com/drive/folders/1tDPMhQoyxrW01Evcrt3f6l_zDOWhGBa8?usp=share_link
# The script here opens that private .json file,
# preprocesses it and will write out preprocessed data.txt file to that same google folder.
# 
# So - this script:
# Logs into the google account
# Reads/processes the .json file, writing to a colab-local file
# Uploads the preprocessed colab-local file to the drive folder


In [41]:
!pip install ijson

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
# python standard library imports
# from-imports follow regular imports
import re
import random
from math import sqrt
from collections import Counter
import datetime

# external libraries imports, grouped semantically
import torch
import numpy as np

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from IPython.display import clear_output, display

import ijson

In [43]:
# Authenticate this Colab session to Google and build the PyDrive client
# that the later cells use (as `drive`) to download and upload files.

# reference https://towardsdatascience.com/different-ways-to-connect-google-drive-to-a-google-colab-notebook-pt-1-de03433d2f7a
from pydrive.auth import GoogleAuth
# NOTE(review): the `drive` module imported on the next line is rebound at the
# end of this cell by `drive = GoogleDrive(gauth)` and is not used in between.
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Sign the current user in through Colab (presumably interactive -- TODO confirm).
auth.authenticate_user()
gauth = GoogleAuth()
# Give PyDrive the application-default credentials.
gauth.credentials = GoogleCredentials.get_application_default()
# From here on `drive` is the PyDrive GoogleDrive client, not the google.colab.drive module.
drive = GoogleDrive(gauth)

In [44]:
# Fetch the raw twitter dump from cwinsor's google drive onto the colab disk.

# Drive file ID of the raw/large .json file (original twitter data)
fid_rd = "1ABfAhA6T1F1qlRLV_f1uqbLQsQ5MrtyG"
# Local path that the drive content is copied to
local_filename = "covid_tweets_raw.json"

# Bind a PyDrive file handle to that ID, then pull its content to local disk
drive_file = drive.CreateFile({'id': fid_rd})
drive_file.GetContentFile(local_filename)

In [45]:
# Parse the downloaded file with ijson and collect the "full_text" of every
# tweet, each followed by a single space so the tweets can later be joined
# into one string.
raw_list = []
with open(local_filename, "rb") as f:
  # Empty prefix + multiple_values=True: iterate over the top-level JSON
  # values found in the file (see the ijson documentation for `items`).
  objects = ijson.items(f, "", multiple_values=True)
  # The loop variable is named `tweet` rather than `object` so the builtin
  # `object` is not overwritten in the notebook's global namespace.
  for tweet in objects:
    raw_list.append(tweet["full_text"] + " ")

print("number of tweets: ", len(raw_list))

number of tweets:  208960


In [52]:
# Eyeball the first 500 collected tweets, one print call per tweet.
preview = raw_list[:500]
for tweet in preview:
  print(tweet)

RT @SenTomCotton: The coronavirus has become a global pandemic, yet the Chinese Communist Party continues to play politics by blocking Taiw… 
RT @mediocregandhi: Massive respect for the medical team 
Loving my new ball stretcher https://t.co/PGr3OuLglO 
完全にビクター･ザーズにハマりました😇😇😇 
RT @ThobeArsenal: Sweden confirmed its first case of novel #coronavirus in Jönköping, central-southern Sweden on Fri, according to the Swed… 
Normala Sister talkin of viability gap funding , PPP for creating more hospitals , medical backbone 

But Government of India 🇮🇳 asking @adgpi &amp; sister orga with scarce resources to prepare to battle #coronavirus 

#Budget2020 

@fayedsouza 10s of orgs set up for emergencies 🤣😭 
RT @ShefVaidya: Can’t believe this guy is the president of Pakistan, telling the poor Pakistani students stuck in #Wuhan to fend for themse… 
RT @meyer_lucas: Dr. Chris Mackie says she wore masks from Wuhan to Canada, and went straight into home isolation even before she showed sy… 
* Oh Everythi

In [46]:
# Keep only the first 1/40th of the tweets; the aim is to end up with about
# 50k words of filtered data (dividing by 20 instead keeps twice as many).
total_tweets = len(raw_list)
cutoff = total_tweets // 40
print("number of tweets after cutoff", cutoff)

# Concatenate the kept tweets into one string
kept_tweets = raw_list[:cutoff]
raw_data = ''.join(kept_tweets)
print("number of characters after cutoff: ", len(raw_data))

number of tweets after cutoff 5224
number of characters after cutoff:  696141


In [47]:
# Write the truncated tweet text to a colab-local file.
# Note: `local_filename` is rebound here from the raw .json path to the
# preprocessed .txt path; the upload cell reads this new value.
local_filename = "covid_tweets_preprocessed.txt"
# The encoding is explicit because the tweets contain non-ASCII text (emoji,
# CJK characters), so the result does not depend on the platform default.
# `with` closes the handle even if the write raises.
with open(local_filename, "w", encoding="utf-8") as f:
  f.write(raw_data)

In [48]:
# Push the preprocessed twitter data file into the google drive folder.

# Drive ID of the destination folder (used below as the new file's parent)
folder_id_wt = "1tDPMhQoyxrW01Evcrt3f6l_zDOWhGBa8"

# Parent link that places the new drive file inside that folder
parent_link = {"kind": "drive#fileLink", "id": folder_id_wt}
f = drive.CreateFile({"parents": [parent_link]})
# Attach the local file's content, then send it to drive
f.SetContentFile(local_filename)
f.Upload()
print("done")

done
