In [43]:
import pandas as pd
import numpy as np
import os
import ujson
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [10]:
# Notebook setup: load the lab_black extension (Black formatting for code cells)
# and render matplotlib figures inline in the cell output.
%load_ext lab_black
%matplotlib inline

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [45]:
class PreprocessTwitter:
    """Locate, load and clean the tweet JSON files stored under ``$SCRATCH``.

    On construction the input folder is scanned once and the JSON file paths
    are grouped by month folder (``"2020-01"``, ``"2020-02"``, ...).

    Attributes:
        base_path: value of the ``SCRATCH`` environment variable.
        input_folder_path: folder that holds one sub-folder per month.
        output_folder_path: folder intended for processed output.
        tweets_filepath_set: dict mapping month folder name -> sorted list of
            JSON file paths found in that folder.
        sample_json_file_path: first JSON file of ``"2020-01"``, or ``None``
            when that month is missing or empty.
        tweet_columns: list initialised empty; not used by the methods below.
    """

    # Lazily-built cache of the NLTK English stop-word list, shared by all
    # instances so the corpus file is read once instead of once per token.
    _english_stopwords = None

    def __init__(self):
        """Resolve the dataset folders from ``$SCRATCH`` and index the input files.

        Raises:
            KeyError: if the ``SCRATCH`` environment variable is not set.
            OSError: if the input folder cannot be listed.
        """
        self.base_path = os.environ["SCRATCH"]

        input_folder = "covid-map/twitter-dataset-covid-all"
        self.input_folder_path = os.path.join(self.base_path, input_folder)

        output_folder = "covid-map/twitter-dataset-processed-1"
        self.output_folder_path = os.path.join(self.base_path, output_folder)

        # JSON file paths grouped by month folder
        self.tweets_filepath_set = self._read_dirs(self.input_folder_path)

        # Do not crash the constructor when January is absent or empty;
        # callers can test for None before using the sample path.
        january_files = self.tweets_filepath_set.get("2020-01") or []
        self.sample_json_file_path = january_files[0] if january_files else None

        self.tweet_columns = []

    def _read_dirs(self, input_path):
        """Collect the JSON file paths under each ``2020*`` month folder.

        Args:
            input_path: directory containing one sub-directory per month.

        Returns:
            dict mapping month folder name -> list of JSON file paths. Both
            the months and the paths are sorted, so the result does not depend
            on the arbitrary order returned by ``os.listdir``.
        """
        tweets_file_set = {}
        for month_folder in sorted(os.listdir(input_path)):
            if not month_folder.startswith("2020") or month_folder.endswith(".zip"):
                continue
            month_folder_path = os.path.join(input_path, month_folder)
            # A stray regular file named "2020..." would make listdir raise.
            if not os.path.isdir(month_folder_path):
                continue
            tweets_file_set[month_folder] = [
                os.path.join(month_folder_path, tweets_file)
                for tweets_file in sorted(os.listdir(month_folder_path))
                # names containing ")" are duplicated downloads, e.g. "x (1).json"
                if tweets_file.endswith("json") and ")" not in tweets_file
            ]

        print("filepath:", tweets_file_set.keys())
        return tweets_file_set

    def _stopword_set(self):
        """Return the English stop words as a frozenset, loading them once."""
        cls = type(self)
        if cls._english_stopwords is None:
            cls._english_stopwords = frozenset(stopwords.words("english"))
        return cls._english_stopwords

    def _clean_text(self, text):
        """Normalise one tweet text into a line of space-separated tokens.

        Steps, in order: lower-case; drop URLs and ``@user`` mentions; keep
        hashtag words without the ``#``; strip ASCII punctuation; drop digits;
        drop the substrings ``corona``/``covid``/``virus``; tokenise; drop
        English stop words.

        URLs, mentions and hashtags are handled *before* punctuation is
        stripped: once ``:``, ``/``, ``.``, ``@`` and ``#`` are gone the
        patterns that look for them can no longer match.

        Args:
            text: raw tweet text; ``None`` or ``""`` is treated as empty.

        Returns:
            The cleaned tokens joined by single spaces, terminated by ``"\\n"``.
        """
        if not text:
            return "\n"

        cleaned = text.lower()
        # remove URLs ("http\S+" also catches truncated links such as "https:/…")
        cleaned = re.sub(r"(www\.\S+)|(http\S+)", "", cleaned)
        # remove usernames
        cleaned = re.sub(r"@\S+", "", cleaned)
        # remove the # in #hashtag
        cleaned = re.sub(r"#(\S+)", r"\1", cleaned)
        # remove ASCII punctuation
        cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
        # remove numbers: standalone ones become a space, leftover digits vanish
        cleaned = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", cleaned)
        cleaned = re.sub(r"\d", "", cleaned)
        # remove the search keywords every tweet in this dataset shares
        cleaned = re.sub(r"(corona|covid|virus)", "", cleaned)

        stop_words = self._stopword_set()
        tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]
        return " ".join(tokens) + "\n"

    def tweets_filter(self, json_obj):
        """Keep only tweets that carry a county id.

        Args:
            json_obj: decoded tweet (dict-like, must support ``.get``).

        Returns:
            ``json_obj`` itself when its ``"CountyId"`` is present and not
            ``None``; otherwise ``None``.
        """
        if json_obj.get("CountyId") is None:
            return None
        return json_obj

    def read_one_json(self, json_path):
        """Load one tweet file and run it through :meth:`tweets_filter`.

        NOTE(review): assumes each file holds a single JSON object (one
        tweet) rather than a list of tweets - confirm against the dataset.

        Args:
            json_path: path of the JSON file to read.

        Returns:
            The decoded tweet, or ``None`` when the filter rejects it.
        """
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(json_path, "r", encoding="utf-8") as j:
            json_obj = ujson.load(j)
        return self.tweets_filter(json_obj)

    def read_all_json(self, json_path_list):
        """Load every file in ``json_path_list`` and keep the accepted tweets.

        Args:
            json_path_list: iterable of JSON file paths.

        Returns:
            list of decoded tweets that passed :meth:`tweets_filter`, in
            input order.
        """
        print("readling all")
        tweet_list = []
        for json_path in json_path_list:
            tweet = self.read_one_json(json_path)
            if tweet is not None:
                tweet_list.append(tweet)
        return tweet_list

In [39]:
# Build the preprocessor (its constructor scans the input folder) and show
# how many month folders were indexed.
PT = PreprocessTwitter()
len(PT.tweets_filepath_set)

filepath: dict_keys(['2020-01', '2020-04', '2020-03', '2020-02'])


4

In [40]:
# Path of the sample JSON file chosen by the constructor (first file of "2020-01").
sample = PT.sample_json_file_path

In [41]:
# Run the single-file loader on the sample path.
PT.read_one_json(sample)
"
created_at
id
full_text
entities
retweet_count
favorite_count
CountyId

name
followers_count
friends_count
listed_count
favourites_count

{'created_at': 'Thu Jan 30 17:04:30 +0000 2020',
 'id': 1222928586493087744,
 'id_str': '1222928586493087744',
 'full_text': 'RT @WHO: @DrTedros @WHOWPRO @WHOSEARO @WHO_Europe @pahowho @WHOEMRO @WHOAFRO Q: How does the International Health Regulations Emergency Com…',
 'truncated': False,
 'display_text_range': [0, 140],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'WHO',
    'name': 'World Health Organization (WHO)',
    'id': 14499829,
    'id_str': '14499829',
    'indices': [3, 7]},
   {'screen_name': 'DrTedros',
    'name': 'Tedros Adhanom Ghebreyesus',
    'id': 189868631,
    'id_str': '189868631',
    'indices': [9, 18]},
   {'screen_name': 'WHOWPRO',
    'name': 'World Health Organization Western Pacific',
    'id': 3794682452,
    'id_str': '3794682452',
    'indices': [19, 27]},
   {'screen_name': 'WHOSEARO',
    'name': 'WHO South-East Asia',
    'id': 1545915336,
    'id_str': '1545915336',
    'indices': [28, 37]},
   {'screen_name': 