In [3]:
import pandas as pd
import numpy as np
import os
import ujson
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

In [4]:
# Notebook setup: load the lab_black extension and render matplotlib figures inline.
%load_ext lab_black
%matplotlib inline

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [5]:
class ResampleTwitter:
    """Merge per-file tweet CSVs and re-split them into fixed-length time windows.

    Input CSVs are read from ``<base_path>/covid-map/twitter-dataset-processed-1``
    and one CSV per window is written to
    ``<base_path>/covid-map/twitter-dataset-processed-2``.
    """

    def __init__(self, base_path=None):
        """Resolve the input/output locations and list the input CSV files.

        Parameters
        ----------
        base_path : str, optional
            Root directory containing the ``covid-map`` folder. Defaults to the
            ``SCRATCH`` environment variable.

        Raises
        ------
        KeyError
            If ``base_path`` is not given and ``SCRATCH`` is not set.
        FileNotFoundError
            If the input folder does not exist.
        """
        self.base_path = os.environ["SCRATCH"] if base_path is None else base_path

        input_folder = "covid-map/twitter-dataset-processed-1"
        self.input_folder_path = os.path.join(self.base_path, input_folder)

        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the concatenated frame is reproducible between runs.
        self.input_df_path = [
            os.path.join(self.input_folder_path, x)
            for x in sorted(os.listdir(self.input_folder_path))
            if x.endswith(".csv")
        ]

        output_folder = "covid-map/twitter-dataset-processed-2"
        self.output_folder_path = os.path.join(self.base_path, output_folder)

        self.sample_df_path = os.path.join(self.base_path, "covid-map/concated_df.csv")

    def get_df(self, df_path):
        """Read one CSV and return it indexed by its parsed ``created_at`` column.

        Parameters
        ----------
        df_path : str
            Path of a CSV file that has a ``created_at`` column.

        Returns
        -------
        pandas.DataFrame
            The file's rows with ``created_at`` converted to datetimes and used
            as the index. Duplicate rows are kept.
        """
        # Only "\n" ends a record, matching how the other cells read this data.
        df = pd.read_csv(df_path, lineterminator="\n")
        # Vectorized parse instead of a per-row apply(lambda ...).
        df["created_at"] = pd.to_datetime(df["created_at"])
        return df.set_index("created_at")

    def split_windows(self, df, freq="5D"):
        """Write one CSV per resampling window of ``df`` into the output folder.

        Each file is named after the window's start date (``YYYY-MM-DD.csv``).

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a datetime index, e.g. the result of ``start_all``.
        freq : str, optional
            Resampling frequency passed to ``DataFrame.resample``; defaults to
            the original five-day window.
        """
        print("RESAMPLING")
        if df.empty:
            # Nothing to split; an empty frame has no datetime index to resample.
            return
        # The output folder is not guaranteed to exist on a fresh run.
        os.makedirs(self.output_folder_path, exist_ok=True)
        for window_start, window_df in df.resample(freq):
            output_filename = str(window_start.date()) + ".csv"
            output_path = os.path.join(self.output_folder_path, output_filename)
            print(output_path, window_df.shape)
            self.write_to_csv(window_df, output_path)

    def write_to_csv(self, df, output_path):
        """Write ``df`` to ``output_path`` with its index stored as ``created_at``."""
        # `index` is a boolean flag; the column header is set via `index_label`,
        # so the timestamp column is labelled even if the index name was lost.
        df.to_csv(output_path, index=True, index_label="created_at")

    def start_one(self, df_path):
        """Load a single input CSV (thin wrapper around ``get_df``)."""
        return self.get_df(df_path)

    def start_all(self, df_path_list):
        """Load every CSV in ``df_path_list`` and return them as one frame.

        Parameters
        ----------
        df_path_list : list of str
            Paths of the CSV files to load, in the order they should appear.

        Returns
        -------
        pandas.DataFrame
            All rows of all files, indexed by ``created_at``; an empty frame
            when ``df_path_list`` is empty.
        """
        nums = len(df_path_list)
        # Collect the pieces and concatenate once: calling pd.concat inside the
        # loop re-copies all previously loaded rows on every iteration.
        frames = [self.start_one(df_path) for df_path in tqdm(df_path_list, total=nums)]
        concated_df = pd.concat(frames) if frames else pd.DataFrame()
        print("concat done.")
        return concated_df

In [6]:
# Build the resampler (paths are resolved under $SCRATCH) and load every input
# CSV it found into a single DataFrame indexed by `created_at`.
RT = ResampleTwitter()
concated_df = RT.start_all(RT.input_df_path)

100%|██████████| 1114/1114 [38:30<00:00,  2.07s/it]

concat done.





In [7]:
# Split the combined frame into 5-day windows and write one CSV per window,
# named after the window's start date, into the output folder.
RT.split_windows(concated_df)

RESAMPLING
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-01-21.csv (165527, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-01-26.csv (579410, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-01-31.csv (263968, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-02-05.csv (141279, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-02-10.csv (150162, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-02-15.csv (158341, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-02-20.csv (104271, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-02-25.csv (355454, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-03-01.csv (1047835, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-03-06.csv (531521, 14)
/scratch/user/diya.li/covid-map/twitter-dataset-processed-2/2020-03-11.csv (532746, 14)
/scratch/user/diya.l

In [8]:
# Sanity check: (rows, columns) of the combined frame.
concated_df.shape

(5296400, 14)

In [62]:
# Spot-check one written window file. The path is built from the resampler's own
# output folder instead of re-typing it, and the file is read with the same
# lineterminator that ResampleTwitter.get_df uses for this tweet data.
# NOTE(review): the windows printed by the resampling cell start on 2020-01-21,
# 2020-01-26, 2020-01-31, ...; confirm 2020-01-30.csv comes from the intended run.
temp_df = pd.read_csv(
    os.path.join(RT.output_folder_path, "2020-01-30.csv"),
    lineterminator="\n",
)

In [63]:
# Preview the first rows of the window file loaded into `temp_df`.
temp_df.head()

Unnamed: 0,created_at,id,full_text,cleaned_text,entities,retweet_count,favorite_count,CountyId,user_name,user_followers_count,user_friends_count,user_listed_count,favourites_count,user_location,geo
0,2020-01-30 17:00:00+00:00,1222927452722233344,🎙 @DrNinaRadcliff broke down #Coronavirus for ...,🎙 drninaradcliff broke jrzyjoepiscopo morning ...,"{'hashtags': [{'text': 'Coronavirus', 'indices...",1,15,36061,The Joe Piscopo Show,4980,121,23,2458,"New York, NY",
1,2020-01-30 17:00:00+00:00,1222927453141618690,"Images of sick, suffering animals in markets h...",images sick suffering animals markets created ...,"{'hashtags': [], 'symbols': [], 'user_mentions...",9,29,11001,National Geographic Magazine,421444,784,3373,2736,"Washington, DC",
2,2020-01-30 17:00:00+00:00,1222927453334564874,So many suffer #coronavirus #supplychain #logi...,many suffer supplychain logistics risk apple c...,"{'hashtags': [{'text': 'coronavirus', 'indices...",0,0,4013,Rob Morris,122,251,1,46,"Gilbert, AZ",
3,2020-01-30 17:00:00+00:00,1222927453976330243,Do masks offer protection from new coronavirus...,masks offer protection new depends\n,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1,35047,FOX5 Las Vegas,246132,573,1450,10236,Las Vegas,
4,2020-01-30 17:00:00+00:00,1222927454135676929,"In this week's #Opinion piece, News Editor Mat...",weeks opinion piece news editor matthew knott ...,"{'hashtags': [{'text': 'Opinion', 'indices': [...",0,0,48267,StudyTravel Ltd,6397,1642,102,1511,"London, England",
