In [2]:
import gzip
from io import StringIO
import pandas as pd
from datetime import datetime, timedelta
import re
import string

In [3]:
def gzip_file_to_documents_list(file_name, languages_filter=['en']):
    """Load the texts of the selected languages from a gzip-compressed CSV file.

    The CSV is parsed with its first column as the index and must contain a
    ``language`` column and a ``text`` column.

    Args:
        file_name: Path of the gzip-compressed, UTF-8 encoded CSV file.
        languages_filter: Collection of ``language`` values to keep. The
            default list is only read, never modified.

    Returns:
        The ``text`` values of the rows whose ``language`` is in
        ``languages_filter``, as a list in file order.
    """
    # Decode while decompressing instead of materialising the whole archive as
    # bytes, then as a second str copy, then as a StringIO buffer.
    # newline='' disables newline translation so line breaks embedded in
    # quoted fields reach the parser unchanged.
    with gzip.open(file_name, 'rt', encoding='utf-8', newline='') as f:
        df_data = pd.read_csv(f, index_col=0)
    language_mask = df_data['language'].isin(languages_filter)
    return df_data.loc[language_mask, 'text'].tolist()

In [4]:
def uctd_file_name_by_date(month, day):
    """Build the daily archive file name from a month and a day number.

    Each number below ten gets a single leading zero, and the two parts are
    followed by the fixed ``_UkraineCombinedTweetsDeduped.csv.gzip`` suffix.
    """
    def two_digits(number):
        # Prefix one zero when the value is below ten, nothing otherwise.
        return ('0' if number < 10 else '') + str(number)

    return two_digits(month) + two_digits(day) + "_UkraineCombinedTweetsDeduped.csv.gzip"

In [5]:
def get_uctd_documents_between_dates(start: str, end: str,
                                     languages_filter=['en'], verbose: int = 0):
    """Collect the documents of every daily file from ``start`` to ``end``.

    Args:
        start: First day of the range, formatted as 'YYYY-MM-DD'.
        end: Last day of the range (inclusive), formatted as 'YYYY-MM-DD'.
        languages_filter: Forwarded unchanged to
            ``gzip_file_to_documents_list``.
        verbose: ``0`` is silent; ``1`` or higher prints one line per
            processed day.

    Returns:
        One flat list with the documents of all days, in chronological order.
        The list is empty when ``end`` is earlier than ``start``.

    Raises:
        ValueError: If ``start`` or ``end`` does not match 'YYYY-MM-DD'
            (raised by ``datetime.strptime``).
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    delta_days = date_end - date_start
    all_documents = []
    # +1 so that the end date itself is part of the range.
    for i_d in range(delta_days.days + 1):
        date_current = date_start + timedelta(days=i_d)
        # Only month and day go into the file name; the year is not used to
        # locate the file.
        uctd_file_name = uctd_file_name_by_date(date_current.month, date_current.day)
        all_documents.extend(
            gzip_file_to_documents_list(uctd_file_name, languages_filter=languages_filter)
        )
        # Any positive verbosity level enables the per-day progress line.
        if verbose >= 1:
            print(f'--Documents for the day {date_current.date()} processed')
    return all_documents

In [6]:
# Load the documents for 19-23 August 2023 (default language filter),
# printing one progress line per processed day.
all_documents = get_uctd_documents_between_dates(
    start='2023-08-19',
    end='2023-08-23',
    verbose=1,
)

--Documents for the day 2023-08-19 processed
--Documents for the day 2023-08-20 processed
--Documents for the day 2023-08-21 processed
--Documents for the day 2023-08-22 processed
--Documents for the day 2023-08-23 processed


In [7]:
# Peek at the first loaded document to see what the raw text looks like.
all_documents[0]

'Dear vaccine advocate\n\nDo take the COVID19 mRNA shot and boosters, but do know that @OurWorldInData data shows it offers zero protection, actually accelerates death of vaccinated.\n\nRegards\n#Pfizer #AstraZeneca #Moderna #NWO #Agenda2030 #COP27 #Biden #Obama #Trudeau #Jacinda #life https://t.co/VTbfuqiDvu'

In [41]:
def remove_links_content(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    The whitespace around a removed link is left untouched.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", text)

def remove_emails(text):
    """Remove e-mail-like tokens from ``text``.

    A token is removed when it has at least one non-whitespace character
    before an ``@``; one whitespace character directly after the token is
    removed with it. Tokens that start with ``@`` right after whitespace or
    at the beginning of the text (e.g. mentions) are kept.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes.
    # In a plain literal they trigger an invalid-escape warning on recent
    # Python versions.
    return re.sub(r'\S+@\S*\s?', '', text)

def remove_punctuation(text):
    """Drop every character listed in ``string.punctuation`` from ``text``.

    Translation-table approach taken from https://stackoverflow.com/a/37221663
    """
    # The third argument of str.maketrans lists characters mapped to None,
    # i.e. deleted by translate().
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

def remove_multiple_space(text):
    """Collapse each run of two or more whitespace characters into one space.

    Newlines and tabs count as whitespace, so a blank line or a
    space-plus-newline sequence also becomes a single space. A lone
    whitespace character is left as it is.
    """
    # Raw string so that '\s' reaches the regex engine as written instead of
    # being an invalid Python string escape.
    return re.sub(r"\s\s+", " ", text)

In [None]:
def clean_document(document):
    """Apply the full cleaning pipeline to one document and return the result.

    Steps, in order: remove links, remove e-mail-like tokens, collapse
    whitespace, remove the trailing hashtags, remove punctuation, collapse
    whitespace again.
    """
    # Wrapped in a function: the bare statements had inconsistent indentation
    # and used `document` before any cell defined it, so the cell could not
    # run. The helpers are looked up when the function is called, not when it
    # is defined.
    document = remove_links_content(document)
    document = remove_emails(document)
    document = remove_multiple_space(document)
    document = remove_hashtags(document)
    document = remove_punctuation(document)
    document = remove_multiple_space(document)
    return document

In [48]:
def remove_hashtags(text):
    """Remove the run of hashtags that closes ``text``.

    A hashtag is removed only when it is preceded by a whitespace character
    and followed by nothing but optional whitespace up to the end of the
    text. Removal repeats until the text no longer ends with such a hashtag.
    The whitespace in front of the first removed hashtag is kept, and
    hashtags in the middle of the text or at its very start are untouched.
    """
    # Raw string so '\s' / '\S' are regex escapes rather than invalid Python
    # string escapes. '\s' already covers '\n', so no separate newline class
    # is needed in the lookbehind.
    trailing_hashtag = re.compile(r'(?<=\s)#\S+\s*$')
    while True:
        stripped = trailing_hashtag.sub('', text)
        if stripped == text:
            # Nothing was removed in this pass: the text is stable.
            return text
        text = stripped

In [23]:
# Show the first ten documents to get a feel for the raw texts.
all_documents[:10]

['Dear vaccine advocate\n\nDo take the COVID19 mRNA shot and boosters, but do know that @OurWorldInData data shows it offers zero protection, actually accelerates death of vaccinated.\n\nRegards\n#Pfizer #AstraZeneca #Moderna #NWO #Agenda2030 #COP27 #Biden #Obama #Trudeau #Jacinda #life https://t.co/VTbfuqiDvu',
 'Animal shelter Dogs and Cats, we need your help!\nRaising funds food for animals.\nPayPal: dogandcat.helper@gmail.com\nhttps://t.co/Z3re0ItTfy\nhttps://t.co/I9dbwRrtg0\nhttps://t.co/71pErM8xBZ\n\n#Ukraine #Patreon #dogsoftwitter #Shelter #Dogs #Cats #Cute #Pets #Funny\n#Dogsarefamily https://t.co/HLEnTp9yk7',
 'Welcome to our shelter!\nLocated in Ukraine, Kyiv\nOur shelter needs your help!\nRaising funds food for animals.\nPayPal: dogandcat.helper@gmail.com\nhttps://t.co/RH0peqvaXT\nhttps://t.co/rTtTVpoCi1\n\n#Ukraine #Kyiv #Shelter #Dogs #Cats #Pets #DogsofTwittter #patreoncreator #Patreon https://t.co/rRWH17R813',
 '👇 Good news you may have missed: First @WFP shipment of wh

In [64]:
# Walk one sample document through the cleaning stages, printing the text
# after each stage with a dashed separator in between.
document = all_documents[13]
print(document, '-----', sep='\n')

# Same order as before: e-mails first, then links, then whitespace collapsing.
document = remove_emails(document)
document = remove_links_content(document)
document = remove_multiple_space(document)
print(document, '-----', sep='\n')

# Final stage is only displayed; `document` keeps the pre-hashtag-removal text.
print(remove_hashtags(document))

@RBReich Some day other countries will make really interesting movies about this country, and they won't have to make it up. 
#movie #usa #italy #france #russia
-----
@RBReich Some day other countries will make really interesting movies about this country, and they won't have to make it up. #movie #usa #italy #france #russia
-----
@RBReich Some day other countries will make really interesting movies about this country, and they won't have to make it up. 
