In [156]:
import internetarchive
import os
import tarfile
import csv
from datetime import datetime
from tqdm.notebook import tqdm
import pandas as pd

In [186]:
def control_op(control_file):
    """Return the rows of a CSV control file, creating the file first if needed.

    When ``control_file`` does not exist it is created containing only the
    ``name,datetime`` header row and a "created" message is printed.

    Parameters
    ----------
    control_file : str
        Path of the CSV control file.

    Returns
    -------
    list of dict
        One mapping per data row, keyed by the CSV header; an empty list
        when the file holds only the header.
    """
    header = ['name','datetime']

    # First use: lay down a control file that holds just the header row.
    if not os.path.exists(control_file):
        with open(control_file, 'w', newline='') as new_csv:
            csv.DictWriter(new_csv, fieldnames=header).writeheader()
        print("{} created.".format(control_file))

    # Load every recorded row; the ``with`` block closes the handle on exit.
    with open(control_file, 'r', newline='') as existing_csv:
        return list(csv.DictReader(existing_csv))

In [222]:
def check_folder(name_folder):
    """Return the path of ``name_folder`` under the current working directory.

    The folder is created (with any missing parents) when it does not exist,
    and a "created" message is printed in that case.

    Parameters
    ----------
    name_folder : str
        Folder name, relative to the current working directory.

    Returns
    -------
    str
        ``<cwd>/<name_folder>``, joined with a forward slash.
    """
    # os.getcwd() replaces the IPython-only ``%pwd`` magic, so this function is
    # plain Python and also works when imported from a module or script.
    # The '/' join is kept as-is because callers append further '/...' parts.
    path = os.path.expanduser(os.getcwd() + f'/{name_folder}')

    if not os.path.exists(path):
        # exist_ok covers the folder appearing between the check and the call.
        os.makedirs(path, exist_ok=True)
        print("{} created.".format(path))

    return path

In [203]:
def get_file_name(item_name, ext='*tar'):
    """Collect the names of an item's files that match a glob pattern.

    Parameters
    ----------
    item_name : str
        Identifier passed to ``internetarchive.get_files``.
    ext : str, optional
        Glob pattern forwarded as ``glob_pattern`` (default ``'*tar'``).

    Returns
    -------
    list
        The ``name`` attribute of every file object yielded by the lookup.
    """
    matching_files = internetarchive.get_files(item_name, glob_pattern=ext)

    names = []
    for archive_file in matching_files:
        names.append(archive_file.name)

    return names

In [228]:
def download_file(item_name, file_name, download_folder = 'Downloads'):
    """Download one file of an Internet Archive item, at most once.

    A CSV control file (``control_download.csv`` inside the download folder)
    records every file already fetched. A file listed there is not downloaded
    again; otherwise it is downloaded and then appended to the control file.

    Parameters
    ----------
    item_name : str
        Identifier of the Internet Archive item.
    file_name : str
        Name of the file inside the item.
    download_folder : str, optional
        Folder (relative to the working directory) that receives the file.

    Returns
    -------
    str
        ``<download folder>/<file_name>``.
    """
    path = check_folder(download_folder)
    path_file = path + f'/{file_name}'

    # Rows already recorded by earlier runs (the file is created if missing).
    control_path = path + '/control_download.csv'
    reader_data = control_op(control_path)

    # Already logged: skip both the item lookup and the download.
    if any(row['name'] == file_name for row in reader_data):
        return path_file

    item = internetarchive.get_item(item_name)
    item.download(
        destdir=path,  # The directory to download files to
        ignore_existing=True,  # Skip files that already exist locally
        checksum=True,  # Skip files based on checksum
        verbose=True,  # Print progress to stdout
        retries=100,  # The number of times to retry on failed requests
        no_directory=True,  # Download without the identifier sub-folder
        files = file_name)

    # Log the file only after the download call has returned. The logged name
    # is `file_name`; the previous code referenced an undefined name here and
    # raised NameError before anything could be written.
    with open(control_path, 'a', newline='') as control_download:
        writer = csv.DictWriter(control_download, fieldnames=['name','datetime'])
        writer.writerow({'name' : file_name,
                         'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')})

    return path_file

In [230]:
def tar_file(tar_file, file_name, extration_folder='Extration'):
    """Extract one member of an open tar archive, at most once.

    A CSV control file (``control_extraction.csv`` inside the extraction
    folder) lists the members already extracted. A member found there is
    skipped; otherwise it is extracted and appended to the control file.

    Parameters
    ----------
    tar_file : object
        Open archive; only its ``extract(name, path=...)`` method is used.
    file_name : str
        Name of the member to extract.
    extration_folder : str, optional
        Folder (relative to the working directory) that receives the member.

    Returns
    -------
    str
        ``<extraction folder>/<file_name>``.
    """
    destination = check_folder(extration_folder)
    log_path = destination + '/control_extraction.csv'

    # Has an earlier run already logged this member?
    already_extracted = False
    for entry in control_op(log_path):
        if entry['name'] == file_name:
            already_extracted = True
            break

    with open(log_path, 'a', newline='') as log_csv:
        if not already_extracted:
            tar_file.extract(file_name, path=destination)

            # Record the member together with the time it was extracted.
            stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            csv.DictWriter(log_csv, fieldnames=['name','datetime']).writerow(
                {'name' : file_name, 'datetime': stamp})

    return destination + f'/{file_name}'

In [299]:
def get_data(file_path, word_bag = False, lang = 'en'):
    """Load a gzip-compressed JSON-lines tweet file and filter it.

    Parameters
    ----------
    file_path : str
        Path of the ``.json.gz`` file, one JSON document per line.
    word_bag : list of str, str, or falsy, optional
        Keywords to look for in the tweet text. The keywords are joined with
        ``|`` and matched case-insensitively as a regular expression, so
        regex metacharacters keep their meaning. A single string is treated
        as one keyword. ``False`` (default), ``None`` or an empty list
        disables keyword filtering.
    lang : str, optional
        Value the ``lang`` column must equal (default ``'en'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``created_at``, ``text`` and ``entities`` of the rows that
        pass the filters, with the original index labels.
    """
    tweets = pd.read_json(file_path, lines=True, compression='gzip')

    # Keep only the requested language and the columns used downstream.
    tweets = tweets.loc[tweets['lang'] == lang, ['created_at', 'text', 'entities']]

    # Truthiness test: None and [] no longer reach the join below.
    if word_bag:
        # A bare string would otherwise be joined character by character.
        if isinstance(word_bag, str):
            word_bag = [word_bag]
        pattern = '|'.join(word_bag)
        # na=False: rows with missing text count as "no match" instead of
        # putting NaN in the mask, which boolean indexing rejects.
        tweets = tweets[tweets['text'].str.contains(pattern, case=False, na=False)]

    return tweets

In [None]:
item_name = 'archiveteam-twitter-stream-2022-10'

# Names of the archives published under the item (pass ext=... for another glob).
file_names = get_file_name(item_name = item_name)

# Number of archives to process while testing; set to None to process them all.
# This replaces the `x` counter, whose `x =+ 1` assigned +1 instead of incrementing.
max_files = 1

for dlf in tqdm(file_names[:max_files], desc='Download progress'):

    # Download the archive (skipped when the control file already lists it).
    file = download_file(item_name = item_name, file_name = dlf)

    # The context manager closes the archive even if an extraction fails.
    with tarfile.open(file, "r") as tar:

        # Only the gzip-compressed JSON members are of interest.
        tar_file_names = [t for t in tar.getnames() if t.endswith('.json.gz')]

        for tf in tqdm(tar_file_names, desc='Unzip progress'):
            js = tar_file(tar_file = tar, file_name = tf)

In [301]:
# Extracted tweet file to inspect. The path is built from the working directory,
# which is where check_folder() roots the 'Extration' folder that tar_file()
# writes into, instead of hard-coding one machine's absolute path.
file_path = os.path.join(os.getcwd(), 'Extration', '20221001', '20221001000000.json.gz')

# Keywords for the optional subject filter (not applied in the call below).
bow = ['vaccine', ' vac ' ]


df = get_data(file_path = file_path)#, word_bag = bow)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 929 entries, 0 to 3005
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   created_at  929 non-null    datetime64[ns, UTC]
 1   text        929 non-null    object             
 2   entities    929 non-null    object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 29.0+ KB


In [259]:
# Regex alternation that matches any of the subject keywords.
bow = '|'.join(['vaccine', ' vac ' ])

# Keep the tweets whose text contains a keyword, ignoring case.
df_f = df.loc[df['text'].str.contains(bow, case=False)]
df_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 244 to 2267
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   created_at  4 non-null      datetime64[ns, UTC]
 1   text        4 non-null      object             
dtypes: datetime64[ns, UTC](1), object(1)
memory usage: 96.0+ bytes


In [293]:
# Entities of the tweet whose index label is 1760.
df.loc[1760, 'entities']

{'hashtags': [{'text': 'LISA', 'indices': [33, 38]},
  {'text': 'MONEY', 'indices': [39, 45]},
  {'text': 'LALISA', 'indices': [46, 53]}],
 'urls': [],
 'user_mentions': [{'screen_name': 'TheManoban_Walk',
   'name': "💍 Manoban's walk",
   'id': 1468880172199800832,
   'id_str': '1468880172199800832',
   'indices': [3, 19]}],
 'symbols': [],
 'media': [{'id': 1575793530974666752,
   'id_str': '1575793530974666752',
   'indices': [54, 77],
   'media_url': 'http://pbs.twimg.com/media/Fd5X3hUagAAJjZo.jpg',
   'media_url_https': 'https://pbs.twimg.com/media/Fd5X3hUagAAJjZo.jpg',
   'url': 'https://t.co/iFFsTyTSke',
   'display_url': 'pic.twitter.com/iFFsTyTSke',
   'expanded_url': 'https://twitter.com/TheManoban_Walk/status/1575793533679591424/photo/1',
   'type': 'photo',
   'sizes': {'large': {'w': 772, 'h': 1114, 'resize': 'fit'},
    'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
    'small': {'w': 471, 'h': 680, 'resize': 'fit'},
    'medium': {'w': 772, 'h': 1114, 'resize': 'fit'}}

In [294]:
# Text of the same tweet (index label 1760).
df.loc[1760, 'text']

'RT @TheManoban_Walk: Oh my.. 😳💖\n\n#LISA #MONEY #LALISA https://t.co/iFFsTyTSke'