# Requirements

In [32]:
!pip install -q internetarchive

# Imports

In [33]:
import internetarchive
import os
import tarfile
import csv
from datetime import datetime
from tqdm.notebook import tqdm
import pandas as pd
import configparser
import json

import warnings

# Suppressing the warnings
warnings.filterwarnings('ignore') 

# Functions

## General

Mount your Google Drive so that the control files can be saved there and are not lost.

In [34]:
# Mount Google Drive at /content/drive so that paths below it can be used by later cells
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Set here the path, after '/content/drive/MyDrive/', where you want to save the control files in your Google Drive.

In [35]:
# Base folder inside the mounted Google Drive; the control files are created below it
g_path = '/content/drive/MyDrive/CCT_CA/Files'

This function is used for both control files and temporary files, so it has an option to pass the full path of the parent folder.

In [36]:
def check_folder(name_folder, complete_path = False):
    """Return the path of ``name_folder``, creating the folder when it is missing.

    Parameters
    ----------
    name_folder : str
        Name of the folder, appended to the parent directory.
    complete_path : str or False, optional
        Parent directory of ``name_folder``. When left as ``False`` (or
        ``None``) the current working directory is used.

    Returns
    -------
    str
        ``<parent>/<name_folder>`` with a leading ``~`` expanded.
    """
    # os.getcwd() replaces the IPython-only ``%pwd`` magic, which is a syntax
    # error as soon as this function is used outside a notebook cell.
    if complete_path is False or complete_path is None:
        parent = os.getcwd()
    else:
        parent = complete_path

    path = os.path.expanduser(f'{parent}/{name_folder}')

    #Creating folder if that doesn't exist
    if not os.path.exists(path):
        # exist_ok avoids a crash if the folder appears between the check and the creation
        os.makedirs(path, exist_ok=True)
        print("{} created.".format(path))

    return path

In [37]:
#This function lists the files (days) stored inside an item (month)
def get_file_name(item_name, ext='*tar'):
    """Return the names of the files of an Internet Archive item.

    Parameters
    ----------
    item_name : str
        Identifier of the item, passed to ``internetarchive.get_files``.
    ext : str, optional
        Glob pattern forwarded as ``glob_pattern``; defaults to ``'*tar'``.

    Returns
    -------
    list
        The ``name`` attribute of every file object returned for the item.
    """
    matching_files = internetarchive.get_files(item_name, glob_pattern=ext)

    names = []
    for archive_file in matching_files:
        names.append(archive_file.name)

    return names

In [38]:
#This function converts a size in bytes into a string that is easier to read
def humanize(size_bytes):
    """Format a byte count using binary units (B, KiB, MiB, GiB).

    Values below 1024 are shown as-is followed by ``B``; larger values are
    divided by the matching power of two and shown with two decimals.
    """
    kib, mib, gib = 1 << 10, 1 << 20, 1 << 30

    if size_bytes < kib:
        return '{} B'.format(size_bytes)

    # (exclusive upper bound, divisor, output template), smallest unit first
    scaled_units = (
        (mib, kib, '{:.2f} KiB'),
        (gib, mib, '{:.2f} MiB'),
    )
    for upper_bound, divisor, template in scaled_units:
        if size_bytes < upper_bound:
            return template.format(size_bytes/divisor)

    return '{:.2f} GiB'.format(size_bytes/gib)

In [39]:
#Creates a CSV file containing only the header row; the defaults describe the control file
def create_csv(file_path, file_name = 'control.csv', fieldnames=['name','datetime', 'type', 'size', 'count_total', 'count_filtered']):
    """Create ``file_path/file_name`` with a header row unless it already exists.

    Parameters
    ----------
    file_path : str
        Folder that holds the file.
    file_name : str, optional
        Name of the CSV file; defaults to ``'control.csv'``.
    fieldnames : list of str, optional
        Column names written as the header. The default list is only read,
        never modified.

    Returns
    -------
    str
        Path of the CSV file, whether it was just created or already there.
    """
    file = f"{file_path}/{file_name}"

    #if file doesn't exist I'll create it
    if not os.path.exists(file):
        # Explicit UTF-8 so the file does not depend on the platform locale.
        # The ``with`` block closes the file, so no extra close() is needed.
        with open(file, 'w', newline='', encoding='utf-8') as control_csv:
            csv.DictWriter(control_csv, fieldnames = fieldnames).writeheader()
        print("{} created.".format(file))

    return file

In [40]:
def read_csv(file_path):
    """Read a CSV file that has a header row and return its rows.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    list of dict
        One mapping per data row, keyed by the header names; an empty list
        when the file holds only the header.
    """
    #Reading the control file
    # The rows are materialised inside the ``with`` block; the block closes the
    # file, so the former seek(0)/close() calls had no effect and were dropped.
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

In [41]:
def insert_csv(csv_path, row, fieldnames=['name','datetime', 'type', 'size','count_total', 'count_filtered']):
    """Append one row to an existing CSV file.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; it is opened in append mode.
    row : dict
        Values keyed by column name. Columns of ``fieldnames`` missing from
        ``row`` are written as empty cells; keys that are not in
        ``fieldnames`` make ``csv.DictWriter`` raise ``ValueError``.
    fieldnames : list of str, optional
        Column order used to serialise ``row``. The default list is only
        read, never modified.

    Returns
    -------
    str
        ``csv_path``, unchanged.
    """
    #opening the file in append mode to add the row at the end
    # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
    # The ``with`` block closes the file, so no extra close() is needed.
    with open(csv_path, 'a', newline='', encoding='utf-8') as csv_file:
        csv.DictWriter(csv_file, fieldnames = fieldnames).writerow(row)

    return csv_path

In [42]:
#the rows to update are selected by the 'name' column by default
def update_csv(csv_path, val_filter, col_update, val_update, col_filter = 'name'):
  """Set ``col_update`` to ``val_update`` on every row matching a filter.

  The whole file is loaded, modified and written back to ``csv_path``.

  Parameters
  ----------
  csv_path : str
      Path of the CSV file to rewrite.
  val_filter : object
      Value looked up in ``col_filter``; it is compared as text.
  col_update : str
      Column that receives ``val_update``.
  val_update : object
      New value for the matching rows.
  col_filter : str, optional
      Column used to select the rows; defaults to ``'name'``.

  Returns
  -------
  str
      ``csv_path``, unchanged.
  """
  # Every cell is kept as the text found in the file: with pandas' default
  # type inference an empty count column is read as float NaN, so an integer
  # written into it came back out as "5.0", and empty cells turned into NaN.
  df = pd.read_csv(csv_path, dtype=object, keep_default_na=False)

  # cells are text at this point, so the filter value is compared as text too
  matches = df[col_filter] == str(val_filter)
  df.loc[matches, col_update] = val_update

  df.to_csv(csv_path, index=False)

  return csv_path


In [43]:
#this function filters the data, using the bag of words that you select for your assessment
def filter_data(file_path, file_name, word_bag = False, lang = False, control_path = False, get_col = ['created_at', 'text', 'entities']):
    """Load a gzip-compressed JSON-lines file and filter its records.

    Parameters
    ----------
    file_path : str
        Path of the ``.json.gz`` file (one JSON document per line).
    file_name : str
        Name under which the file is registered in the control file; only
        used when ``control_path`` is given.
    word_bag : list of str or False, optional
        Words searched in the ``text`` column, case-insensitively. They are
        joined with ``|`` and used as a regular expression, so regex
        metacharacters in a word keep their special meaning.
    lang : str or False, optional
        When given, only rows whose ``lang`` column equals it are kept.
    control_path : str or False, optional
        Control CSV that receives ``count_total`` and, when ``word_bag`` is
        given, ``count_filtered`` for ``file_name``.
    get_col : list of str, optional
        Columns kept in the result. The default list is only read.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, restricted to ``get_col``.
    """
    track_counts = control_path is not False and control_path is not None

    #reading json
    df = pd.read_json(file_path, lines=True, compression='gzip')

    #updating total tweets
    if track_counts:
        update_csv(csv_path = control_path, val_filter = file_name, col_update = 'count_total', val_update = len(df))

    #Filtering language if a language was sent
    if lang is not False and lang is not None:
        df = df[df['lang'] == lang]

    #taking away columns unnecessary
    df = df.loc[:, get_col]

    #filtering Tweets using subject word bag if a word bag was sent
    if word_bag is not False and word_bag is not None:
        bow = '|'.join(word_bag) # bag of word
        # na=False: records without a text value count as "no match"; without it
        # the mask contains NaN and the boolean indexing raises ValueError.
        df = df[df['text'].str.contains(bow, case=False, na=False)]

        #updating filtered tweets
        if track_counts:
            update_csv(csv_path = control_path, val_filter = file_name, col_update = 'count_filtered', val_update = len(df))

    return df

In [44]:
def download_file_csv(item_name, file_name, download_folder = 'Downloads', control_file = 'Local'):
    """Download one file of an Internet Archive item unless it is already logged.

    Parameters
    ----------
    item_name : str
        Identifier of the Internet Archive item.
    file_name : str
        Name of the file inside the item.
    download_folder : str, optional
        Folder, created under the working directory, that receives the file.
    control_file : str, optional
        Path of the control CSV. With the default ``'Local'`` a
        ``control_download.csv`` is created inside ``download_folder``.

    Returns
    -------
    str or bool
        Path of the downloaded file, or ``False`` when ``file_name`` is
        already listed in the control file and the download was skipped.
    """
    path = check_folder(download_folder)

    if control_file == 'Local':
        #calling function for getting control file
        # create_csv names its first parameter ``file_path``; the former
        # ``control_path=`` keyword raised TypeError on this branch.
        control_path = create_csv(file_path = path, file_name = 'control_download.csv')
    else:
        control_path = control_file

    reader_data = read_csv(control_path)

    #check if the file has been downloaded before
    if any(row['name'] == file_name for row in reader_data):
        return False

    #Getting the item; only needed once we know a download is required
    item = internetarchive.get_item(item_name)

    #downloading
    item.download(
        destdir=path,  # The directory to download files to
        ignore_existing=True,  # Skip files that already exist locally
        checksum=True,  # Skip files based on checksum
        verbose=True,  # Print progress to stdout
        retries=100,  # The number of times to retry on failed requests
        no_directory=True,  # Download without the identifier
        files = file_name)

    #Adding file name on control list
    #getting Metadata
    metadata = next((f for f in item.item_metadata['files'] if f["name"] == file_name), None)
    # a file missing from the metadata is still logged, with an empty size
    size = metadata['size'] if metadata is not None else ''
    row = {'name':file_name, 'datetime':datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'type':'download', 'size':size}
    insert_csv(csv_path = control_path, row = row)

    return path + f'/{file_name}'

In [45]:
def tar_file_csv(tar_file, file_name, extration_folder='Extraction', control_file='Local'):
    """Extract one member of an open tar archive unless it is already logged.

    Parameters
    ----------
    tar_file : tarfile.TarFile
        Archive opened for reading.
    file_name : str
        Name of the member to extract.
    extration_folder : str, optional
        Folder, created under the working directory, that receives the
        extracted member.
    control_file : str, optional
        Path of the control CSV. With the default ``'Local'`` a
        ``control_extraction.csv`` is created inside ``extration_folder``.

    Returns
    -------
    str or bool
        Path of the extracted file, or ``False`` when ``file_name`` is
        already listed in the control file and the extraction was skipped.
    """
    path = check_folder(extration_folder)

    if control_file == 'Local':
        #calling function for getting control file
        # create_csv names its first parameter ``file_path``; the former
        # ``control_path=`` keyword raised TypeError on this branch.
        control_path = create_csv(file_path = path, file_name = 'control_extraction.csv')
    else:
        control_path = control_file

    reader_data = read_csv(control_path)

    #check if the file has been extracted before
    if any(row['name'] == file_name for row in reader_data):
        return False

    #Extracting file
    # NOTE(review): the member name comes from the archive itself; if archives
    # from untrusted sources are ever processed, validate it before extracting.
    tar_file.extract(file_name, path=path)

    #Saving files name on control file
    row = {'name' : file_name, 'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'type' : 'extraction', 'size' : tar_file.getmember(name=file_name).size}
    insert_csv(csv_path = control_path, row = row)

    return path + f'/{file_name}'

# Code

In [None]:
# Month to process and the Internet Archive item that stores it
month = '202201'
item_name = 'archiveteam-twitter-stream-2022-01'
tweets_col = ['created_at', 'text', 'entities']

# Words searched in the tweet text and the language kept (same for every file)
bow = ['vaccination', 'vaccines', 'vaccine']
tweets_lang = 'en'

#file control in Google Drive
gdrive_path = check_folder(month, g_path)
control_file = create_csv(gdrive_path)
tweets = create_csv(gdrive_path, file_name = f'{month}_tweets.csv', fieldnames = tweets_col)

#Getting Itens names
file_names = get_file_name(item_name = item_name)

for dlf in tqdm(file_names, desc='Download progress'):

    #downloading the file (False means it is already listed in the control file)
    file = download_file_csv(item_name = item_name, file_name = dlf, control_file = control_file)

    if file is False:
        continue

    #Open tar file; the with block closes it even if processing a member fails
    with tarfile.open(file, "r") as tar:

        #Getting json files' names
        tar_file_names = [member for member in tar.getnames() if member.endswith('.json.gz')]

        for tf in tqdm(tar_file_names, desc='Unzip progress'):

            #call function to extract files
            js = tar_file_csv(tar_file = tar, file_name = tf, control_file = control_file)

            #if the file has already been unzipped that part will be skipped
            if js is False:
                continue

            #Getting, cleaning and storing Data
            df = filter_data(file_path = js, file_name = tf, word_bag = bow, lang = tweets_lang, control_path = control_file, get_col = tweets_col)

            if not df.empty:

                for index, row in df.iterrows():
                    # entities is serialised as JSON text so it fits in one CSV cell
                    row_d = {'created_at': row['created_at'],'text': row['text'],'entities': json.dumps(row['entities'])}
                    insert_csv(csv_path = tweets, row = row_d, fieldnames=tweets_col)

            #deleting json file after processing
            os.remove(js)

    #removing file downloaded (the tar file is closed at this point)
    os.remove(file)
        

Download progress:   0%|          | 0/31 [00:00<?, ?it/s]

archiveteam-twitter-stream-2022-01:

 downloading twitter-stream-20220127.tar:   0%|          | 0.00/2.67G [00:00<?, ?iB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 1.00M/2.67G [00:00<34:00, 1.41MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 2.00M/2.67G [00:00<17:56, 2.66MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 3.00M/2.67G [00:01<15:53, 3.00MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 4.00M/2.67G [00:01<12:28, 3.83MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 5.00M/2.67G [00:01<12:48, 3.72MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 6.00M/2.67G [00:01<10:54, 4.37MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 7.00M/2.67G [00:02<11:37, 4.10MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 8.00M/2.67G [00:02<10:13, 4.66MiB/s][A
 downloading twitter-stream-20220127.tar:   0%|          | 9.00M/2.67G [00:02<11:

Unzip progress:   0%|          | 0/1440 [00:00<?, ?it/s]

/content/Extraction created.


archiveteam-twitter-stream-2022-01:

 downloading twitter-stream-20220128.tar:   0%|          | 0.00/2.56G [00:00<?, ?iB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 1.00M/2.56G [00:01<1:11:06, 644kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 2.00M/2.56G [00:03<1:11:15, 642kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 3.00M/2.56G [00:05<1:26:04, 531kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 4.00M/2.56G [00:08<1:36:04, 476kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 5.00M/2.56G [00:10<1:36:28, 474kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 6.00M/2.56G [00:12<1:38:51, 462kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 7.00M/2.56G [00:15<1:44:52, 435kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 8.00M/2.56G [00:17<1:44:24, 437kiB/s][A
 downloading twitter-stream-20220128.tar:   0%|          | 9.00M/2.56G [0