# Requirements

# Imports

In [1]:
import internetarchive
import os
import tarfile
import csv
from datetime import datetime
from tqdm.notebook import tqdm
import pandas as pd
import configparser
import mysql.connector

In [73]:
import warnings

# Suppressing the warnings
warnings.filterwarnings('ignore') 

# Functions

## General

In [2]:
def check_folder(name_folder):            
    #Creating folder if that doesn't exist
    p = %pwd
    p = p + f'/{name_folder}'
    path = os.path.expanduser(p)
    if not os.path.exists(path):
        os.makedirs(path)
        print("{} created.".format(path))
        
    return path

In [3]:
def get_file_name(item_name, ext='*tar'):
    
    file_names = [f.name for f in internetarchive.get_files(item_name, glob_pattern= ext)]
    
    return file_names

## Cleaning and Transfomations

In [4]:
def get_data(file_path, word_bag = False, lang = False):
    
    #reading json
    df = pd.read_json(file_path, lines=True, compression='gzip')
    
    #Filtering language if a language was sent
    if not lang == False:
        df = df[df.lang == lang]
    
    #taking away columns unnecessary
    df = df.loc[:, ['created_at', 'text', 'entities']]
    
    #filtering Tweets using subject word bag if a word bag was sent
    if not word_bag == False:
        bow = '|'.join(word_bag) # bag of word
        df = df[df['text'].str.contains(bow, case=False)]
    
    return df

## Control Operations

### CSV

In [5]:
def control_op_csv(control_file):

    #if file doesn't exist I'll create it
    if not os.path.exists(control_file):
        with open(control_file, 'w', newline='') as control_csv:
            writer = csv.DictWriter(control_csv, fieldnames=['name','datetime'])
            writer.writeheader()
        control_csv.close()
        print("{} created.".format(control_file))
            
    #Reading the control file
    with open(control_file, 'r', newline='') as control_csv:
        reader = csv.DictReader(control_csv)
        reader_data = [r for r in reader]
        control_csv.seek(0)
    control_csv.close()
    
    return reader_data

In [63]:
path = check_folder('Downloads')

#calling function for getting control file
control_path = path + '/control_download.csv'    
reader_data = control_op_csv(control_path)
reader_data

[{'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-28 17:39:13'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-28 17:41:06'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-28 23:46:38'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:09:23'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:14:17'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:15:21'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:15:46'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:21:21'},
 {'name': 'twitterbird_thumb.jpg', 'datetime': '2023-04-29 00:23:22'},
 {'name': 'twitter-stream-20221001.tar', 'datetime': '2023-04-29 16:02:46'}]

In [6]:
def download_file_csv(item_name, file_name, download_folder = 'Downloads'):
    
    path = check_folder(download_folder)
    
    #calling function for getting control file
    control_path = path + '/control_download.csv'    
    reader_data = control_op_csv(control_path)
    
    #Getting Itens names
    item = internetarchive.get_item(item_name)
        
    #openning control file to add file names on the list
    with open(control_path, 'a', newline='') as control_download:
        writer = csv.DictWriter(control_download, fieldnames=['name','datetime'])
        
        #check if the file has been downloaded before
        if not any(row['name'] == file_name for row in reader_data):
            
            #downloading
            r = item.download(
                destdir=path,  # The directory to download files to
                ignore_existing=True,  # Skip files that already exist locally
                checksum=True,  # Skip files based on checksum
                verbose=True,  # Print progress to stdout
                retries=100,  # The number of times to retry on failed requests
                no_directory=True,  # Download withtout the identifier
                files = file_name)
            
            #Adding file name on control list
            row = {'name' : file_name, 'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
            writer.writerow(row)
            
    control_download.close()
    
    path_file = path + f'/{file_name}'
    
    return path_file

In [7]:
def tar_file_csv(tar_file, file_name, extration_folder='Extraction'):

    path = check_folder(extration_folder)
    
    #calling function for getting control file
    control_path = path + '/control_extraction.csv'    
    reader_data = control_op(control_path)

    #openning control file to add file names on the list
    with open(control_path, 'a', newline='') as control_tar:
        writer = csv.DictWriter(control_tar, fieldnames=['name','datetime'])
        
        #check if the file has been extrated before
        if not any(row['name'] == file_name for row in reader_data):

            #Extracting file
            tar_file.extract(file_name, path=path)

            #Saving files name on control file
            row = {'name' : file_name, 'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
            writer.writerow(row)
            
            path_file = path + f'/{file_name}'
            
        else:
            
            path_file = False
    
    control_tar.close()
    
    
    return path_file

### Mysql

In [8]:
def open_connection():

    #getting configuration
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Connect to Mysql
    conn = mysql.connector.connect(
        host = config['mysql']['host'],
        user = config['mysql']['user'],
        password = config['mysql']['password'],
        database = config['mysql']['database']
    )
        
    return conn 

In [66]:
def download_file_db(item_name, file_name, download_folder = 'Downloads'):
    
    path = check_folder(download_folder)
    
    #openning connection with mysql
    conn = open_connection()
    cursor = conn.cursor()
    
    #query to get information from Mysql
    reader_data = pd.read_sql_query("SELECT * FROM control WHERE type ='download'", conn).to_dict(orient='records')
    
    #getting Itens names
    item = internetarchive.get_item(item_name)
    
    #check if the file has been downloaded before
    if not any(row['name'] == file_name for row in reader_data):

        #downloading
        r = item.download(
            destdir=path,  # The directory to download files to
            ignore_existing=True,  # Skip files that already exist locally
            checksum=True,  # Skip files based on checksum
            verbose=True,  # Print progress to stdout
            retries=100,  # The number of times to retry on failed requests
            no_directory=True,  # Download withtout the identifier
            files = file_name)

        #Adding file name on control list
        row = {'name' : file_name, 'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}
        cursor.execute(f"INSERT INTO control (name, datetime, type) VALUES ('{row['name']}', '{row['datetime']}', 'download')")
    
    conn.commit()
    cursor.close()
    conn.close()
    
    path_file = path + f'/{file_name}'
    
    return path_file

In [70]:
def tar_file_db(tar_file, file_name, extration_folder='Extraction'):
    
    path = check_folder(extration_folder)
    
    #openning connection with mysql
    conn = open_connection()
    cursor = conn.cursor()
    
    #query to get information from Mysql about control extaction
    reader_data = pd.read_sql_query("SELECT * FROM control WHERE type ='extraction'", conn).to_dict(orient='records')
    
    
    #check if the file has been unzipped before
    if not any(row['name'] == file_name for row in reader_data):

        #Extracting file
        tar_file.extract(file_name, path=path)

        #Adding file name on control list
        row = {'name' : file_name, 'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}
        cursor.execute(f"INSERT INTO control (name, datetime, type) VALUES ('{row['name']}', '{row['datetime']}', 'extraction')")
        
        path_file = path + f'/{file_name}'
    
    else: #case the file has been unzipped
            
        path_file = False
    
    
    conn.commit()
    cursor.close()
    conn.close()
    
    return path_file

# Code

In [67]:
#dataframe to store the tweets
tweets = pd.DataFrame({'created_at': [], 'text': [], 'entities': []})

In [74]:
item_name = 'archiveteam-twitter-stream-2022-10'

#Getting Itens names
file_names = get_file_name(item_name = item_name)#, ext = '*jpg')

x = 0 #for testing 

for dlf in tqdm(file_names, desc='Download progress'):
    
    if x < 1: #for testing 
        #downloading the file
        file = download_file_db(item_name = item_name, file_name = dlf)

        #Open tar file
        tar = tarfile.open(file, "r")

        #Getting json files' names
        tar_file_names = list(filter(lambda t: t.endswith('.json.gz'), tar.getnames()))

        for tf in tqdm(tar_file_names, desc='Unzip progress'):
            js = tar_file_db(tar_file = tar, file_name = tf)
            
            #Case file has already been unzipped
            if not js == False:
            
                #Getting, cleaning and storing Data
                bow = ['vaccination', 'vaccines', 'vaccine']
                df = get_data(file_path = js, lang = 'en', word_bag = bow)
                if len(df) > 0:
                    tweets = pd.concat([tweets,df], ignore_index=True)

                #deleting json file after processing
                os.remove(js)

        tar.close()
        
        x =+ 1

Download progress:   0%|          | 0/31 [00:00<?, ?it/s]

Unzip progress:   0%|          | 0/1421 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [61]:
def query_db(table, query = 'SELECT * '):
    
    
    conn = open_connection()
    
    #query to get information from Mysql
    df = pd.read_sql_query(f'{query}FROM {table}', conn)

    # Close connection with Mysql
    conn.close()
    
    return df

In [75]:
query_db(table = 'control')

Unnamed: 0,idnew_table,name,datetime,type
0,4,Teste,2023-04-29 21:39:28,download
1,5,twitter-stream-20221001.tar,2023-04-29 21:50:55,download
2,6,twitter-stream-20221001.tar,2023-04-29 21:52:10,download
3,7,twitter-stream-20221001.tar,2023-04-29 21:52:45,download
4,8,20221001/20221001235900.json.gz,2023-04-29 22:07:46,extraction
...,...,...,...,...
376,380,20221001/20221001174500.json.gz,2023-04-29 22:13:19,extraction
377,381,20221001/20221001174800.json.gz,2023-04-29 22:13:20,extraction
378,382,20221001/20221001174700.json.gz,2023-04-29 22:13:21,extraction
379,383,20221001/20221001174900.json.gz,2023-04-29 22:13:21,extraction


In [26]:
conn = open_connection()
cursor = conn.cursor()

now = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

cursor.execute(f"INSERT INTO control (name, datetime, type) VALUES ('Teste', '{now}', 'download')")

conn.commit()
cursor.close()
conn.close()