# Tarfile Test

## Overview
An experiment in combining multiple pandas DataFrames into a single gzipped tar archive, uploading it to S3, and reading it back.

In [121]:
import os
import pandas as pd
import tarfile
from io import BytesIO
from tempfile import TemporaryDirectory
import boto3

print("Import complete.")

Import complete.


In [122]:
# S3 bucket that the archive is uploaded to and later read back from
input_bucket = "marites-comprehend-input"

In [69]:
# Load the three CSV files (read relative to the working directory) that
# will be bundled into the archive
users, following, posts = (
    pd.read_csv(csv_path)
    for csv_path in ('users.csv', 'following.csv', 'posts.csv')
)

## Write Archive to S3

In [129]:
def save_frames(tar_filename, frame_dict):
    """Pack several DataFrames into one in-memory gzipped tar archive.

    Each frame is serialised as CSV (without the index column) and stored
    in the archive as ``<tar_filename>/<file_name>``.

    Parameters
    ----------
    tar_filename : str
        Directory prefix given to every member inside the archive.
    frame_dict : dict
        Maps a member file name (e.g. ``'users.csv'``) to the DataFrame
        stored under that name. An empty dict yields an empty archive.

    Returns
    -------
    io.BytesIO
        Buffer holding the complete ``.tar.gz`` bytes, rewound to offset 0
        so it can be consumed with ``.read()`` as well as ``.getvalue()``.
    """
    # Local imports keep this cell self-contained.
    import posixpath
    import time

    tar_buffer = BytesIO()

    # Create a tarfile into which frames can be added
    with tarfile.open(fileobj=tar_buffer, mode='w:gz') as tfo:

        # Loop over all dataframes to be saved
        for file_name, df in frame_dict.items():

            # Member names always use '/' and are kept relative, mirroring
            # the normalisation TarFile.add() applies to arcname.
            archive_name = posixpath.join(tar_filename, file_name)
            archive_name = archive_name.replace(os.sep, '/').lstrip('/')

            # Serialise in memory: no temporary files are written, so an
            # absolute file_name can no longer land outside a temp directory.
            payload = df.to_csv(index=False).encode('utf-8')

            # A hand-built TarInfo needs its size set; mtime defaults to the
            # epoch, so stamp it with the current time instead.
            member = tarfile.TarInfo(name=archive_name)
            member.size = len(payload)
            member.mtime = int(time.time())
            tfo.addfile(member, BytesIO(payload))

    # The gzip trailer is only written when the tarfile is closed above;
    # rewind afterwards so readers start at the first byte.
    tar_buffer.seek(0)
    return tar_buffer


In [130]:
# Bundle the three frames under the 'output' prefix inside one archive
archived_frames = save_frames(
    'output',
    {
        'users.csv': users,
        'following.csv': following,
        'posts.csv': posts,
    },
)

In [131]:
# S3 object key under which the archive is stored
filename = 'tigergraph/output.tar.gz'

# Upload the raw archive bytes; the put() response is left as the last
# expression so it is displayed as the cell's output
s3_resource = boto3.resource('s3')
archive_object = s3_resource.Object(input_bucket, filename)
archive_object.put(Body=archived_frames.getvalue())

{'ResponseMetadata': {'RequestId': 'XX2DVH1APB7P73DV',
  'HostId': 'Z+OLXX/9q7B45i+sLlFVylv2csl4P5kKsXLd2GAN/Mrz0nCAMslHQTkF6NYg5pxh09xroVJah3k=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Z+OLXX/9q7B45i+sLlFVylv2csl4P5kKsXLd2GAN/Mrz0nCAMslHQTkF6NYg5pxh09xroVJah3k=',
   'x-amz-request-id': 'XX2DVH1APB7P73DV',
   'date': 'Fri, 15 Apr 2022 01:49:15 GMT',
   'x-amz-expiration': 'expiry-date="Sun, 17 Apr 2022 00:00:00 GMT", rule-id="comprehend-bucket-lifecycle"',
   'etag': '"8b2057cabb75509a30d9dff064ad2472"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'Expiration': 'expiry-date="Sun, 17 Apr 2022 00:00:00 GMT", rule-id="comprehend-bucket-lifecycle"',
 'ETag': '"8b2057cabb75509a30d9dff064ad2472"'}

## Read Archive from S3

In [132]:
# Fetch the archive object back from S3 using the same bucket and key
s3_client = boto3.client('s3')
input_tar_file = s3_client.get_object(
    Bucket=input_bucket,
    Key=filename,
)

# Read the whole response body into memory as bytes
input_tar_content = input_tar_file['Body'].read()

In [137]:
# Open the downloaded bytes as a tar archive. The context manager closes the
# archive once every member has been read.
with tarfile.open(fileobj=BytesIO(input_tar_content)) as tar:

    # The member name is deliberately not bound to `filename`: that variable
    # holds the S3 key used by the upload and download cells, and
    # overwriting it here would break re-running them.
    for tar_resource in tar:

        # extractfile() returns None for members that are not regular files
        # or links (e.g. directories); those carry no CSV data.
        member_file = tar.extractfile(tar_resource)
        if member_file is None:
            continue

        df = pd.read_csv(member_file, header=0)
        print(df.head())

                   id                  name         username
0            44196397             Elon Musk         elonmusk
1            34097500                beeple           beeple
2           276540738  𝔊𝔯𝔦𝔪𝔢𝔰 (⌛️,⏳) ᚷᚱᛁᛗᛖᛋ         Grimezsz
3          1231406720        Michael Sheetz  thesheetztweetz
4  959471389282578432         Eva Fox 🦊❤️🇺🇦          EvaFoxU
       user        following      date
0  elonmusk           beeple  04-14-22
1  elonmusk         Grimezsz  04-14-22
2  elonmusk  thesheetztweetz  04-14-22
3  elonmusk          EvaFoxU  04-14-22
4  elonmusk           planet  04-14-22
              tweet_id  username                created_at  \
0  1512886651940491270  elonmusk  2022-04-09T20:14:20.000Z   
1  1512886157876600833  elonmusk  2022-04-09T20:12:22.000Z   
2  1512813698011836422  elonmusk  2022-04-09T15:24:26.000Z   
3  1512787864458870787  elonmusk  2022-04-09T13:41:47.000Z   
4  1512785529712123906  elonmusk  2022-04-09T13:32:31.000Z   

                           