In [1]:
import boto3
import pyarrow as pa
import pyarrow.parquet as pq  # For handling Arrow formats
from io import BytesIO
import pandas as pd

# Lift pandas' display truncation: a limit of None means every row and
# every column of a frame is rendered in cell output.
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Where the raw Arrow file lives in S3 (bucket name + object key)
S3_BUCKET = "ishtar-ai"
S3_KEY = "data/raw/data-00000-of-00001.arrow"

# Shared S3 client, used by the loader function defined below
s3 = boto3.client(service_name="s3")


def load_arrow_data_from_s3(bucket, key):
    """Download an Arrow IPC stream file from S3 and read it into a table.

    Uses the module-level ``s3`` client. The whole object is read into
    memory before being parsed.

    Args:
        bucket: Name of the S3 bucket holding the object.
        key: Object key of the Arrow file inside ``bucket``.

    Returns:
        The table produced by reading the full IPC stream, or ``None`` if
        any step raised. In that case the error is printed, not re-raised.
    """
    try:
        # Pull the complete object body into memory
        s3_object = s3.get_object(Bucket=bucket, Key=key)
        raw_bytes = s3_object["Body"].read()

        # Parse the bytes as an Arrow IPC stream and materialise every batch
        stream_reader = pa.ipc.open_stream(BytesIO(raw_bytes))
        table = stream_reader.read_all()
        print("Data successfully loaded from S3!")
        return table
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None
        print(f"Error loading data from S3: {e}")
        return None


# Load the data (the loader returns None and prints the error on failure)
arrow_data = load_arrow_data_from_s3(S3_BUCKET, S3_KEY)

# Compare against None explicitly: a pyarrow Table's truthiness follows its
# row count, so `if arrow_data:` would also skip a successfully loaded but
# empty table. Fail loudly instead of leaving `df` undefined (or stale from an
# earlier kernel run), which would only surface later as a confusing error.
if arrow_data is None:
    raise RuntimeError(
        f"Could not load s3://{S3_BUCKET}/{S3_KEY}; see the error printed above."
    )

# Convert the Arrow table to a pandas DataFrame for analysis
df = arrow_data.to_pandas()


Data successfully loaded from S3!


In [2]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,id,publish_channel,title,start_duration,end_duration,lengthSeconds,total_comments,total_unique_users,comments
0,FUzLx1l37RE,NBCNews,Blinken reiterates Israel support but urges re...,2023-11-03 19:38:34,2023-11-04 17:38:34,241,25,21,"[{'author': '@LMLewis', 'channel': 'UCNUa3f8nw..."
1,PcQaG4sC9BM,NBCNews,Full Special Report: Israel declares war after...,2023-10-07 19:42:03,2023-11-04 12:38:36,1484,21905,14860,"[{'author': '@TuN77', 'channel': 'UC2lMdM6_BZY..."
2,X2t6NyGP1-w,NBCNews,Hamas official claims it's their 'legal right'...,2023-11-03 19:49:36,2023-11-04 19:46:34,335,1853,1170,"[{'author': '@Dolfan', 'channel': 'UC9xsEyJFc6..."
3,cd24N0tF-jY,NBCNews,FBI director warns of growing domestic threat ...,2023-11-01 19:50:24,2023-11-04 18:50:41,143,987,670,"[{'author': '@michaelwells1783', 'channel': 'U..."
4,ICR5ml2YPkI,NBCNews,Israel’s president: 23-year-old Israeli hostag...,2023-10-31 19:51:23,2023-11-04 19:44:35,162,5275,3475,"[{'author': '@shakeybeatz', 'channel': 'UCVTh8..."
