# Data Ingestion

## AWS Open Data

In [None]:
import boto3
import pandas as pd
from io import BytesIO
from botocore import UNSIGNED
from botocore.config import Config
import os

In [None]:
# Name of the S3 bucket that the AWS cells below list and read objects from.
bucket_name = "abeja-cc-ja"
# Relative directory where the chunk CSV files produced below are written.
output_folder = '../../../datasets/AWS'

In [None]:
def read_aws_public_csv(bucket_name, object_key):
    """Download one CSV object from a public S3 bucket into a DataFrame.

    The whole object body is read into memory before it is parsed, so this
    is only suitable for objects that fit in RAM; use
    ``read_aws_public_csv_in_chunks`` for larger objects.

    Args:
        bucket_name: Name of the S3 bucket.
        object_key: Key of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame parsed with the default ``pd.read_csv`` settings.
    """
    # Use anonymous (unsigned) requests, like the other AWS helpers in this
    # notebook: a default client signs requests and therefore fails when no
    # AWS credentials are configured, even though the bucket is public.
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    obj = s3.get_object(Bucket=bucket_name, Key=object_key)
    df = pd.read_csv(BytesIO(obj['Body'].read()))
    return df

In [None]:
def read_aws_public_csv_in_chunks(bucket_name, object_key, chunksize=1000):
    """Stream a ';'-delimited CSV object from a public S3 bucket in chunks.

    The object is fetched with an unsigned (anonymous) client and its
    streaming body is handed directly to ``pd.read_csv``, so rows are parsed
    lazily as the returned iterator is consumed.

    Args:
        bucket_name: Name of the S3 bucket.
        object_key: Key of the CSV object inside the bucket.
        chunksize: Number of rows per yielded DataFrame. Previously this
            argument was ignored and 1000 rows were always used; the default
            is now 1000 so callers that omit it get the same chunk size.

    Returns:
        An iterator of pandas.DataFrame chunks. Malformed lines are skipped
        (``on_bad_lines='skip'``).
    """
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    obj = s3.get_object(Bucket=bucket_name, Key=object_key)
    stream = obj['Body']  # StreamingBody object

    # Return iterator of dataframes (chunks), honouring the requested size.
    return pd.read_csv(stream, chunksize=chunksize, delimiter=';', on_bad_lines='skip')

In [None]:
def list_all_objects(bucket_name):
    """Return the keys of every object stored in ``bucket_name``.

    An unsigned S3 client walks the ``list_objects_v2`` paginator; pages
    that carry no ``Contents`` entry contribute nothing to the result.

    Args:
        bucket_name: Name of the S3 bucket to enumerate.

    Returns:
        A list of object keys in the order the paginator yields them.
    """
    anonymous = Config(signature_version=UNSIGNED)
    client = boto3.client('s3', config=anonymous)
    page_iter = client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
    return [
        entry['Key']
        for page in page_iter
        for entry in page.get('Contents', [])
    ]


# Enumerate every object key in the bucket once; the download cell below
# slices this list to decide which objects to read.
all_keys = list_all_objects(bucket_name)

print(f"Found {len(all_keys)} files in the bucket.")


In [None]:
# TODO: STOP HERE IF YOU DON'T WANT TO DOWNLOAD ALL FILES
# Sampling limits for this cell; raise them to ingest more data.
MAX_FILES = 1            # number of bucket keys to read
MAX_CHUNKS_PER_FILE = 3  # number of chunks saved per key

# Create the destination once instead of on every loop iteration.
os.makedirs(output_folder, exist_ok=True)

for key in all_keys[:MAX_FILES]:
    print(f"Reading {key} in chunks ...")
    chunk_iter = read_aws_public_csv_in_chunks(bucket_name, key)
    # basename() already drops any '/'-separated prefix, so only the '.gz'
    # marker has to be stripped to build the output file stem.
    base_filename = os.path.basename(key).replace('.gz', '')

    for i, chunk in enumerate(chunk_iter):
        if i >= MAX_CHUNKS_PER_FILE:
            break  # stop once the per-file chunk limit is reached

        if i == 0:
            # Preview the first chunk, but still save it below: pulling it
            # out with next() would drop those rows from the saved files and
            # raise StopIteration on an empty object.
            print(chunk.head())

        chunk_file = os.path.join(output_folder, f"{base_filename}_chunk{i}.csv")
        chunk.to_csv(chunk_file, index=False)
        print(f"Saved chunk {i} of {key} to {chunk_file}")


## Kaggle

In [4]:
# NOTE: this import may fail the first time the cell is run; re-running the cell works.
import os
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# Kaggle dataset identifiers in "<owner>/<dataset>" form; each one is
# downloaded by the Kaggle download loop further down.
data_links = [
    'sahideseker/tweet-sentiment-classification-dataset',
    'abdallahwagih/emotion-dataset',
    'simaanjali/emotion-analysis-based-on-text',
    'durgeshrao9993/twitter-analysis-dataset-2022',
    'aadyasingh55/twitter-emotion-classification-dataset',
    'mgmitesh/sentiment-analysis-dataset'
]

In [5]:
import json

# Read the Kaggle credentials from the project-local kaggle.json so that no
# username or key is hard-coded in the notebook.
with open("../../../kaggle/kaggle.json", "r") as creds_file:
    creds = json.load(creds_file)

# Expose the credentials through the environment before authenticating.
os.environ['KAGGLE_USERNAME'] = creds['username']
os.environ['KAGGLE_KEY'] = creds['key']

# Client instance reused by the download loop.
api = KaggleApi()
api.authenticate()

In [6]:
# Download every dataset into a sub-folder named after a file extension it
# contains (e.g. ".../Kaggle/csv").
for link in data_links:
    files = api.dataset_list_files(link).files
    # Distinct file extensions (without the dot) present in the dataset.
    extensions = {os.path.splitext(f.name)[1].lstrip('.') for f in files}
    # Files without an extension would otherwise produce an empty folder name.
    extensions.discard('')
    # A set has no stable iteration order, so taking "the first" element can
    # pick a different folder on every run; choose the alphabetically
    # smallest extension to make the destination deterministic.
    folder = min(extensions, default='unknown')
    save_path = f'../../../datasets/Kaggle/{folder}'
    api.dataset_download_files(link, path=save_path, unzip=True)



Dataset URL: https://www.kaggle.com/datasets/sahideseker/tweet-sentiment-classification-dataset
Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/emotion-dataset
Dataset URL: https://www.kaggle.com/datasets/simaanjali/emotion-analysis-based-on-text
Dataset URL: https://www.kaggle.com/datasets/durgeshrao9993/twitter-analysis-dataset-2022
Dataset URL: https://www.kaggle.com/datasets/aadyasingh55/twitter-emotion-classification-dataset
Dataset URL: https://www.kaggle.com/datasets/mgmitesh/sentiment-analysis-dataset


## UCI ML

In [11]:
import requests
import zipfile

In [12]:
# Zip archive fetched by download_uci_dataset(); the %20 escapes are part of
# the URL and end up verbatim in the local file name.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip"
# Target directory for the UCI download.
# NOTE(review): this name shadows the builtin `dir`; it is also used as the
# default for `save_dir` below, so both places must change together if renamed.
dir = "../../../datasets/uci"

In [None]:
def unzip(local_filename, save_dir):
    """Extract every member of the zip archive ``local_filename`` into ``save_dir``.

    Args:
        local_filename: Path of the zip archive to read.
        save_dir: Directory the archive members are extracted into.
    """
    with zipfile.ZipFile(local_filename, mode='r') as archive:
        archive.extractall(path=save_dir)

In [14]:

def download_uci_dataset(url, save_dir=dir):
    """Download a zip archive from ``url`` and extract it into ``save_dir``.

    The archive is stored under the last path segment of the URL, taken
    verbatim (percent-escapes such as ``%20`` stay in the file name), and
    then extracted next to it with ``unzip``.

    Args:
        url: HTTP(S) address of the zip archive.
        save_dir: Directory for both the archive and its extracted contents;
            created if it does not exist.

    Returns:
        Path of the downloaded archive.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    os.makedirs(save_dir, exist_ok=True)
    local_filename = os.path.join(save_dir, url.split("/")[-1])
    # A timeout keeps an unresponsive server from hanging the kernel forever.
    response = requests.get(url, timeout=60)
    # Fail before writing anything: otherwise an HTML error page would be
    # saved as "*.zip" and only surface later as a confusing unzip error.
    response.raise_for_status()

    with open(local_filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {local_filename}")

    unzip(local_filename, save_dir)

    return local_filename

# Download and extract the archive at `url` using the default `save_dir`.
download_uci_dataset(url)


Downloaded: ../../../datasets/uci/sentiment%20labelled%20sentences.zip


'../../../datasets/uci/sentiment%20labelled%20sentences.zip'