In [None]:
# The pip install cell below needs to be run in every notebook on AWS

In [None]:
# TODO: need a script that converts every feature to numeric form so it can be fed into the model

In [1]:
!pip install transformers torch scikit-learn boto3 sagemaker

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from t

In [2]:
import argparse
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import os
import numpy as np
import sagemaker
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.image_uris import retrieve
import boto3



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Job configuration: IAM role, AWS region, SageMaker session and S3 locations.

# The execution role ARN is hardcoded on purpose (original author's note:
# "maybe avoids errors"). It embeds one specific AWS account ID, so it must be
# updated before running this notebook in a different account.
role = "arn:aws:iam::221082214706:role/MYLabRole"
region = "us-east-1"

# Boto3 + SageMaker session, both bound to the region above
boto_sess = boto3.Session(region_name=region)
session = sagemaker.Session(boto_session=boto_sess)
print(f"Using region: {region}")
print(f"Using role: {role}")

# S3 layout: job input is read from prefix_in, job output is written to prefix_out.
# NOTE(review): prefix_out is nested inside prefix_in, so any job that takes the
# whole prefix_in as its input will also receive files previously written to
# prefix_out on a re-run. Confirm the processing script ignores them, or move
# the output to a sibling prefix.
bucket_name = "arxiv-project-bucket"
prefix_in = "processed-data/"
prefix_out = "processed-data/embeddings/"

Using region: us-east-1
USing role: arn:aws:iam::221082214706:role/MYLabRole


In [4]:
# Note: https://github.com/aws/deep-learning-containers/blob/master/available_images.md#hugging-face-training-containers
# If you follow the link and check the Hugging Face training containers, all of them require a GPU, so make sure GPU instance access is enabled

In [5]:
# Hugging Face Deep Learning Container (DLC) used as the processing image, instead
# of pip-installing the dependencies the way data_preprocessing does.
# The registry host is built from `region` (set in the configuration cell) so the
# image URI follows the configured region instead of silently staying on us-east-1.
# NOTE(review): registry account 763104351884 serves the DLC images in most, but
# not all, AWS regions — check the available-images list before changing `region`.
huggingface_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/"
    "huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04"
)

In [6]:
# Rest should be similar to preprocessing script

In [7]:
# Settings for the SageMaker processing job that runs the embedding script
# with python3 inside the Hugging Face container image.
processor_settings = dict(
    sagemaker_session=session,
    role=role,
    image_uri=huggingface_image_uri,
    command=["python3"],
    # One GPU instance. Original author's note: permissions were set up for this
    # instance type, and a smaller configuration can probably be used.
    instance_type="ml.g4dn.2xlarge",
    instance_count=1,
    volume_size_in_gb=30,
    max_runtime_in_seconds=20500,  # hard stop after roughly 5.7 hours
)
embedding_processor = ScriptProcessor(**processor_settings)

In [8]:
# Launch the embedding job; the processing logic itself lives in embedding_script.py.
container_input_dir = "/opt/ml/processing/input"
container_output_dir = "/opt/ml/processing/output"

# Everything under prefix_in is made available to the job at the container input directory.
csv_input = ProcessingInput(
    source=f"s3://{bucket_name}/{prefix_in}",
    destination=container_input_dir,
)

# The container output directory is uploaded to S3 under prefix_out.
embedding_output = ProcessingOutput(
    source=container_output_dir,
    destination=f"s3://{bucket_name}/{prefix_out}",
)

embedding_processor.run(
    code="embedding_script.py",
    inputs=[csv_input],
    outputs=[embedding_output],
    # Pass the in-container read/write locations to the script as CLI arguments.
    arguments=[
        "--input-csv-dir", container_input_dir,
        "--output-csv-dir", container_output_dir,
    ],
)

[34mCSV chunk files: ['arxiv_preprocessed_part8.csv', 'arxiv_preprocessed_part12.csv', 'arxiv_preprocessed_part13.csv', 'arxiv_preprocessed_part4.csv', 'arxiv_preprocessed_part2.csv', 'arxiv_preprocessed_part6.csv', 'arxiv_preprocessed_part11.csv', 'arxiv_preprocessed_part0.csv', 'arxiv_preprocessed_part5.csv', 'arxiv_preprocessed_part7.csv', 'arxiv_preprocessed_part3.csv', 'arxiv_preprocessed_part1.csv', 'arxiv_preprocessed_part10.csv', 'arxiv_preprocessed_part9.csv'][0m
[34mStart embedding arxiv_preprocessed_part8.csv (rows: 200000)[0m
[34m...arxiv_preprocessed_part8.csv: processed 10000 / 200000 rows[0m
[34m...arxiv_preprocessed_part8.csv: processed 20000 / 200000 rows[0m
[34m...arxiv_preprocessed_part8.csv: processed 30000 / 200000 rows[0m
[34m...arxiv_preprocessed_part8.csv: processed 40000 / 200000 rows[0m
[34m...arxiv_preprocessed_part8.csv: processed 50000 / 200000 rows[0m
[34m...arxiv_preprocessed_part8.csv: processed 60000 / 200000 rows[0m
[34m...arxiv_prepro

In [9]:
# Sanity check: pull the description of the most recent job started by this
# processor and display it as the cell output.
latest_job = embedding_processor.jobs[-1]
job_desc = latest_job.describe()
job_desc

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://arxiv-project-bucket/processed-data/',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-221082214706/huggingface-pytorch-training-2025-02-01-19-56-07-321/input/code/embedding_script.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'output-1',
    'S3Output': {'S3Uri': 's3://arxiv-project-bucket/processed-data/embeddings/',
     'LocalPath': '/opt/ml/processing/output',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False}]},
 'ProcessingJobName': 'hugg

 - Should result in a CSV with an additional column abstract_embedding, at s3://arxiv-project-bucket/processed-data/embeddings/embeddings.csv.