In [None]:
'''
File structure reference, need to modify
arxiv_project/
  ├─ requirements.txt
  ├─ scripts/
  │   ├─ data_preprocessing.py #Clean, transform JSON -> CSV
  │   ├─ embedding_script.py #Generate embeddings via Transformers
  │   ├─ train_preparation.py #Convert embeddings to numeric CSV
  │   ├─ cluster_evaluation.py #Evaluate silhouette, Davies-Bouldin
  │   ├─ pipeline_definition.py #SageMaker Pipeline definition
  └─ buildspec.yml #For building up
'''

In [17]:
!pip install transformers torch scikit-learn boto3 sagemaker



In [14]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.image_uris import retrieve
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
import json
import pandas as pd
import os
from datetime import datetime

 - Because data is in a .json and needs to be converted to .csv for model

In [3]:
#REgion, role, session below
role = "arn:aws:iam::221082214706:role/MYLabRole" # Hardcode, maybe avoids errors
region = "us-east-1"

# Boto3 + SageMaker session 
boto_sess = boto3.Session(region_name=region)
session = sagemaker.Session(boto_session=boto_sess)
print(f"Using region: {region}")
print(f"USing role: {role}")

Using region: us-east-1
USing role: arn:aws:iam::221082214706:role/MYLabRole


In [5]:
# Setup S3 paths, chunk size, etc.
bucket_name = "arxiv-project-bucket"
raw_data_key = "arxiv-metadata-oai-snapshot.json"  # 4GB JSON file, need 4xlarge to parse
processed_prefix = "processed-data"
chunk_size = 200000 #  Reduce chunks of data flow in because, again, 1.7 million, 4GB dataset

Note: Bucket created already in previous steps, super easy, becareful

In [None]:
Note: Data Ingestion, Script processing, etc.

In [6]:
raw_data_s3_uri = f"s3://{bucket_name}/{raw_data_key}"
processed_data_s3_uri = f"s3://{bucket_name}/{processed_prefix}/"

In [8]:
# Script Processor to run data_preprocessing.py file
processing_image_uri = retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1'
)

script_processor = ScriptProcessor(
    image_uri=processing_image_uri,
    command=['python3'],
    role=role,
    instance_count=1,
    instance_type='ml.m5.4xlarge', # Need larger memory, maybe 8x size? uh oh to money
    volume_size_in_gb=200, # disk memory for temporary files pls work
    max_runtime_in_seconds=6*3600, # 6 hours runtimejust in case
    env={"AWS_DEFAULT_REGION": region}
)

In [23]:
script_processor.run(
    code='data_preprocessing.py', # .py script with streaming/chunking logic
    inputs=[
        ProcessingInput(
            source=raw_data_s3_uri,
            destination="/opt/ml/processing/input"
        )
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=processed_data_s3_uri
        )
    ],
    arguments=[
        "--input-data", "/opt/ml/processing/input",
        "--output-data", "/opt/ml/processing/output",
        "--chunk-size", str(chunk_size)
    ]
)

..............[34mCollecting transformers==4.26.1
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 100.5 MB/s eta 0:00:00[0m
[34mCollecting torch==1.13.1
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)[0m
[34m     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 887.5/887.5 MB 1.3 MB/s eta 0:00:00[0m
[34mCollecting sagemaker
  Downloading sagemaker-2.229.0-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 103.8 MB/s eta 0:00:00[0m
[34mCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 133.9 MB/s eta 0:00:00[0m
[34mCollecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 kB 28.2 MB/s eta 0:00:00[0m
[34mCollecting pyyaml>=5.1
  Downloading PyYA

In [24]:
# Inspect job hopefully actually done
job_desc = script_processor.jobs[-1].describe()
job_desc

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://arxiv-project-bucket/arxiv-metadata-oai-snapshot.json',
    'LocalPath': '/opt/ml/processing/input',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-221082214706/sagemaker-scikit-learn-2025-01-30-05-51-58-714/input/code/data_preprocessing.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'output-1',
    'S3Output': {'S3Uri': 's3://arxiv-project-bucket/processed-data/',
     'LocalPath': '/opt/ml/processing/output',
     'S3UploadMode': 'EndOfJob'},
    'AppManaged': False}]},
 'ProcessingJobName': 'sa

In [26]:
!pip show sagemaker

Name: sagemaker
Version: 2.237.3
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk
Author: Amazon Web Services
Author-email: 
License: 
Location: /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages
Requires: attrs, boto3, cloudpickle, docker, fastapi, google-pasta, importlib-metadata, jsonschema, numpy, omegaconf, packaging, pandas, pathos, platformdirs, protobuf, psutil, pyyaml, requests, sagemaker-core, schema, smdebug-rulesconfig, tblib, tqdm, urllib3, uvicorn
Required-by: 


In [27]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.238.0-py3-none-any.whl.metadata (16 kB)
Downloading sagemaker-2.238.0-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.237.3
    Uninstalling sagemaker-2.237.3:
      Successfully uninstalled sagemaker-2.237.3
Successfully installed sagemaker-2.238.0


In [39]:
!pip install --upgrade --force-reinstall sagemaker

Collecting sagemaker
  Using cached sagemaker-2.238.0-py3-none-any.whl.metadata (16 kB)
Collecting attrs<24,>=23.1.0 (from sagemaker)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting boto3<2.0,>=1.35.75 (from sagemaker)
  Downloading boto3-1.36.9-py3-none-any.whl.metadata (6.6 kB)
Collecting cloudpickle>=2.2.1 (from sagemaker)
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting docker (from sagemaker)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastapi (from sagemaker)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting google-pasta (from sagemaker)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting importlib-metadata<7.0,>=1.4.0 (from sagemaker)
  Using cached importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting jsonschema (from sagemaker)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting numpy<2.0,>=1.

In [9]:
print("Actual runtime version:", sagemaker.__version__)

Actual runtime version: 2.237.3


In [21]:
# Feature Group name, record ID, event time, all the good stuff
# 2) Basic Feature Group parameters
feature_group_name = "arxiv-feature-group_v2"
record_identifier_name = "id"
event_time_name = "event_time"

# manual definition for features, cannot be dict because outdated sagemaker somehow
feature_definitions = [
    FeatureDefinition(feature_name="id", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="abstract", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="update_date",feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="year", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="month", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="authors", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="event_time", feature_type=FeatureTypeEnum.STRING),
]

# FeatureGroup object
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=session
)
# Directly assign the feature definitions (instead of load_feature_definitions)
feature_group.feature_definitions = feature_definitions

s3_uri = f"s3://{bucket_name}/feature-store-offline/{feature_group_name}"


# 7) Now create the Feature Group
try:
    feature_group.create(
        s3_uri,
        record_identifier_name=record_identifier_name,
        event_time_feature_name=event_time_name,
        role_arn=role,
        enable_online_store=True,          
        # Store in the Online store for low-latency access
        # OfflineStoreConfig=offline_store_config comment out for now, somehow the version of Sagemaker 
        # Is outdated and different than the actual runtime nevironment somehow. 
        # May need to consider a forceful !pip install --upgrade --force-reinstall sagemaker
        # followed by importing and printing print("Version: ", sagemaker.__version__)
    )
    # Wait until the Feature Group is created
    print(f"Feature group '{feature_group_name}' created successfully!")
except Exception as e:
    print("Error creating Feature Group:", e)

Feature group 'arxiv-feature-group_v2' created successfully!


In [22]:
!aws sagemaker list-feature-groups --region us-east-1

{
    "FeatureGroupSummaries": [
        {
            "FeatureGroupName": "neighborhood-feature-group-26-03-30-30",
            "FeatureGroupArn": "arn:aws:sagemaker:us-east-1:221082214706:feature-group/neighborhood-feature-group-26-03-30-30",
            "CreationTime": 1737862233.01,
            "FeatureGroupStatus": "Created",
            "OfflineStoreStatus": {
                "Status": "Active"
            }
        },
        {
            "FeatureGroupName": "arxiv-feature-group_v2",
            "FeatureGroupArn": "arn:aws:sagemaker:us-east-1:221082214706:feature-group/arxiv-feature-group_v2",
            "CreationTime": 1738289451.937,
            "FeatureGroupStatus": "Creating"
        },
        {
            "FeatureGroupName": "arxiv-feature-group",
            "FeatureGroupArn": "arn:aws:sagemaker:us-east-1:221082214706:feature-group/arxiv-feature-group",
            "CreationTime": 1738288517.203,
            "FeatureGroupStatus": "Created"
        }
    ]
}
