In [None]:
import os
import sagemaker
from sagemaker import get_execution_role

# Deployment-specific names are read from the environment rather than being
# hard-coded in the notebook.
bucket = os.getenv('BUCKET_NAME')
if not bucket:
    # Fail fast: os.getenv returns None for an unset variable, which would
    # otherwise be formatted into the literal path "s3://None/..." when the
    # training CSV is read further down.
    raise RuntimeError(
        "The BUCKET_NAME environment variable must be set to the S3 bucket "
        "that holds the training data."
    )

# Left optional (None when unset) because none of the cells visible here read it.
endpoint_name = os.getenv('ENDPOINT_NAME')

# Session whose default bucket is the one configured above.
sagemaker_session = sagemaker.Session(default_bucket=bucket)

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
region = sagemaker_session.boto_session.region_name

# Key prefix under which the train/test CSVs are uploaded.
s3_output_key_prefix = "training_output"

# Import Data

In [None]:
import pandas as pd

# S3 location of the raw training CSV inside the configured bucket.
data_input = f's3://{bucket}/AI_ML_Challenge_Training_Data_Set_1_v1.csv'

# Load the file, drop rows whose 'Clause Text' repeats an earlier row, and
# exclude the record with Clause ID 5250.
# NOTE(review): the reason clause 5250 is excluded is not recorded in this
# notebook -- confirm and document it.
df = (
    pd.read_csv(data_input)
    .drop_duplicates(subset='Clause Text')
    .loc[lambda frame: frame['Clause ID'] != 5250]
)

In [None]:
from sklearn.model_selection import train_test_split

# Model input is the 'Clause Text' column; the target is 'Classification'.
X, y = df['Clause Text'], df['Classification']

# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so that re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=12
)

In [None]:
# S3 key prefixes for the two data channels.
train_channel = s3_output_key_prefix + '/train'
test_channel = s3_output_key_prefix + '/test'

# Put each split's text column and label column side by side in one frame.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Write both splits to the working directory without the index column.
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

# Upload the local files; each call's return value is kept for later use.
train_input = sagemaker_session.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=train_channel,
)
test_input = sagemaker_session.upload_data(
    path='test.csv',
    bucket=bucket,
    key_prefix=test_channel,
)

In [None]:
# Show the value returned by upload_data for train.csv; use this as the
# training data input when launching the training job.
train_input