In [2]:
import pandas as pd
import os
import numpy as np
pd.set_option('display.max_colwidth', None)
import boto3
import swifter
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.representation import MaximalMarginalRelevance
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from awswrangler import config, s3
from sagemaker import get_execution_role, Session
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from textblob import TextBlob
from sklearn.preprocessing import normalize
from sagemaker.model import Model
import io
import re
import json

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/dbcordeiro@sefaz.al.gov.br/.config/sagemaker/config.yaml


In [3]:
SESSION: str = boto3.Session(profile_name="dbcordeiro_projects", region_name="us-east-1")

RUNTIME_CLIENT: str = boto3.client('runtime.sagemaker')

SAGEMAKER_SESSION: str = Session(boto_session=SESSION)

role: str = "arn:aws:iam::513734873949:role/service-role/AmazonSageMaker-ExecutionRole-20240209T185677"

image_uri: str = "513734873949.dkr.ecr.us-east-1.amazonaws.com/topic_modelling_reviews_model:latest"
image_uri_inference: str = "513734873949.dkr.ecr.us-east-1.amazonaws.com/topic_modelling_reviews_deploy:latest"

S3_BUCKET: str = "s3://topic-modelling-reviews"
SAVE_DATA_S3_PATH: str = f"{S3_BUCKET}/data/test_model"

In [5]:
df = pd.read_parquet('../data/df_after_lang_detect.parquet')

df = df[df['language'] != 'ERROR']
df['title_text_review'] = df['title_text_review'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))
df['text_length'] = df['title_text_review'].swifter.apply(lambda x: len(x))
df = df[df['text_length'] > 20]
df = df[df['language'] == 'en']
df = df.sample(150000, random_state=42)
df['review_sentiment'] = df['title_text_review'].swifter.apply(lambda x: TextBlob(x).sentiment.polarity)
df = df[df['review_sentiment'] < -.3]
df.drop(columns=['text_length', 'language', 'review_sentiment', 'parent_asin'], inplace=True)


sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(df['title_text_review'].to_list())
embeddings = normalize(embeddings, 'l2', axis=1)

df['embeddings'] = list(embeddings)

Pandas Apply:   0%|          | 0/6763024 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/150000 [00:00<?, ?it/s]

In [15]:
s3.to_parquet(
    df=df,
    path=SAVE_DATA_S3_PATH,
    dataset=True,
    filename_prefix="test_model_reviews_",
    mode="overwrite",
    boto3_session=SESSION
)

{'paths': ['s3://topic-modelling-reviews/data/test_model/test_model_reviews_cbbc577098404d20a4a859886c9974ac.snappy.parquet'],
 'partitions_values': {}}

In [16]:
train_input = TrainingInput(
    "s3://topic-modelling-reviews/data/test_model/",
    content_type="application/x-parquet"
)

In [None]:
estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.c5.2xlarge",
    sagemaker_session=SAGEMAKER_SESSION,
    base_job_name="model"
)

estimator.fit({'training': train_input})

In [15]:
model = Model(
    image_uri=image_uri_inference,
    model_data="s3://sagemaker-us-east-1-513734873949/model-2024-07-10-16-55-21-398/output/model.tar.gz",
    role=role,
    name='inference-model',
    source_dir='../model_deploy_test',
    entry_point='inference.py',
    sagemaker_session=SAGEMAKER_SESSION
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.large",
    endpoint_name="topic-modelling-reviews-endpoint-test"
)

Using already existing model: inference-model


------!

In [9]:
endpoint_name = 'topic-modelling-reviews-endpoint-test'
test = df.iloc[42,1]

# Serialize the input data to a byte array
input_data = io.BytesIO()
np.save(input_data, test)
input_data.seek(0)

# Invoke the endpoint
response = RUNTIME_CLIENT.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='application/x-npy',
    Body=input_data.read()
)

# Parse the response
result = json.loads(response['Body'].read().decode())
print("Predicted Topics:", result)

Predicted Topics: garmin
wrong garmin
nt buy alternate
probably overall annoying
prime viberates sadly
poorly fails needs
probably unfortunately wasted
outdated buy use
nt cloudy tortuously
pocketfinder instead annual


In [14]:
SAGEMAKER_SESSION.delete_endpoint("topic-modelling-reviews-endpoint-test")