In [1]:
pip install scikit-learn==1.0.2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker import Session
from sagemaker.sklearn.estimator import SKLearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import os

# Open a SageMaker session and collect the S3 bucket, key prefix and IAM
# execution role that the packaging and deployment cells rely on.
prefix = 'taxi-fare-prediction'
session = Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# Load the raw trip records.
trips_raw = pd.read_csv('train.csv')

# Exclusive upper bound on trip_duration; longer trips are treated as outliers.
# NOTE(review): trip_duration appears to be in seconds (it is divided by 3600
# to obtain hours below) -- confirm against the dataset description.
MAX_TRIP_DURATION = 10000

# Drop invalid rows *before* deriving the ratio features: every retained row
# has a strictly positive duration and distance, so the divisions below never
# hit zero and no fallback values are required.
duration = trips_raw['trip_duration']
distance = trips_raw['distance_traveled']
trips_valid = trips_raw[
    (duration > 0) & (duration < MAX_TRIP_DURATION) & (distance > 0)
]

# Derived features, added with `assign` so the filtered frame is not mutated:
#   speed           - distance travelled per hour of trip duration
#   fare_per_mile   - base fare per unit of distance_traveled (column name kept
#                     as-is; the distance unit is not established here)
#   fare_per_minute - base fare per minute of trip duration
df = trips_valid.assign(
    speed=lambda t: t['distance_traveled'] / (t['trip_duration'] / 3600),
    fare_per_mile=lambda t: t['fare'] / t['distance_traveled'],
    fare_per_minute=lambda t: t['fare'] / (t['trip_duration'] / 60),
)

# Make sure no infinite or missing values remain in any column.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Model inputs: the raw trip columns followed by the three engineered ratios.
# The prediction target is the total fare of the trip.
# NOTE(review): 'fare', 'tip' and 'miscellaneous_fees' look like components of
# 'total_fare'; if so they leak the target -- confirm they are available at
# prediction time.
features = [
    'trip_duration',
    'distance_traveled',
    'num_of_passengers',
    'fare',
    'tip',
    'miscellaneous_fees',
    'surge_applied',
    'speed',
    'fare_per_mile',
    'fare_per_minute',
]

X, y = df[features], df['total_fare']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features with statistics learned from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a random forest regressor on the scaled training data (the original
# note motivates the choice by non-linearity in the data).
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)

# Evaluate the fitted model: R² on the training split and on the held-out split.
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

print(f"Training R²: {train_score:.4f}")
print(f"Testing R²: {test_score:.4f}")

# Persist the fitted estimator and the fitted scaler side by side in the
# local model directory (created on first run), using pickle protocol 4.
model_dir = './model'
os.makedirs(model_dir, exist_ok=True)
for artifact, filename in ((model, 'model.joblib'), (scaler, 'scaler.joblib')):
    joblib.dump(artifact, os.path.join(model_dir, filename), protocol=4)

print("Model training completed!")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Training R²: 0.9980
Testing R²: 0.9970
Model training completed!


In [3]:
import tarfile

# Bundle the contents of the model directory into a gzip-compressed tarball;
# `arcname='.'` places the directory's contents at the root of the archive.
archive_path = 'model.tar.gz'
with tarfile.open(archive_path, mode='w:gz') as archive:
    archive.add(model_dir, arcname='.')

print("Model package created: model.tar.gz")

# Upload the archive to the session bucket under '<prefix>/model' and keep the
# URI that upload_data returns for the uploaded object.
s3_key_prefix = f'{prefix}/model'
s3_model_path = session.upload_data(
    path=archive_path,
    bucket=bucket,
    key_prefix=s3_key_prefix,
)

print(f"Model uploaded to S3: {s3_model_path}")

Model package created: model.tar.gz
Model uploaded to S3: s3://sagemaker-us-east-1-801065765644/taxi-fare-prediction/model/model.tar.gz


In [4]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the uploaded artifact in a SageMaker scikit-learn model.
# `s3_model_path` is the URI returned by `session.upload_data`, so the model
# always points at the archive that was actually uploaded instead of a path
# rebuilt by hand from the bucket and prefix.
# NOTE(review): the '1.2-1' container serves with a scikit-learn 1.2.x release;
# the model must have been pickled under a compatible scikit-learn version in
# this kernel -- confirm before deploying.
sklearn_model = SKLearnModel(
    model_data=s3_model_path,
    role=role,
    entry_point='inference.py',
    framework_version='1.2-1',
    py_version='py3'
)

# Deploy a real-time endpoint backed by a single ml.m5.large instance.
# The endpoint keeps running until it is deleted explicitly
# (e.g. `predictor.delete_endpoint()`).
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large'
)

print("Endpoint deployed:", predictor.endpoint_name)

------!Endpoint deployed: sagemaker-scikit-learn-2025-09-10-16-59-16-098


In [4]:
import sklearn
# Show which scikit-learn release is active in this kernel, so it can be
# compared with the version used by the serving container.
active_sklearn_version = sklearn.__version__
print(f"scikit-learn version: {active_sklearn_version}")

scikit-learn version: 1.7.1
