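This notebook follows the AWS SageMaker built-in LightGBM example from the [SageMaker Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/lightgbm.html): it trains a LightGBM classifier as a SageMaker training job and then evaluates the trained model on a held-out test set.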

In [97]:
# --- Standard library ---
import tarfile
from datetime import datetime

# --- Third-party ---
import boto3
import joblib
import lightgbm
import pandas as pd
import sagemaker
from sagemaker import (
    get_execution_role,
    hyperparameters,
    image_uris,
    model_uris,
    script_uris,
)
from sagemaker.estimator import Estimator
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)
from sagemaker.utils import name_from_base
from sklearn.metrics import roc_auc_score

# --- SageMaker execution context ---
# IAM role handed to the Estimator further down.
aws_role = get_execution_role()
# Region of the default boto3 session.
aws_region = boto3.Session().region_name
# SageMaker session object.
sess = sagemaker.Session()

In [3]:
# Which built-in model to train and in which scope to look up its artifacts.
# NOTE(review): "*" presumably means "latest available version" - confirm
# against the SageMaker SDK docs.
train_model_id = "lightgbm-classification-model"
train_model_version = "*"
train_scope = "training"

# Instance type used for the training job.
training_instance_type = "ml.m5.xlarge"

In [6]:
# Retrieve the container image URI for the selected model and instance type.
# region and framework are deliberately passed as None, exactly as before;
# presumably the SDK then resolves both on its own - confirm in the SDK docs.
image_lookup_kwargs = {
    "region": None,
    "framework": None,
    "model_id": train_model_id,
    "model_version": train_model_version,
    "image_scope": train_scope,
    "instance_type": training_instance_type,
}
train_image_uri = image_uris.retrieve(**image_lookup_kwargs)

In [7]:
# Both lookups below are keyed on the same model id and version.
model_selector = {
    "model_id": train_model_id,
    "model_version": train_model_version,
}

# URI of the training script bundle (passed to the Estimator as source_dir).
train_source_uri = script_uris.retrieve(script_scope=train_scope, **model_selector)

# URI of the model artifact (passed to the Estimator as model_uri).
train_model_uri = model_uris.retrieve(model_scope=train_scope, **model_selector)

In [9]:
# --- Buckets and prefixes ---
# Training/validation data is read from this bucket and prefix.
training_data_bucket = "starbucks-project-ttg"
training_data_prefix = "data"
# Training results are written to this bucket and prefix.
output_bucket = "starbucks-project-ttg"
output_prefix = "training_results"

# --- Derived S3 URIs ---
training_dataset_s3_path = f"s3://{training_data_bucket}/{training_data_prefix}/train.csv"
validation_dataset_s3_path = f"s3://{training_data_bucket}/{training_data_prefix}/val.csv"
s3_output_location = f"s3://{output_bucket}/{output_prefix}/output"

In [13]:
# Retrieve the default hyperparameters for training the model.
#
# The lookup goes through the fully qualified `sagemaker.hyperparameters`
# module instead of the bare name `hyperparameters`. This assignment rebinds
# the bare name from the imported module to the returned dict, so running the
# cell a second time through the bare name would fail with
# "AttributeError: 'dict' object has no attribute 'retrieve_default'".
# The variable name is kept because later cells read `hyperparameters`.
hyperparameters = sagemaker.hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)

In [14]:
# [Optional] Override selected defaults with custom values.
# The value is given as a string, like every other entry in the printed dict.
hyperparameters.update({"num_boost_round": "500"})
print(hyperparameters)

{'num_boost_round': '500', 'early_stopping_rounds': '30', 'metric': 'auto', 'learning_rate': '0.009', 'num_leaves': '67', 'feature_fraction': '0.74', 'bagging_fraction': '0.53', 'bagging_freq': '5', 'max_depth': '11', 'min_data_in_leaf': '26', 'max_delta_step': '0.0', 'lambda_l1': '0.0', 'lambda_l2': '0.0', 'boosting': 'gbdt', 'min_gain_to_split': '0.0', 'scale_pos_weight': '1.0', 'tree_learner': 'serial', 'feature_fraction_bynode': '1.0', 'is_unbalance': 'False', 'max_bin': '255', 'num_threads': '0', 'verbosity': '1', 'use_dask': 'False'}


In [23]:
# Minute-resolution timestamp (local time) used when naming the training job.
time_stamp = datetime.now().strftime('%Y-%m-%d-%H-%M')

In [24]:
# Build the training job name from a descriptive base string.
# NOTE(review): the job name logged by fit() ends in its own timestamp and
# shows the base cut off after "...classification-m", so name_from_base
# appears to truncate the base and append a timestamp itself - confirm
# whether time_stamp is still needed here.
job_name_base = f"built-in-algo-{train_model_id}-training-{time_stamp}"
training_job_name = name_from_base(job_name_base)

In [25]:
# Create the SageMaker Estimator that describes the training job.
tabular_estimator = Estimator(
    # Identity and container image
    role=aws_role,
    image_uri=train_image_uri,
    # Training code and the model artifact it starts from
    entry_point="transfer_learning.py",
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    # Compute: one instance; for distributed training use instance_count > 1
    instance_type=training_instance_type,
    instance_count=1,
    max_run=360000,  # NOTE(review): if this is in seconds it allows 100 hours - confirm
    # Training settings and where results are written
    hyperparameters=hyperparameters,
    output_path=s3_output_location,
)

In [26]:
# Launch the SageMaker training job.
# Each input channel name maps to the S3 path of its CSV file.
input_channels = {
    "train": training_dataset_s3_path,
    "validation": validation_dataset_s3_path,
}
tabular_estimator.fit(input_channels, logs=True, job_name=training_job_name)

INFO:sagemaker:Creating training-job with name: built-in-algo-lightgbm-classification-m-2023-03-27-12-52-14-989


2023-03-27 12:54:37 Starting - Starting the training job...
2023-03-27 12:54:53 Starting - Preparing the instances for training...
2023-03-27 12:55:37 Downloading - Downloading input data...
2023-03-27 12:55:57 Training - Downloading the training image...
2023-03-27 12:56:32 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-03-27 12:56:38,585 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-03-27 12:56:38,587 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-03-27 12:56:38,597 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-03-27 12:56:38,599 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-03-27 12:56:39,027 sagemaker-training-toolki

# Out-of-Sample Testing

In [32]:
# Display the S3 output location that was passed to the Estimator as output_path.
s3_output_location

's3://starbucks-project-ttg/training_results/output'

In [35]:
! aws s3 cp s3://starbucks-project-ttg/training_results/output/built-in-algo-lightgbm-classification-m-2023-03-27-12-52-14-989/output/model.tar.gz /root/starbucks_offer_response_model/trained_models/model.tar.gz

download: s3://starbucks-project-ttg/training_results/output/built-in-algo-lightgbm-classification-m-2023-03-27-12-52-14-989/output/model.tar.gz to ../trained_models/model.tar.gz


In [36]:
# Local path of the downloaded model archive.
tar_file_path = '/root/starbucks_offer_response_model/trained_models/model.tar.gz'

In [37]:
# Open the downloaded model archive.
# The handle `t` is opened without a `with` block, so it stays open until
# something calls t.close() explicitly.
t = tarfile.open(tar_file_path)

In [40]:
# Unpack the model archive into the trained_models directory.
# Extraction uses its own handle inside `with`, so the file is closed even if
# extraction fails and this cell can be re-run safely.
with tarfile.open(tar_file_path) as model_archive:
    model_archive.extractall('/root/'+'starbucks_offer_response_model/trained_models/')

# Also release the handle opened in the previous cell; it was never closed.
# close() on an already-closed TarFile is a no-op, so re-running is harmless.
t.close()

In [44]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
[0m

In [46]:
# Load the trained model from the trained_models directory (presumably
# unpacked from model.tar.gz by the extraction cell - confirm).
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts from a source you trust.
model_path = '/root/starbucks_offer_response_model/trained_models/model.pkl'
model = joblib.load(model_path)

In [62]:
# Count the columns of the header-less test CSV.
# Only the first row is parsed (nrows=1): the column count is all that is
# needed here, and the full file is read by a later cell anyway.
col_num = pd.read_csv(
    '/root/starbucks_offer_response_model/data/curated_data/test.csv',
    header=None,
    nrows=1,
).shape[1]

In [87]:
# Column names for the header-less test CSV: the label first, followed by one
# generated name per remaining column (feature_0, feature_1, ...).
col_list = ['offer_successful'] + [f'feature_{idx}' for idx in range(col_num - 1)]

In [88]:
# Display the generated column names as a sanity check.
col_list

['offer_successful',
 'feature_0',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23']

In [90]:
# Read the header-less test CSV and label its columns with col_list.
# set_axis raises if the number of names does not match the number of
# columns, just like assigning to .columns does.
test_csv_path = '/root/starbucks_offer_response_model/data/curated_data/test.csv'
test_df = pd.read_csv(test_csv_path, header=None).set_axis(col_list, axis="columns")

In [91]:
# Preview the first rows of the labelled test set.
test_df.head()

Unnamed: 0,offer_successful,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23
0,0,2.0,10.0,7.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0,0,0,1,0,0,1,0,0,1
1,0,0.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,2
2,0,10.0,10.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0,0,0,1,0,1,0,0,0,2
3,1,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1,0,0,0,0,1,0,0,0,2
4,1,5.0,20.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,1,0,0,1,0,0,0,3


In [93]:
# Feature matrix: every column except the label. The label is the first
# column of test_df, so dropping it by name selects the same columns as
# slicing from position 1.
test_x_df = test_df.drop(columns='offer_successful')
test_x_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23
0,2.0,10.0,7.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,1,0,0,1,0,0,1
1,0.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,1,0,0,0,1,0,0,2
2,10.0,10.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,1,0,1,0,0,0,2
3,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,2
4,5.0,20.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0,0,1,0,0,1,0,0,0,3


In [99]:
# Ground-truth labels as a NumPy array, for comparison with the predictions.
y_actual = test_df.loc[:, 'offer_successful'].to_numpy()
y_actual

array([0, 0, 0, ..., 1, 0, 0])

In [94]:
# Score every test row with the loaded model.
# NOTE(review): the displayed result holds floats between 0 and 1, so these
# are presumably positive-class probabilities rather than hard labels - confirm.
y_pred = model.predict(test_x_df)

In [96]:
# Display the model scores for inspection.
y_pred

array([0.42256093, 0.00341378, 0.68275327, ..., 0.6491945 , 0.31746096,
       0.1851326 ])

In [101]:
# Out-of-sample ROC AUC: true labels against the model's continuous scores.
roc_auc_score(y_true=y_actual, y_score=y_pred)

0.8389541353491653