In [1]:
import os
# Export the service-account key file location through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this kernel.
# NOTE(review): this is an absolute path on one developer's machine; anyone else
# running the notebook has to change it (or export the variable beforehand).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/Users/hyejin/Downloads/my-semi-structure-recsys-67625-3fa813796d71.json"



## Dataset
The Google Analytics Sample dataset, which is hosted publicly on BigQuery, is a dataset that provides 12 months (August 2016 to August 2017) of obfuscated Google Analytics 360 data from the Google Merchandise Store, a real e-commerce store that sells Google-branded merchandise.

## Objective
By the end of this notebook, you will know how to:

- pre-process data into the correct format needed to create a recommender system using BigQuery ML
- train (and deploy) the matrix factorization model in BigQuery ML
- evaluate the model
- make predictions using the model
- take action on the predicted recommendations:
    - for activation via Google Ads, Display & Video 360 and Search Ads 360
    - for activation via emails
    - export predictions to a pandas dataframe
    - export predictions into Google Cloud Storage

In [2]:
# Google Cloud project ID passed to pandas_gbq.read_gbq below; the later
# CREATE TABLE statements hard-code the same ID in their table names.
PROJECT_ID = "my-semi-structure-recsys-67625"
# Location used by the (commented-out) `bq mk` dataset-creation command.
REGION = 'US'

In [4]:
# !bq mk --location=$REGION --dataset $PROJECT_ID:ecommerce

In [4]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.61.2-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 1.5 MB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.61.2
You should consider upgrading via the '/usr/local/opt/python@3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
import pandas_gbq

# TODO: Set project_id to your Google Cloud Platform project ID.
# project_id = "my-project"

# Preview query: flattens hits and their products for a single day of the
# public Google Analytics sample (2016-08-01) and returns five rows.
# productPrice is divided by 1,000,000 to obtain productPrice_USD.
query = """
SELECT 
  CONCAT(fullVisitorID,'-',CAST(visitNumber AS STRING)) AS visitorId,
  hitNumber,
  time,
  page.pageTitle,
  type,
  productSKU,
  v2ProductName,
  v2ProductCategory,
  productPrice/1000000 as productPrice_USD

FROM 
  `bigquery-public-data.google_analytics_sample.ga_sessions_20160801`, 
  UNNEST(hits) AS hits,
  UNNEST(hits.product) AS hits_product
LIMIT 5
"""
# Run the query through pandas-gbq, billing it to PROJECT_ID.
df = pandas_gbq.read_gbq(query, project_id=PROJECT_ID)
df.head()

  record_batch = self.to_arrow(


Unnamed: 0,visitorId,hitNumber,time,pageTitle,type,productSKU,v2ProductName,v2ProductCategory,productPrice_USD
0,7194065619159478122-1,2,30507,Gift Cards,PAGE,GGOEGGCX056299,Gift Card - $25.00,Home/Gift Cards/,25.0
1,7194065619159478122-1,2,30507,Gift Cards,PAGE,GGOEGGCX056499,Gift Card - $50.00,Home/Gift Cards/,50.0
2,7194065619159478122-1,2,30507,Gift Cards,PAGE,GGOEGGCX056199,Gift Card- $100.00,Home/Gift Cards/,100.0
3,7194065619159478122-1,2,30507,Gift Cards,PAGE,GGOEGGCX056399,Gift Card - $250.00,Home/Gift Cards/,250.0
4,8159312408158297118-151,2,289206,Men's-T-Shirts,PAGE,GGOEGAAX0104,Google Men's 100% Cotton Short Sleeve Hero Tee...,Home/Apparel/Men's/Men's-T-Shirts/,16.99


In [5]:
from google.cloud import bigquery
# BigQuery client; no explicit project or credentials are passed here.
client = bigquery.Client()
# Two-statement script:
#   1. (Re)create ecommerce.visitor_item_duration from the 2016 GA sessions:
#      - durations: time until the next hit within the same session, per hit
#      - prodview_durations: product rows on hits with action_type "2",
#        joined to their hit duration (1 when there is no following hit)
#      - aggregate_web_stats: durations summed per visitorId/itemId/timestamp,
#        with a constant type of 'duration'
#   2. Select every row of the new table.
query = """
        CREATE OR REPLACE TABLE `my-semi-structure-recsys-67625.ecommerce.visitor_item_duration` AS (
  WITH
    durations AS (
      --calculate pageview durations
      SELECT
        CONCAT(fullVisitorID,'-', 
             CAST(visitNumber AS STRING),'-', 
             CAST(hitNumber AS STRING) ) AS visitorId_session_hit,
        LEAD(time, 1) OVER (
          PARTITION BY CONCAT(fullVisitorID,'-',CAST(visitNumber AS STRING))
          ORDER BY
          time ASC ) - time AS pageview_duration
      FROM
        `bigquery-public-data.google_analytics_sample.ga_sessions_2016*`,
        UNNEST(hits) AS hit 
    ),
      
    prodview_durations AS (
      --filter for product detail pages only
      SELECT
        CONCAT(fullVisitorID,'-',CAST(visitNumber AS STRING)) AS visitorId,
          SAFE_CAST(visitStartTime+hits.time/1000 AS INT64) as timestamp,
        productSKU AS itemId,
        IFNULL(dur.pageview_duration,
          1) AS pageview_duration,
      FROM
        `bigquery-public-data.google_analytics_sample.ga_sessions_2016*` t,
        UNNEST(hits) AS hits,
        UNNEST(hits.product) AS hits_product
      JOIN
        durations dur
      ON
        CONCAT(fullVisitorID,'-',
               CAST(visitNumber AS STRING),'-',
               CAST(hitNumber AS STRING)) = dur.visitorId_session_hit
      WHERE
      #action_type: Product detail views = 2
      eCommerceAction.action_type = "2" 
    ),
    
    aggregate_web_stats AS(
      --sum pageview durations by visitorId, itemId, timestamp
      SELECT
        visitorId, 
        itemId,
        timestamp, 
        'duration' as type,
        SUM(pageview_duration) AS session_duration
      FROM
        prodview_durations
      GROUP BY
        visitorId,
        itemId, 
        timestamp,
        type )
    SELECT
      *
    FROM
      aggregate_web_stats
);
SELECT
  *
FROM
  `my-semi-structure-recsys-67625.ecommerce.visitor_item_duration`
"""
# Submit the script and pull the resulting rows into pandas.
query_job = client.query(query)
interaction_df = query_job.result().to_dataframe()

interaction_df.head()

Unnamed: 0,visitorId,itemId,timestamp,type,session_duration
0,5953034565631333907-1,GGOEGAAX0031,1479714147,duration,1307
1,8104799165934477164-1,GGOEGAAX0031,1481227520,duration,14737
2,7131184833921807733-1,GGOEGAAX0031,1475586115,duration,2970
3,7712146195017051213-3,GGOEGAAX0031,1480454420,duration,3477
4,8173388962245475850-1,GGOEGAAX0031,1479683834,duration,11075


In [29]:
interaction_df.shape

(137583, 5)

In [30]:
# Two-statement script: (re)create ecommerce.visitor_info with one row of
# browser / OS / device category / country per session row of the 2016 GA data
# (visitorId = fullVisitorID-visitNumber), then select every row of it.
query = """
      CREATE OR REPLACE TABLE `my-semi-structure-recsys-67625.ecommerce.visitor_info` AS (
        SELECT
        CONCAT(fullVisitorID,'-',CAST(visitNumber AS STRING)) AS visitorId,
        device.browser as browser,
        device.operatingSystem as os,
        device.deviceCategory as device,
        geoNetwork.country as country
      FROM
        `bigquery-public-data.google_analytics_sample.ga_sessions_2016*` t
);
SELECT
  *
FROM
  `my-semi-structure-recsys-67625.ecommerce.visitor_info`
"""
# Reuses the `client` created in the earlier BigQuery cell.
query_job = client.query(query)
user_df = query_job.result().to_dataframe()

user_df.head()

Unnamed: 0,visitorId,browser,os,device,country
0,2963183455498627045-2,Opera Mini,Android,mobile,Chad
1,2963183455498627045-3,Opera Mini,Android,mobile,Chad
2,2963183455498627045-1,Opera Mini,Android,mobile,Chad
3,7774328397116233788-1,Opera Mini,Android,mobile,Chad
4,9899657684683608581-1,Opera Mini,Samsung,mobile,Chad


In [31]:
# Local output folder (relative to the notebook) and file name for the raw
# visitor attributes.
data_dir = '../../../data/ecommerce'
users_filename = 'visitor.csv'
# Write without the pandas index so the CSV only holds the table columns.
user_df.to_csv("/".join((data_dir, users_filename)), index=False)

In [32]:
# Raw interaction rows, saved before the columns are renamed for Personalize.
interactions_filename = 'duration.csv'
interaction_df.to_csv("/".join((data_dir, interactions_filename)), index=False)

In [18]:
# !pip install google-cloud-bigquery

In [17]:
from google.cloud import bigquery
import pandas as pd, numpy as np
import io
import scipy.sparse as ss
import json
import time
import datetime
import os
# import sagemaker.amazon.common as smac
import boto3
import uuid
from botocore.exceptions import ClientError
from botocore.config import Config




In [2]:
# S3 bucket that receives the CSV files uploaded later in the notebook.
bucket_name = 'start-personalize-hj'
# botocore settings shared by the S3 resource and the Personalize clients:
# region us-east-2 and a connection pool of up to 50 connections.
my_config = Config(
        region_name='us-east-2',
        max_pool_connections=50
    )

In [33]:
# Rename the BigQuery column names to the upper-case field names declared in
# the Interactions schema further down. The frame is modified in place.
interaction_df.rename(columns = {'visitorId':'USER_ID', 'itemId':'ITEM_ID',
                              'timestamp':'TIMESTAMP', 'type': 'EVENT_TYPE', 'session_duration': 'EVENT_VALUE'}, inplace = True) 
interaction_df.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE,EVENT_VALUE
0,206446769049487-1,GGOEGAAX0574,1482280340,duration,5533
1,218703529647171-1,GGOEGFKQ020399,1471092990,duration,2545
2,218703529647171-1,GGOEGFKA022299,1471092990,duration,2347
3,348832906142419-1,GGOEGBRB073899,1476224469,duration,6795
4,348832906142419-1,GGOEGAAX0606,1476224469,duration,8805


In [34]:
# Save the renamed interactions; this is the file uploaded to S3 and imported
# into the interactions dataset.
data_dir = '../../../data/ecommerce'
interactions_filename = 'interaction.csv'
interaction_df.to_csv("/".join((data_dir, interactions_filename)), index=False)

In [35]:
# Rename the visitor columns to the upper-case field names declared in the
# Users schema further down. The frame is modified in place.
user_df.rename(columns = {'visitorId':'USER_ID', 'browser':'BROWSER',
                              'device':'DEVICE', 'os': 'OS', 'country':'COUNTRY'}, inplace = True) 
user_df.head()

Unnamed: 0,USER_ID,BROWSER,OS,DEVICE,COUNTRY
0,2963183455498627045-2,Opera Mini,Android,mobile,Chad
1,2963183455498627045-3,Opera Mini,Android,mobile,Chad
2,2963183455498627045-1,Opera Mini,Android,mobile,Chad
3,7774328397116233788-1,Opera Mini,Android,mobile,Chad
4,9899657684683608581-1,Opera Mini,Samsung,mobile,Chad


In [36]:
# Save the renamed user metadata; this is the file uploaded to S3 and imported
# into the users dataset.
data_dir = '../../../data/ecommerce'
users_filename = 'user.csv'
user_df.to_csv("/".join((data_dir, users_filename)), index=False)

In [37]:
# NOTE(review): `aws_access_key_id` and `aws_secret_access_key` are not defined
# in any cell visible here, so this cell depends on kernel state from elsewhere.
# Keep the key values out of the notebook itself when supplying them.
s3 = boto3.client('s3',aws_access_key_id=aws_access_key_id
    ,aws_secret_access_key=aws_secret_access_key)
# Account number of the caller, looked up through STS.
account_id = boto3.client('sts',aws_access_key_id=aws_access_key_id
    ,aws_secret_access_key=aws_secret_access_key).get_caller_identity().get('Account')
# Fixed date string appended to the resource names created below.
suffix = str('2021-07-27')


In [11]:
account_id

'104151821460'

In [12]:
suffix

'2021-07-27'

In [13]:
bucket_name = 'start-personalize-hj'
bucket_dir = 'ecommerce'

In [38]:
# Upload the local users CSV to s3://<bucket_name>/<bucket_dir>/<users_filename>.
boto3.Session().resource(
    's3'    
    , config=my_config
    , aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key
).Bucket(bucket_name).Object((bucket_dir +"/"+users_filename)).upload_file((data_dir +"/"+users_filename))

In [39]:
# Upload the local interactions CSV to
# s3://<bucket_name>/<bucket_dir>/<interactions_filename>.
boto3.Session().resource(
    's3'    
    , config=my_config
    , aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key
).Bucket(bucket_name).Object((bucket_dir +"/"+interactions_filename)).upload_file((data_dir +"/"+interactions_filename))

In [9]:
# Three clients, all built with the shared botocore config and explicit keys.

# Control plane: dataset groups, schemas, datasets, solutions, campaigns.
personalize = boto3.client(
    service_name='personalize'
    , config=my_config
    , aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key    
)

# Runtime: used below for get_recommendations against a campaign.
personalize_runtime = boto3.client(
    service_name='personalize-runtime'
    , config=my_config
    , aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key    
)

# Events: used below for put_events against the event tracker.
personalize_events = boto3.client(
    service_name='personalize-events'
    , config=my_config
    , aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key    
)

In [16]:
# Create the dataset group that the datasets and solution below are attached to.
dataset_group_name = "ecommerce-dataset-group-" + suffix

create_dataset_group_response = personalize.create_dataset_group(
    name = dataset_group_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

# NOTE(review): the ARN from the response is immediately overwritten with this
# hard-coded value, which is tied to one account, region and suffix.
dataset_group_arn = 'arn:aws:personalize:us-east-2:104151821460:dataset-group/ecommerce-dataset-group-2021-07-27'

In [52]:
# Poll the dataset group every 20 seconds until it reports a terminal status
# or the three-hour budget is used up.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(datasetGroupArn=dataset_group_arn)
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    # Either terminal state ends the wait; anything else means keep polling.
    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(20)

DatasetGroup: ACTIVE


In [17]:
schema_name="ecommerce-interaction-schema-"+suffix

In [43]:
# Avro-style record describing the interactions CSV; the field names match the
# columns produced by the interaction_df rename above.
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "ITEM_ID",
            "type": "string"
        },
        {
            "name": "TIMESTAMP",
            "type": "long"
        },
        {
          "name": "EVENT_TYPE",
          "type": "string"
        },
        {
          "name": "EVENT_VALUE",
          "type": "float"
        }
    ],
    "version": "1.0"
}

# Register the schema; the API takes the definition as a JSON string.
create_schema_response = personalize.create_schema(
    name = schema_name,
    schema = json.dumps(schema)
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))
# NOTE(review): overrides the ARN from the response with a hard-coded value.
schema_arn = 'arn:aws:personalize:us-east-2:104151821460:schema/ecommerce-interaction-schema-2021-07-27' 

In [44]:
# Create the INTERACTIONS dataset inside the dataset group, bound to the
# interactions schema registered above.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = schema_arn,
    name = "ecommerce-dataset-interactions-" + suffix
)

# Kept for the interactions import job further down.
interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-2:104151821460:dataset/ecommerce-dataset-group-2021-07-27/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "ebaf7c78-3021-4dd4-9ba5-7a3b09dcdf9c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 27 Jul 2021 08:56:33 GMT",
      "x-amzn-requestid": "ebaf7c78-3021-4dd4-9ba5-7a3b09dcdf9c",
      "content-length": "115",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [45]:
metadata_schema_name="ecommerce-users-schema-"+suffix

In [47]:
# Avro-style record describing the users CSV; every attribute except USER_ID
# is flagged as categorical.
metadata_schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "COUNTRY",
            "type": "string",
            "categorical": True
        },
        {
            "name": "OS",
            "type": "string",
            "categorical": True
        },
        {
            "name": "DEVICE",
            "type": "string",
            "categorical": True
        },
        {
            "name": "BROWSER",
            "type": "string",
            "categorical": True
        }
    ],
    "version": "1.0"
}

# Register the schema; the API takes the definition as a JSON string.
create_metadata_schema_response = personalize.create_schema(
    name = metadata_schema_name,
    schema = json.dumps(metadata_schema)
)

metadata_schema_arn = create_metadata_schema_response['schemaArn']
print(json.dumps(create_metadata_schema_response, indent=2))

# NOTE(review): unlike the interactions cell, this assigns to `schemaArn`, a name
# no visible cell reads; `metadata_schema_arn` above is what the dataset uses.
schemaArn = "arn:aws:personalize:us-east-2:104151821460:schema/ecommerce-users-schema-2021-07-27"

In [48]:
# Create the USERS dataset inside the dataset group, bound to the users schema.
# `dataset_type` is rebound here from its earlier "INTERACTIONS" value.
dataset_type = "USERS"
create_metadata_dataset_response = personalize.create_dataset(
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = metadata_schema_arn,
    name = "ecommerce-metadata-dataset-users-" + suffix
)

# Kept for the users import job further down.
metadata_dataset_arn = create_metadata_dataset_response['datasetArn']
print(json.dumps(create_metadata_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-2:104151821460:dataset/ecommerce-dataset-group-2021-07-27/USERS",
  "ResponseMetadata": {
    "RequestId": "0693cdac-cfab-49f0-90b6-3fe260a19262",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 27 Jul 2021 09:01:06 GMT",
      "x-amzn-requestid": "0693cdac-cfab-49f0-90b6-3fe260a19262",
      "content-length": "108",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [49]:
# Rebinds `s3` to a fresh client built with the explicit access keys.
s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id
    , aws_secret_access_key=aws_secret_access_key    
)

# Bucket policy granting the personalize.amazonaws.com service principal
# s3:GetObject and s3:ListBucket on the bucket and on every object in it.
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket_name),
                "arn:aws:s3:::{}/*".format(bucket_name)
            ]
        }
    ]
}

# The trailing semicolon keeps the notebook from echoing the API response.
s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy));

In [52]:
# IAM client built with the same explicit access keys as the other AWS clients.
iam = boto3.client(
    "iam",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

role_name = "PersonalizeS3Role-" + suffix
# Trust policy allowing the Personalize service principal to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "personalize.amazonaws.com"},
            "Action": "sts:AssumeRole",
        }
    ],
}

try:
    create_role_response = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
    )
    # Give the new role read-only access to S3 via the AWS managed policy.
    iam.attach_role_policy(
        RoleName=role_name,
        PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
    )
    role_arn = create_role_response["Role"]["Arn"]
except ClientError as e:
    # Only "role already exists" is recoverable; everything else propagates.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        raise
    role_arn = iam.get_role(RoleName=role_name)['Role']['Arn']

# sometimes need to wait a bit for the role to be created
time.sleep(45)
print(role_arn)

In [51]:
role_arn = 'arn:aws:iam::104151821460:role/PersonalizeS3Role-93934'

In [53]:
# Start importing the interactions CSV from S3 into the interactions dataset,
# using `role_arn` for access. The "ecommerce" key prefix is hard-coded here
# rather than taken from `bucket_dir`.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "ecommerce-dataset-import-job-"+suffix,
    datasetArn = interactions_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/ecommerce/{}".format(bucket_name, interactions_filename)
    },
    roleArn = role_arn
)

# Kept so the polling cell below can track this job.
dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-2:104151821460:dataset-import-job/ecommerce-dataset-import-job-2021-07-27",
  "ResponseMetadata": {
    "RequestId": "83054796-8cee-428d-b3a8-fedb20f33a67",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 27 Jul 2021 09:03:24 GMT",
      "x-amzn-requestid": "83054796-8cee-428d-b3a8-fedb20f33a67",
      "content-length": "127",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [54]:
# Start importing the users CSV from S3 into the users dataset.
# `user_metadata_file` repeats the 'user.csv' name used when the file was saved.
user_metadata_file = 'user.csv'
create_metadata_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "ecommerce-users-metadata-dataset-import-job-"+suffix,
    datasetArn = metadata_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/ecommerce/{}".format(bucket_name, user_metadata_file)
    },
    roleArn = role_arn
)

# Kept so the polling cell below can track this job.
metadata_dataset_import_job_arn = create_metadata_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_metadata_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-2:104151821460:dataset-import-job/ecommerce-users-metadata-dataset-import-job-2021-07-27",
  "ResponseMetadata": {
    "RequestId": "f7e895db-ff42-4b12-bc61-e8e4dcfdf06a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 27 Jul 2021 09:05:49 GMT",
      "x-amzn-requestid": "f7e895db-ff42-4b12-bc61-e8e4dcfdf06a",
      "content-length": "142",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [55]:
# Poll the interactions import job once a minute until it reports a terminal
# status or the three-hour budget is used up.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(datasetImportJobArn=dataset_import_job_arn)
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]

    # Prefer the status of the latest run when the response carries one.
    if "latestDatasetImportJobRun" in dataset_import_job:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    else:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

DatasetImportJob: ACTIVE


In [56]:
# Poll the users-metadata import job once a minute until it reports a terminal
# status or the three-hour budget is used up.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(datasetImportJobArn=metadata_dataset_import_job_arn)
    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]

    # Prefer the status of the latest run when the response carries one.
    if "latestDatasetImportJobRun" in dataset_import_job:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))
    else:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


In [57]:
# Print the ARN of every recipe returned by list_recipes so one can be chosen
# for the solution below.
recipe_list = personalize.list_recipes()
for recipe in recipe_list['recipes']:
    print(recipe['recipeArn'])

arn:aws:personalize:::recipe/aws-hrnn
arn:aws:personalize:::recipe/aws-hrnn-coldstart
arn:aws:personalize:::recipe/aws-hrnn-metadata
arn:aws:personalize:::recipe/aws-personalized-ranking
arn:aws:personalize:::recipe/aws-popularity-count
arn:aws:personalize:::recipe/aws-sims
arn:aws:personalize:::recipe/aws-user-personalization


In [47]:
# recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn-metadata"
recipe_arn = "arn:aws:personalize:::recipe/aws-user-personalization"

In [59]:
dataset_group_arn


'arn:aws:personalize:us-east-2:104151821460:dataset-group/ecommerce-dataset-group-2021-07-27'

In [48]:
# Create a solution for the dataset group with the recipe in `recipe_arn`,
# with hyperparameter optimisation (performHPO) switched on.
create_solution_response = personalize.create_solution(
    name = "ecommerce-user-personalization-solution-HPO",
    datasetGroupArn = dataset_group_arn,
    recipeArn = recipe_arn,
    performHPO=True
)

# Kept for create_solution_version in the next cell.
solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

{
  "solutionArn": "arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-user-personalization-solution-HPO",
  "ResponseMetadata": {
    "RequestId": "f740bcff-56e7-41a8-bd45-9291f998db56",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 28 Jul 2021 04:31:15 GMT",
      "x-amzn-requestid": "f740bcff-56e7-41a8-bd45-9291f998db56",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [49]:
# create_solution_response = personalize.create_solution(
#     name = "ecommerce-hrnn-metadata-solution-HPO",
#     datasetGroupArn = dataset_group_arn,
#     recipeArn = recipe_arn,
#     performHPO=True
# )

# solution_arn = create_solution_response['solutionArn']
# print(json.dumps(create_solution_response, indent=2))

In [50]:
# Start a new version of the solution; the polling cell below waits for it.
create_solution_version_response = personalize.create_solution_version(
    solutionArn = solution_arn
)

# Kept for the status polling, the metrics call and the campaigns.
solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-user-personalization-solution-HPO/7ffc103f",
  "ResponseMetadata": {
    "RequestId": "42618d78-8eda-4faa-ad7d-5f78c007ec34",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 28 Jul 2021 04:32:44 GMT",
      "x-amzn-requestid": "42618d78-8eda-4faa-ad7d-5f78c007ec34",
      "content-length": "129",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [51]:
# Poll the solution version once a minute until it reports a terminal status
# or the three-hour budget is used up.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(solutionVersionArn=solution_version_arn)
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    # Either terminal state ends the wait; anything else means keep polling.
    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_P

In [52]:
# Fetch the offline evaluation metrics of the solution version and print the
# whole response as indented JSON.
get_solution_metrics_response = personalize.get_solution_metrics(
    solutionVersionArn = solution_version_arn
)

print(json.dumps(get_solution_metrics_response, indent=2))


{
  "solutionVersionArn": "arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-user-personalization-solution-HPO/7ffc103f",
  "metrics": {
    "coverage": 0.872,
    "mean_reciprocal_rank_at_25": 0.2337,
    "normalized_discounted_cumulative_gain_at_10": 0.3144,
    "normalized_discounted_cumulative_gain_at_25": 0.3569,
    "normalized_discounted_cumulative_gain_at_5": 0.2783,
    "precision_at_10": 0.0442,
    "precision_at_25": 0.0245,
    "precision_at_5": 0.0671
  },
  "ResponseMetadata": {
    "RequestId": "79640f62-ebf6-442f-b7aa-7ffe24b36468",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 28 Jul 2021 05:29:29 GMT",
      "x-amzn-requestid": "79640f62-ebf6-442f-b7aa-7ffe24b36468",
      "content-length": "426",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [64]:
# Deploy whatever `solution_version_arn` currently holds as the campaign
# "ecommerce-campaign", requesting a minimum of 2 provisioned TPS.
create_campaign_response = personalize.create_campaign(
    name = "ecommerce-campaign",
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 2,    
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

{
  "campaignArn": "arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-campaign",
  "ResponseMetadata": {
    "RequestId": "52d6376e-5792-43ce-b823-ddf1b55e37fa",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 27 Jul 2021 11:02:41 GMT",
      "x-amzn-requestid": "52d6376e-5792-43ce-b823-ddf1b55e37fa",
      "content-length": "88",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [56]:
solution_version_arn

'arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-user-personalization-solution-HPO/7ffc103f'

In [58]:
# Deploy whatever `solution_version_arn` currently holds as the campaign
# "ecommerce-user-personalization-campaign" (minimum 2 provisioned TPS).
# This rebinds `create_campaign_response` and `campaign_arn`.
create_campaign_response = personalize.create_campaign(
    name = "ecommerce-user-personalization-campaign",
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 2,    
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))
campaign_arn

'arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-user-personalization-campaign'

In [61]:
campaign_arn = "arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-campaign"

In [62]:
solution_version_arn = "arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-hrnn-metadata-solution-HPO/a8d933bd"

In [59]:
# Update DF rendering
pd.set_option('display.max_rows', 30)

def get_new_recommendations_df_users(recommendations_df, user_id):
    """Append one user's campaign recommendations to a running DataFrame.

    Calls ``personalize_runtime.get_recommendations`` on the module-level
    ``campaign_arn`` for ``user_id`` and stacks the returned item IDs under
    ``recommendations_df``.

    Args:
        recommendations_df: DataFrame accumulated so far; pass an empty
            ``pd.DataFrame()`` on the first call. It is not modified.
        user_id: User to fetch recommendations for; converted with ``str``.

    Returns:
        A new DataFrame with a single ``itemId`` column holding the previous
        rows followed by this user's recommended item IDs. Row labels are kept
        as produced by ``pd.concat`` (each user's rows start again at 0).
    """
    # Context Recommendations
    get_recommendations_response = personalize_runtime.get_recommendations(
        campaignArn=campaign_arn,
        userId=str(user_id),
    )
    recommendation_list = [
        item['itemId'] for item in get_recommendations_response['itemList']
    ]

    # Name the column before concatenating. Renaming column 0 only afterwards
    # left a second, NaN-padded "itemId" column as soon as the accumulator
    # already held rows from a previous user.
    new_rec_DF = pd.DataFrame({'itemId': recommendation_list})

    # Add this dataframe to the old one
    return pd.concat([recommendations_df, new_rec_DF])

In [21]:
# Reload the user metadata written earlier so users can be sampled from it.
data_dir = '../../../data/ecommerce'
users_filename = 'user.csv'
users_df = pd.read_csv("/".join((data_dir, users_filename)))

In [69]:
# Poll the user-personalization campaign once a minute until it reports a
# terminal status or the three-hour budget is used up.
campaign_arn = "arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-user-personalization-campaign"
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(campaignArn=campaign_arn)
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    # Either terminal state ends the wait; anything else means keep polling.
    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

Campaign: ACTIVE


In [70]:
## User-personalization recipe
# Point get_new_recommendations_df_users at the user-personalization campaign.
campaign_arn = "arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-user-personalization-campaign"
recommendations_df_users = pd.DataFrame()
# sample() with no arguments draws one random row; no seed is set, so the
# chosen user differs between runs.
users = users_df.sample()
print(users)
# `users` is rebound from a DataFrame to a plain list of USER_ID values.
users= users['USER_ID'].tolist()
for user in users:
    recommendations_df_users = get_new_recommendations_df_users(recommendations_df_users, user)

recommendations_df_users

                      USER_ID BROWSER       OS  DEVICE  COUNTRY
152116  7198114500600198235-1  Chrome  Android  mobile  Ireland


Unnamed: 0,itemId
0,GGOEGAAX0104
1,GGOEGAAX0105
2,GGOEGAAX0358
3,GGOEGAAX0318
4,GGOEGBRA037499
5,GGOEGBRJ037299
6,GGOEGFKQ020399
7,GGOEGAAX0331
8,GGOEGAAX0106
9,GGOEGAAX0279


In [71]:
# Show the "campaign" section of describe_campaign for the current campaign_arn.
describe_campaign_response = personalize.describe_campaign(campaignArn = campaign_arn)
campaign_summary = describe_campaign_response["campaign"]
campaign_summary

{'name': 'ecommerce-user-personalization-campaign',
 'campaignArn': 'arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-user-personalization-campaign',
 'solutionVersionArn': 'arn:aws:personalize:us-east-2:104151821460:solution/ecommerce-user-personalization-solution-HPO/7ffc103f',
 'minProvisionedTPS': 2,
 'campaignConfig': {},
 'status': 'ACTIVE',
 'creationDateTime': datetime.datetime(2021, 7, 28, 14, 29, 51, 345000, tzinfo=tzlocal()),
 'lastUpdatedDateTime': datetime.datetime(2021, 7, 28, 14, 40, 42, 740000, tzinfo=tzlocal())}

In [72]:
# Direct runtime call for one hard-coded user; the full response is kept in
# `rec_response` so the next cell can inspect its itemList.
rec_response = personalize_runtime.get_recommendations(campaignArn = campaign_arn, userId = '7198114500600198235-1')
print(rec_response['recommendationId'])

RID-3fe27f59-b7a2-4a22-884a-3c29d28db351


In [73]:
rec_response['itemList']

[{'itemId': 'GGOEGAAX0104', 'score': 0.0606777},
 {'itemId': 'GGOEGAAX0105', 'score': 0.0387958},
 {'itemId': 'GGOEGAAX0358', 'score': 0.035235},
 {'itemId': 'GGOEGAAX0318', 'score': 0.0332038},
 {'itemId': 'GGOEGBRA037499', 'score': 0.0308894},
 {'itemId': 'GGOEGBRJ037299', 'score': 0.0281512},
 {'itemId': 'GGOEGFKQ020399', 'score': 0.0206251},
 {'itemId': 'GGOEGAAX0331', 'score': 0.0201175},
 {'itemId': 'GGOEGAAX0106', 'score': 0.0173928},
 {'itemId': 'GGOEGAAX0279', 'score': 0.0164255},
 {'itemId': 'GGOEGBRJ037399', 'score': 0.0139645},
 {'itemId': 'GGOEGFKA022299', 'score': 0.0131914},
 {'itemId': 'GGOEGBRB013899', 'score': 0.0127659},
 {'itemId': 'GGOEGAAX0317', 'score': 0.012709},
 {'itemId': 'GGOEGAAX0359', 'score': 0.0114401},
 {'itemId': 'GGOEGAAX0362', 'score': 0.0112911},
 {'itemId': 'GGOEGAAX0325', 'score': 0.0111855},
 {'itemId': 'GGOEGAAX0335', 'score': 0.0106974},
 {'itemId': 'GGOEGEVR014999', 'score': 0.0102837},
 {'itemId': 'GGOEGEVB070599', 'score': 0.010272},
 {'item

In [66]:
## HRNN-Metadata Recipe
# Point get_new_recommendations_df_users at the "ecommerce-campaign" campaign.
campaign_arn = "arn:aws:personalize:us-east-2:104151821460:campaign/ecommerce-campaign"
recommendations_df_users = pd.DataFrame()
# sample() with no arguments draws one random row; no seed is set, so the
# chosen user differs between runs.
users = users_df.sample()
print(users)
# `users` is rebound from a DataFrame to a plain list of USER_ID values.
users= users['USER_ID'].tolist()
for user in users:
    recommendations_df_users = get_new_recommendations_df_users(recommendations_df_users, user)

recommendations_df_users

                      USER_ID BROWSER       OS  DEVICE        COUNTRY
310980  7829834644356171543-1  Chrome  Android  tablet  United States


Unnamed: 0,itemId
0,GGOEGAAX0104
1,GGOEGBRJ037299
2,GGOEGAAX0105
3,GGOEGAAX0318
4,GGOEGAAX0358
5,GGOEGFKQ020399
6,GGOEGFKA022299
7,GGOEGAAX0037
8,GGOEGAAX0331
9,GGOEGAAX0279


In [74]:
dataset_group_arn = 'arn:aws:personalize:us-east-2:104151821460:dataset-group/ecommerce-dataset-group-2021-07-27'

In [76]:
# Look up the dataset group's event tracker and create one only when none
# exists yet. The create call used to be commented out while the assignments
# below it still read `response`, which raises NameError on a fresh kernel.
existing_event_trackers = personalize.list_event_trackers(
    datasetGroupArn=dataset_group_arn
)['eventTrackers']

if existing_event_trackers:
    # Reuse the existing tracker; its tracking ID is only available from
    # describe_event_tracker, not from the list summary.
    event_tracker_arn = existing_event_trackers[0]['eventTrackerArn']
    TRACKING_ID = personalize.describe_event_tracker(
        eventTrackerArn=event_tracker_arn
    )['eventTracker']['trackingId']
else:
    response = personalize.create_event_tracker(
        name='ecommerce-EventsTracker',
        datasetGroupArn=dataset_group_arn
    )
    TRACKING_ID = response['trackingId']
    event_tracker_arn = response['eventTrackerArn']

In [77]:
TRACKING_ID

'31c3b75f-a4de-45d8-8666-875064dd2385'

In [78]:
event_tracker_arn

'arn:aws:personalize:us-east-2:104151821460:event-tracker/3beccd82'

In [84]:
# Send one hand-written 'duration' event (value 1000) for a fixed user, item
# and session ID through the event tracker identified by TRACKING_ID.
# sentAt is the current time in whole seconds.
personalize_events.put_events(
     trackingId = TRACKING_ID,
     userId= '7198114500600198235-1',
     sessionId = '1',
     eventList = [{
     'sentAt': int(time.time()),
     'eventType' : 'duration',
     'itemId' : 'GGOEGGOA017399',
     "eventValue": 1000,
     }]
    )

{'ResponseMetadata': {'RequestId': 'dceb2233-c939-4f68-aeb6-ce816b096787',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/json',
   'date': 'Wed, 28 Jul 2021 05:53:30 GMT',
   'x-amzn-requestid': 'dceb2233-c939-4f68-aeb6-ce816b096787',
   'content-length': '0',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

In [86]:
# Fetch recommendations again after the event above. `users` is not set in this
# cell, so it is whatever list the last executed sampling cell left behind.
recommendations_df_users = pd.DataFrame()
for user in users:
    recommendations_df_users = get_new_recommendations_df_users(recommendations_df_users, user)

recommendations_df_users

Unnamed: 0,itemId
0,GGOEGOAQ020099
1,GGOEGAAX0074
2,GGOEGOAR013099
3,GGOEGOAQ018099
4,GGOEGOAA017199
5,GGOEGFAQ016699
6,GGOEGOAR021899
7,GGOEGOAQ012899
8,GGOEGOAB021699
9,GGOEGODR017799


In [34]:
# Maps str(USER_ID) -> session ID so that repeated events from the same user
# are sent under one session. Re-running this cell resets the mapping.
session_dict = {}

def send_user_rating(USER_ID, ITEM_ID):
    """Send one simulated interaction event to the Personalize event tracker.

    The event has type 'duration' and carries the item ID and a fixed
    ``eventValue`` of 10 as a JSON string in its ``properties`` field. It is
    sent with the module-level ``personalize_events`` client and
    ``TRACKING_ID``.

    Args:
        USER_ID: User the event belongs to; converted with ``str``.
        ITEM_ID: Item the user interacted with; converted with ``str``.

    Returns:
        None. The first call for a user stores a new UUID-based session ID in
        ``session_dict``; later calls for that user reuse it.
    """
    # Configure Session
    user_key = str(USER_ID)
    # Only a missing entry should create a new session ID. The previous bare
    # ``except:`` also swallowed unrelated errors, KeyboardInterrupt included.
    if user_key not in session_dict:
        session_dict[user_key] = str(uuid.uuid1())
    session_ID = session_dict[user_key]

    # Configure Properties:
    event = {
        "itemId": str(ITEM_ID),
        "eventValue": 10,
    }
    event_json = json.dumps(event)

    # Make Call
    personalize_events.put_events(
        trackingId = TRACKING_ID,
        userId= user_key,
        sessionId = session_ID,
        eventList = [{
            'sentAt': int(time.time()),
            'eventType': 'duration',
            'properties': event_json
            }]
    )


In [35]:
# Inspect the per-user session cache; only send_user_rating adds entries to it.
session_dict

{}

In [36]:
# Example uuid1 value -- the same generator send_user_rating uses for session IDs.
uuid.uuid1()

UUID('3a75471c-ef47-11eb-82bc-8c85905c605b')

In [38]:
# Draw one random row from the users table and show it.
users = users_df.sample()
print(users)

# Keep just the USER_ID values; the first one is reused by later cells.
users = users['USER_ID'].tolist()
user_id = users[0]

# Build the recommendations frame for the sampled user(s) from scratch.
recommendations_df_users = pd.DataFrame()
for user in users:
    recommendations_df_users = get_new_recommendations_df_users(
        recommendations_df_users, user
    )

recommendations_df_users

                      USER_ID BROWSER       OS  DEVICE        COUNTRY
293406  3179913491640929255-1  Chrome  Android  mobile  United States


Unnamed: 0,itemId
0,GGOEGAAX0104
1,GGOEGBRJ037299
2,GGOEGAAX0358
3,GGOEGAAX0105
4,GGOEGAAX0318
5,GGOEGAAX0331
6,GGOEGAAX0279
7,GGOEGAAX0362
8,GGOEGFKQ020399
9,GGOEGBRA037499


In [40]:
# Read the interactions CSV that lives directly under `data_dir`.
interactions_filename = 'interaction.csv'
interactions_df = pd.read_csv(f"{data_dir}/{interactions_filename}")

In [41]:
# Pick the ITEM_ID of 3 randomly sampled interaction rows. Sampling is over
# interactions, so frequently-seen items are more likely and repeats are possible.
items = interactions_df['ITEM_ID'].sample(3).tolist()

In [45]:
def get_new_recommendations_df_users_real_time(recommendations_df, user_id, item_id):
    """
    Record an interaction, then append the user's recommendations as a new column.

    Args:
        recommendations_df: Frame collected so far; it is not modified.
        user_id: User to record the event for and to fetch recommendations for.
        item_id: Item the user interacted with; also used as the new column label.

    Returns:
        A new DataFrame: ``recommendations_df`` with one extra column, labelled
        ``item_id``, holding the recommended item IDs in the order returned.
    """
    # Register the interaction first (send_user_rating attaches a value of 10).
    send_user_rating(USER_ID=user_id, ITEM_ID=item_id)

    # Query the campaign for this user immediately after the event was sent.
    response = personalize_runtime.get_recommendations(
        campaignArn=campaign_arn,
        userId=str(user_id),
    )

    # One column of recommended item IDs, placed next to the existing columns.
    recommended_ids = [entry['itemId'] for entry in response['itemList']]
    new_column_df = pd.DataFrame(recommended_ids, columns=[item_id])
    return pd.concat([recommendations_df, new_column_df], axis=1)


In [46]:
# Collects one column of recommendations per interacted item.
user_recommendations_df = pd.DataFrame()
# The loop sleeps 2 seconds after each item, so the sleeps add 2 s per entry in
# `items` on top of the API calls.
# NOTE(review): the sleep runs after the recommendations have already been
# fetched, so a column may not yet reflect the event just sent -- confirm
# whether the pause should sit between sending the event and querying.
for item in items:
    user_recommendations_df = get_new_recommendations_df_users_real_time(user_recommendations_df, user_id, item)
    time.sleep(2)
print(user_id)
user_recommendations_df

3179913491640929255-1


Unnamed: 0,GGOEGBRJ037299,GGOEADHB014799,GGOEGAAX0605
0,GGOEGBRJ037399,GGOEGBRJ037399,GGOEGDHG014499
1,GGOEGBRA037499,GGOEGBRA037499,GGOEGAAX0127
2,GGOEGBRB013899,GGOEGBRB013899,GGOEADHH073999
3,GGOEGAAX0358,GGOEGAAX0358,GGOEADHH055999
4,GGOEGFKA022299,GGOEGFKA022299,GGOEGFPR113699
5,GGOEGEVB070599,GGOEGEVB070599,GGOEGDHB072199
6,GGOEGAAX0362,GGOEGAAX0362,GGOEGDHC015299
7,GGOEGBMB073799,GGOEGBMB073799,GGOEGDHB071999
8,GGOEGEVA022399,GGOEGEVA022399,GGOEAOCH014199
9,GGOEGDHG014499,GGOEGDHG014499,GGOEGDHC017999
