In [10]:
import json
import time

import boto3
import numpy as np
import pandas as pd

In [2]:
# Every AWS handle in this cell is pinned to the same region. Keeping the
# region in one variable, and passing it as `region_name` as well as baking it
# into the endpoint URL, makes the request-signing region match the endpoint
# even when the ambient AWS configuration (profile / AWS_DEFAULT_REGION) is
# unset or points at a different region.
region = 'us-west-2'

# NOTE(review): despite its name, `client` is a DynamoDB *resource* handle and
# is not referenced by any of the cells visible here.
client = boto3.resource('dynamodb', region_name=region)
personalize = boto3.client(
    service_name='personalize',
    region_name=region,
    endpoint_url='https://personalize.{}.amazonaws.com'.format(region),
)
personalize_runtime = boto3.client(
    service_name='personalize-runtime',
    region_name=region,
    endpoint_url='https://personalize-runtime.{}.amazonaws.com'.format(region),
)

# S3 bucket that receives the CSV exports, and the file names used both for
# the local CSV files and for the S3 object keys.
bucket = 'aws-summit-hk-personalens'
user_metadata = "personalize_user_metadata.csv"
item_metadata = "personalize_item_metadata.csv"
user_interaction = "personalize_user_interaction.csv"

In [16]:
# Bucket policy that lets the Amazon Personalize service principal list the
# bucket and read the objects stored in it.
s3 = boto3.client("s3")

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [{
        "Sid": "PersonalizeS3BucketAccessPolicy",
        "Effect": "Allow",
        "Principal": {"Service": "personalize.amazonaws.com"},
        "Action": ["s3:GetObject", "s3:ListBucket"],
        # The first ARN is the bucket itself, the second covers every object in it.
        "Resource": [
            "arn:aws:s3:::{}".format(bucket),
            "arn:aws:s3:::{}/*".format(bucket),
        ],
    }],
}

# The trailing `;` keeps the notebook from echoing the API response.
s3.put_bucket_policy(Policy=json.dumps(policy), Bucket=bucket);

In [17]:
# Create (or reuse) the IAM role that Amazon Personalize assumes when it reads
# the training data from S3, and expose its ARN as `role_arn`.
iam = boto3.client("iam")

role_name = "aws-summit-hk-personalize-role"

# Trust policy: only the Personalize service principal may assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # The role survives kernel restarts, so a second run of this cell would
    # otherwise fail here. get_role returns the same {"Role": {"Arn": ...}}
    # shape, which keeps the lookup below unchanged.
    create_role_response = iam.get_role(RoleName = role_name)

# NOTE(review): AmazonS3ReadOnlyAccess grants read access to every bucket in
# the account; a policy scoped to `bucket` would be tighter.
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
);

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

arn:aws:iam::866716849012:role/aws-summit-hk-personalize-role


# Prepare and Upload Data
### Upload Movie Item Data

In [3]:
# Load the movie catalogue; the first three columns are read as plain strings.
movies_df = pd.read_csv('./movies.csv', dtype={0: 'object', 1: 'object', 2: 'object'})

# Keep a handle on the untouched titles: both derived columns are computed
# from them, and 'title' itself is overwritten below.
raw_title = movies_df['title']

# Raw strings keep `\(` a regex escape instead of an invalid Python string escape.
# 'release' is the last parenthesised run of digits in the title (NaN if none).
movies_df['release'] = raw_title.str.extract(r'.*\(([0-9]+)\)', expand=False)

# 'title' is the text before that trailing " (digits)". Titles without such a
# suffix do not match and would become NaN, so they fall back to the
# whitespace-stripped original title instead of being lost.
movies_df['title'] = (
    raw_title.str.extract(r'(.*) \([0-9]+\)', expand=False)
    .fillna(raw_title.str.strip())
)

# Positional rename to the column names used by the item schema.
movies_df.columns = ['ITEM_ID', 'MOVIE_TITLE', 'GENRE', 'RELEASE_DATE']
movies_df.head()

Unnamed: 0,ITEM_ID,MOVIE_TITLE,GENRE,RELEASE_DATE
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [4]:
# Dump the item metadata to a local CSV (without the index column), then
# upload that file to the bucket under a key equal to the file name.
movies_df.to_csv(item_metadata, index=False)
boto3.Session().resource('s3').Bucket(bucket).upload_file(Filename=item_metadata, Key=item_metadata)

### Upload User Data

In [27]:
from faker import Faker
fake = Faker()

# Only the userId column is needed here, so skip parsing the rest of ratings.csv.
user_ids = pd.read_csv('ratings.csv', usecols=['userId'])['userId'].unique()

# Attach a synthetic occupation and gender to every distinct user id.
# The rows are collected first and the frame is built once, instead of writing
# cell-by-cell into a DataFrame while iterating over it.
# NOTE(review): Faker is not seeded, so the generated values differ on every run.
user_records = []
for user_id in user_ids:
    profile = fake.profile(fields=None, sex=None)
    user_records.append((user_id, profile['job'], profile['sex']))

# Passing `columns` explicitly also yields a correctly shaped (empty) frame
# when there are no users at all.
users_df = pd.DataFrame(user_records, columns=['USER_ID', 'OCCUPATION', 'GENDER'])

users_df.head()

Unnamed: 0,USER_ID,OCCUPATION,GENDER
0,1,Diagnostic radiographer,M
1,2,"Therapist, sports",M
2,3,Catering manager,F
3,4,"Psychologist, sport and exercise",F
4,5,Visual merchandiser,F


In [28]:
# Dump the user metadata to a local CSV (without the index column), then
# upload that file to the bucket under a key equal to the file name.
users_df.to_csv(user_metadata, index=False)
boto3.Session().resource('s3').Bucket(bucket).upload_file(Filename=user_metadata, Key=user_metadata)

### Upload User Interaction Data

In [7]:
# Keep only ratings strictly above 3 and treat each as one interaction; the
# rating value itself is dropped once it has served as the filter.
ratings_df = (
    pd.read_csv('ratings.csv')
    .loc[lambda frame: frame['rating'] > 3]
    .drop('rating', axis=1)
)

# Positional rename of the three remaining columns to the names used by the
# interactions schema.
ratings_df.columns = ['USER_ID', 'ITEM_ID', 'TIMESTAMP']
ratings_df.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP
0,1,2,1112486027
1,1,29,1112484676
2,1,32,1112484819
3,1,47,1112484727
4,1,50,1112484580


In [8]:
# Dump the interactions to a local CSV (without the index column), then
# upload that file to the bucket under a key equal to the file name.
ratings_df.to_csv(user_interaction, index=False)
boto3.Session().resource('s3').Bucket(bucket).upload_file(Filename=user_interaction, Key=user_interaction)

# Create Dataset Group

In [12]:
# Create the dataset group that the interaction, user and item datasets are
# attached to, and keep its ARN for the create_dataset cells.
create_dataset_group_response = personalize.create_dataset_group(
    name = "aws-summit-hk-personalens-dataset-group"
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

# Datasets can only be added to a dataset group once it has reached ACTIVE,
# so block here until the group is usable; otherwise the create_dataset cells
# below can fail when the notebook is run top to bottom.
dataset_group_deadline = time.time() + 30 * 60  # give up after 30 minutes
while True:
    dataset_group_status = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )['datasetGroup']['status']
    print("DatasetGroup: {}".format(dataset_group_status))

    if dataset_group_status == "ACTIVE":
        break
    if dataset_group_status == "CREATE FAILED":
        raise RuntimeError(
            "Dataset group creation failed: {}".format(dataset_group_arn)
        )
    if time.time() >= dataset_group_deadline:
        raise TimeoutError(
            "Dataset group did not become ACTIVE in time: {}".format(dataset_group_arn)
        )
    time.sleep(15)

{
  "datasetGroupArn": "arn:aws:personalize:us-west-2:866716849012:dataset-group/aws-summit-hk-personalens-dataset-group",
  "ResponseMetadata": {
    "RequestId": "b8ccb3be-9b60-4c45-a961-78ffe2812668",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 14:15:04 GMT",
      "x-amzn-requestid": "b8ccb3be-9b60-4c45-a961-78ffe2812668",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


# Import Interaction Data

### Create Interaction Schema

In [11]:
# Avro-style record describing one interaction row: who (USER_ID) interacted
# with what (ITEM_ID) and when (TIMESTAMP). The field order matches the column
# order of the interactions CSV written earlier in the notebook.
interactions_schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            ("USER_ID", "string"),
            ("ITEM_ID", "string"),
            ("TIMESTAMP", "long"),
        )
    ],
    "version": "1.0",
}

# Register the schema; it is sent to the service as a JSON string.
create_schema_response = personalize.create_schema(
    schema = json.dumps(interactions_schema),
    name = "personalens-interactions-schema",
)

# `schema_arn` is reassigned by the user and item schema cells further down.
schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-west-2:866716849012:schema/personalens-interactions-schema",
  "ResponseMetadata": {
    "RequestId": "34314ed3-d218-4988-b7d2-7b1f70bb8dcc",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 14:13:32 GMT",
      "x-amzn-requestid": "34314ed3-d218-4988-b7d2-7b1f70bb8dcc",
      "content-length": "97",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [15]:
# Create the Interactions dataset inside the dataset group.
# `schema_arn` holds whatever the most recently executed create_schema cell
# assigned, so this cell has to run right after the interactions schema cell.
dataset_type = "Interactions"
create_dataset_response = personalize.create_dataset(
    schemaArn = schema_arn,
    datasetGroupArn = dataset_group_arn,
    datasetType = dataset_type,
    name = "movielens_ratings",
)

# Kept for the import-job cell that follows.
dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-west-2:866716849012:dataset/aws-summit-hk-personalens-dataset-group/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "df5e5b26-aca1-4b46-bc64-61fb92a50b4e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 14:24:44 GMT",
      "x-amzn-requestid": "df5e5b26-aca1-4b46-bc64-61fb92a50b4e",
      "content-length": "120",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [18]:
# Start importing the interactions CSV from S3 into the dataset created above.
# `role_arn` is the IAM role from the role-creation cell; `dataset_arn` is the
# value left behind by the most recently executed create_dataset cell.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    roleArn = role_arn,
    dataSource = {"dataLocation": "s3://{}/{}".format(bucket, user_interaction)},
    datasetArn = dataset_arn,
    jobName = "movielens-ratings-import-job-1",
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-west-2:866716849012:dataset-import-job/movielens-ratings-import-job-1",
  "ResponseMetadata": {
    "RequestId": "87a1d1b8-5c3b-4200-b6d7-2f55e09135b0",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 14:27:09 GMT",
      "x-amzn-requestid": "87a1d1b8-5c3b-4200-b6d7-2f55e09135b0",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


# Import User Data

In [31]:
# Avro-style record describing one user row. OCCUPATION and GENDER are flagged
# as categorical string fields; USER_ID is a plain string.
user_schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "USER_ID", "type": "string"},
        {"name": "OCCUPATION", "type": "string", "categorical": True},
        {"name": "GENDER", "type": "string", "categorical": True},
    ],
    "version": "1.0",
}

# Register the schema; it is sent to the service as a JSON string.
create_schema_response = personalize.create_schema(
    schema = json.dumps(user_schema),
    name = "personalens-user-schema",
)

# Overwrites the `schema_arn` left behind by the interactions schema cell.
schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-west-2:866716849012:schema/personalens-user-schema",
  "ResponseMetadata": {
    "RequestId": "5817c06c-e34d-4bb6-9332-83df93f4b94e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 15:45:23 GMT",
      "x-amzn-requestid": "5817c06c-e34d-4bb6-9332-83df93f4b94e",
      "content-length": "89",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [32]:
# Create the Users dataset inside the dataset group.
# `schema_arn` holds whatever the most recently executed create_schema cell
# assigned, so this cell has to run right after the user schema cell.
dataset_type = "Users"
create_dataset_response = personalize.create_dataset(
    schemaArn = schema_arn,
    datasetGroupArn = dataset_group_arn,
    datasetType = dataset_type,
    name = "movielens_users",
)

# Kept for the import-job cell that follows.
dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-west-2:866716849012:dataset/aws-summit-hk-personalens-dataset-group/USERS",
  "ResponseMetadata": {
    "RequestId": "003a06f6-417c-4db6-a100-5d69b469596e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 15:46:15 GMT",
      "x-amzn-requestid": "003a06f6-417c-4db6-a100-5d69b469596e",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [33]:
# Start importing the user metadata CSV from S3 into the Users dataset.
# `role_arn` is the IAM role from the role-creation cell; `dataset_arn` is the
# value left behind by the most recently executed create_dataset cell.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    roleArn = role_arn,
    dataSource = {"dataLocation": "s3://{}/{}".format(bucket, user_metadata)},
    datasetArn = dataset_arn,
    jobName = "movielens-users-import-job-1",
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-west-2:866716849012:dataset-import-job/movielens-users-import-job-1",
  "ResponseMetadata": {
    "RequestId": "66a3bb85-c2eb-4c1b-9673-f65f3f9c8785",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 15:46:41 GMT",
      "x-amzn-requestid": "66a3bb85-c2eb-4c1b-9673-f65f3f9c8785",
      "content-length": "116",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


# Import Item Data

### Create Item Schema

In [38]:
# Avro-style record describing one item (movie) row. Every metadata field is a
# categorical string; ITEM_ID is a plain string. The field order matches the
# column order of the item metadata CSV written earlier in the notebook.
item_schema = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "MOVIE_TITLE", "type": "string", "categorical": True},
        {"name": "GENRE", "type": "string", "categorical": True},
        {"name": "RELEASE_DATE", "type": "string", "categorical": True},
    ],
    "version": "1.0",
}

# Register the schema; it is sent to the service as a JSON string.
create_schema_response = personalize.create_schema(
    schema = json.dumps(item_schema),
    name = "personalens-item-schema",
)

# Overwrites the `schema_arn` left behind by the earlier schema cells.
schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-west-2:866716849012:schema/personalens-item-schema",
  "ResponseMetadata": {
    "RequestId": "a20714bb-80fe-4731-9614-1aad03ef0fd4",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 16:03:32 GMT",
      "x-amzn-requestid": "a20714bb-80fe-4731-9614-1aad03ef0fd4",
      "content-length": "89",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [39]:
# Create the Items dataset inside the dataset group.
# `schema_arn` holds whatever the most recently executed create_schema cell
# assigned, so this cell has to run right after the item schema cell.
dataset_type = "Items"
create_dataset_response = personalize.create_dataset(
    schemaArn = schema_arn,
    datasetGroupArn = dataset_group_arn,
    datasetType = dataset_type,
    name = "movielens_items",
)

# Kept for the import-job cell that follows.
dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-west-2:866716849012:dataset/aws-summit-hk-personalens-dataset-group/ITEMS",
  "ResponseMetadata": {
    "RequestId": "163a0ad7-c36d-4e89-b2b6-c644a61e466b",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 16:04:27 GMT",
      "x-amzn-requestid": "163a0ad7-c36d-4e89-b2b6-c644a61e466b",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [40]:
# Start importing the item metadata CSV from S3 into the Items dataset.
# `role_arn` is the IAM role from the role-creation cell; `dataset_arn` is the
# value left behind by the most recently executed create_dataset cell.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    roleArn = role_arn,
    dataSource = {"dataLocation": "s3://{}/{}".format(bucket, item_metadata)},
    datasetArn = dataset_arn,
    jobName = "movielens-items-import-job-1",
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-west-2:866716849012:dataset-import-job/movielens-items-import-job-1",
  "ResponseMetadata": {
    "RequestId": "7018c2b3-725d-47c6-8fe8-7346926482b1",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 21 May 2019 16:05:05 GMT",
      "x-amzn-requestid": "7018c2b3-725d-47c6-8fe8-7346926482b1",
      "content-length": "116",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}
