# PCA on MovieLens
Principal Component Analysis (PCA) is a dimensionality reduction algorithm. It's
often applied as a preliminary step before regression or classification. Let's use it on the
protobuf dataset built in the Factorization Machines example.

#### Same steps as in the previous example

In [2]:
%%sh
# -O always writes to ml-100k.zip, so re-running the cell replaces the archive
# instead of saving a stale-prone ml-100k.zip.1; -nv reduces the log to one line.
wget -nv -O ml-100k.zip https://files.grouplens.org/datasets/movielens/ml-100k.zip
# -o overwrites previously extracted files without prompting
unzip -o ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


--2021-05-03 02:16:27--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’

     0K .......... .......... .......... .......... ..........  1%  109K 44s
    50K .......... .......... .......... .......... ..........  2%  215K 33s
   100K .......... .......... .......... .......... ..........  3% 9.91M 22s
   150K .......... .......... .......... .......... ..........  4% 5.97M 16s
   200K .......... .......... .......... .......... ..........  5%  222K 17s
   250K .......... .......... .......... .......... ..........  6% 10.4M 14s
   300K .......... .......... .......... .......... ..........  7% 7.31M 12s
   350K .......... .......... .......... .......... ..........  8% 5.26M 10s
   400K .......... .........

In [3]:
# Switch the working directory to the extracted dataset folder.
%cd ml-100k
# Shuffle the ua.base split into ua.base.shuffled, then preview its first five rows
# (tab-separated: user id, movie id, rating, timestamp).
!shuf ua.base -o ua.base.shuffled
!head -5 ua.base.shuffled

/media/bilal/My Passport/Study/New/AWS/Practice/ml-100k
124	28	3	890287068
458	193	4	886396460
767	657	4	891462917
867	7	5	880078604
911	82	2	892840888


### Build training set and test set

In [4]:
# Number of distinct users and movies in the dataset
num_users = 943
num_movies = 1682
# One one-hot column per user plus one per movie
num_features = num_movies + num_users

# Row counts used to load ua.base.shuffled (train) and ua.test (test)
num_ratings_train = 90570
num_ratings_test = 9430

In [5]:
import csv
import numpy as np
from scipy.sparse import lil_matrix

def loadDataset(filename, lines, columns, user_count=None):
    """Load a tab-separated ratings file as one-hot features and rating labels.

    Every row of the file must hold four fields: user id, movie id, rating and
    timestamp. Ids are treated as 1-based. Row ``i`` of the feature matrix gets
    a 1 in the user's column and a 1 in the movie's column; movie columns start
    right after the ``user_count`` user columns. The timestamp is ignored.

    Args:
        filename: Path of the ratings file to read.
        lines: Number of rows to allocate in the feature matrix.
        columns: Total number of feature columns.
        user_count: Number of user columns that precede the movie columns.
            When omitted, the notebook-level ``num_users`` constant is used,
            so existing three-argument calls behave as before.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a float32 ``lil_matrix`` of shape
        ``(lines, columns)`` and ``Y`` is a float32 numpy vector holding one
        rating per row read from the file.
    """
    if user_count is None:
        # Backward-compatible default: fall back to the notebook constant
        user_count = num_users
    movie_offset = int(user_count)

    # Allocate directly as float32 instead of allocating and then converting
    X = lil_matrix((lines, columns), dtype='float32')
    # Labels are collected in a list and converted once at the end
    Y = []
    # newline='' is the file mode the csv module documents for its readers
    with open(filename, 'r', newline='') as f:
        samples = csv.reader(f, delimiter='\t')
        for line, (userId, movieId, rating, timestamp) in enumerate(samples):
            X[line, int(userId) - 1] = 1
            X[line, movie_offset + int(movieId) - 1] = 1
            Y.append(int(rating))
    Y = np.array(Y).astype('float32')
    return X, Y

In [6]:
# Build the sparse feature matrix and label vector of each split
X_train, Y_train = loadDataset(
    filename='ua.base.shuffled', lines=num_ratings_train, columns=num_features)
X_test, Y_test = loadDataset(
    filename='ua.test', lines=num_ratings_test, columns=num_features)

In [7]:
# Print the shapes of each split and check them against the expected row counts
for features, labels, expected_rows in (
        (X_train, Y_train, num_ratings_train),
        (X_test, Y_test, num_ratings_test)):
    print(features.shape)
    print(labels.shape)
    assert features.shape == (expected_rows, num_features)
    assert labels.shape == (expected_rows, )

(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


In [8]:
import sagemaker

# Default bucket of the current SageMaker session; all keys live under `prefix`
bucket = sagemaker.Session().default_bucket()
prefix = 'fm-movielens'

# Object names of the two serialized splits
train_key = 'train.protobuf'
test_key = 'test.protobuf'

# Key prefixes under which each split is uploaded
train_prefix = '{}/{}'.format(prefix, 'train')
test_prefix = '{}/{}'.format(prefix, 'test')

# S3 location passed to the estimator as its output path
output_prefix = 's3://{}/{}/output'.format(bucket, prefix)

In [9]:
import io, boto3
import sagemaker.amazon.common as smac

def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize a sparse dataset into a buffer and upload it to S3.

    ``X`` and ``Y`` are written to an in-memory buffer with
    ``smac.write_spmatrix_to_sparse_tensor``; the buffer is then uploaded to
    the object ``<prefix>/<key>`` of ``bucket``.

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    payload = io.BytesIO()
    # For numpy arrays use smac.write_numpy_to_dense_tensor(buf, feature, label) instead
    smac.write_spmatrix_to_sparse_tensor(payload, X, Y)
    # Rewind so the upload reads the buffer from its first byte
    payload.seek(0)
    print(payload)
    object_key = '{}/{}'.format(prefix, key)
    s3_object = boto3.resource('s3').Bucket(bucket).Object(object_key)
    s3_object.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Upload both splits and keep the S3 URI of each one
train_data = writeDatasetToProtobuf(
    X=X_train, Y=Y_train, bucket=bucket, prefix=train_prefix, key=train_key)
test_data = writeDatasetToProtobuf(
    X=X_test, Y=Y_test, bucket=bucket, prefix=test_prefix, key=test_key)

# Show where the inputs were written and where the job output will go
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

<_io.BytesIO object at 0x7fce42e64360>
<_io.BytesIO object at 0x7fce42808040>
s3://sagemaker-us-east-1-603012210694/fm-movielens/train/train.protobuf
s3://sagemaker-us-east-1-603012210694/fm-movielens/test/test.protobuf
Output: s3://sagemaker-us-east-1-603012210694/fm-movielens/output


In [11]:
# Extra step for local users only: look up the SageMaker execution role through IAM

import boto3
region = boto3.Session().region_name

def resolve_sm_role():
    """Return the ARN of the first IAM role named like a SageMaker execution role.

    Iterates over every page of IAM roles under path ``/`` and returns the ARN
    of the first role whose name starts with ``AmazonSageMaker-ExecutionRole-``.
    Using the paginator instead of one ``list_roles`` call means the search is
    not cut off after a single response when the account holds many roles.

    Returns:
        The ARN string of the matching role.

    Raises:
        Exception: If no role with that name prefix is found.
    """
    client = boto3.client('iam', region_name=region)
    paginator = client.get_paginator('list_roles')
    for page in paginator.paginate(PathPrefix='/'):
        for role in page['Roles']:
            if role['RoleName'].startswith('AmazonSageMaker-ExecutionRole-'):
                return role['Arn']
    raise Exception('Could not resolve what should be the SageMaker role to be used')

# Resolve the role ARN once; it is passed to the estimator further down.
role = resolve_sm_role()
print(role)

arn:aws:iam::603012210694:role/service-role/AmazonSageMaker-ExecutionRole-20210304T123661


### Run training job

In [12]:
import boto3
from sagemaker import image_uris

# Region of the default boto3 session, used to look up the PCA image URI
region = boto3.Session().region_name
container = image_uris.retrieve(framework='pca', region=region)

In [13]:
# Estimator for the PCA container: one ml.c5.xlarge instance, artifacts written to output_prefix.
# Alternative role source kept from the original cell: sagemaker.get_execution_role()
pca = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    output_path=output_prefix,
)

We then set the hyperparameters. The required ones are the initial number of
features, the number of principal components to compute, and the batch size:

In [None]:
# feature_dim: number of input features; num_components: principal components
# to compute; mini_batch_size: batch size
pca_hyperparameters = {
    'feature_dim': num_features,
    'num_components': 64,
    'mini_batch_size': 1024,
}
pca.set_hyperparameters(**pca_hyperparameters)

In [14]:
# Start the training job with the uploaded train and test channels
pca.fit(inputs={'train': train_data, 'test': test_data})

2021-05-02 21:24:45 Starting - Starting the training job...
2021-05-02 21:24:55 Starting - Launching requested ML instancesProfilerReport-1619990684: InProgress
......
2021-05-02 21:26:08 Starting - Preparing the instances for training......
2021-05-02 21:27:28 Downloading - Downloading input data
2021-05-02 21:27:28 Training - Downloading the training image.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/02/2021 21:27:39 INFO 139732926084928] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[05/02/2021 21:27:39 INFO 139732926084928] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'feature_dim


2021-05-02 21:28:05 Uploading - Uploading generated training model
2021-05-02 21:28:05 Completed - Training job completed
[34m[2021-05-02 21:27:52.870] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/test", "epoch": 1, "duration": 337, "num_examples": 10, "num_bytes": 603520}[0m
[34m#metrics {"StartTime": 1619990872.5285003, "EndTime": 1619990872.8704154, "Dimensions": {"Algorithm": "PCA", "Host": "algo-1", "Operation": "training", "Meta": "test_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 9430.0, "count": 1, "min": 9430, "max": 9430}, "Total Batches Seen": {"sum": 10.0, "count": 1, "min": 10, "max": 10}, "Max Records Seen Between Resets": {"sum": 9430.0, "count": 1, "min": 9430, "max": 9430}, "Max Batches Seen Between Resets": {"sum": 10.0, "count": 1, "min": 10, "max": 10}, "Reset Count": {"sum": 1.0, "count": 1, "min": 1, "max": 1}, "Number of Records Since Last Reset": {"sum": 9430.0, "count": 1, "min": 9430, "max": 9430}, "Number of Batches Since 

### Deploying model

In [15]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance
endpoint_name = 'pca-movielens-100k'
pca_predictor = pca.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
)

-------------------!

In [16]:
import json

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

class PCASerializer(JSONSerializer):
    """Serializer that wraps input rows in an ``instances`` / ``features`` JSON document."""

    def serialize(self, data):
        """Return a JSON string holding one ``{'features': [...]}`` entry per row of ``data``."""
        instances = [{'features': row.tolist()} for row in data]
        return json.dumps({'instances': instances})

# Encode requests with PCASerializer and decode responses as JSON
pca_predictor.serializer = PCASerializer()
pca_predictor.deserializer = JSONDeserializer()

### Running Prediction

In [17]:
# Send the first test sample to the endpoint; toarray() turns the sparse row
# into a dense array so the serializer can call tolist() on it
result = pca_predictor.predict(X_test[0].toarray())
print(result)

{'projections': [{'projection': [-0.008711372502148151, 0.0019895541481673717, 0.002355781616643071, 0.012406938709318638, -0.0069608548656105995, -0.009556426666676998, 0.0070395139046013355, 0.0014258784940466285, -0.014954577200114727, 0.006284230388700962, 0.001228088280186057, 0.0033577263820916414, -0.005306658800691366, 0.003560103476047516, -0.005722153931856155, 0.0018947564531117678, -0.018347417935729027, 0.005859722383320332, -0.0051197693683207035, 0.005412592086941004, 0.002981008030474186, -0.0070180222392082214, -0.004825756885111332, 0.0006951577961444855, -0.002631745534017682, 0.0026822059880942106, -0.00016326206969097257, -0.002161189913749695, 0.007496879436075687, -0.010350828990340233, 0.009461312554776669, -0.007941177114844322, 0.008525246754288673, -0.005494360346347094, 0.002860172651708126, -0.00023960997350513935, 0.00014624283357989043, -0.005788157694041729, 0.010191304609179497, -0.0024550503585487604, 0.005202359054237604, -0.0032088235020637512, -0.00

### Deleting endpoint

In [18]:
# Remove the endpoint that was deployed for this example
pca_predictor.delete_endpoint()