# Put Customer Reviews On Kinesis Data Firehose

In [1]:
import boto3
import sagemaker
import pandas as pd
import json

# A single boto3 session is shared by the region lookup and every client below,
# instead of constructing a fresh Session for each call.
boto_session = boto3.Session()
region = boto_session.region_name

# SageMaker session provides the default S3 bucket; `role` is whatever
# sagemaker.get_execution_role() returns for this environment.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Service clients used throughout the notebook, all pinned to the session's region.
sm = boto_session.client(service_name='sagemaker', region_name=region)
firehose = boto_session.client(service_name='firehose', region_name=region)
kinesis_analytics = boto_session.client(service_name='kinesisanalytics', region_name=region)


In [2]:
# Restore `firehose_name` from IPython's %store persistence.
%store -r firehose_name

In [3]:
# Show the restored delivery stream name.
print(firehose_name)

dsoaws-kinesis-data-firehose


In [4]:
# Restore `firehose_arn` from IPython's %store persistence.
%store -r firehose_arn

In [5]:
# Show the restored value of `firehose_arn`.
print(firehose_arn)

arn:aws:firehose:us-east-1:806570384721:deliverystream/dsoaws-kinesis-data-firehose


In [6]:
# Restore `iam_role_kinesis_arn` from IPython's %store persistence.
%store -r iam_role_kinesis_arn

In [7]:
# Show the restored value of `iam_role_kinesis_arn`.
print(iam_role_kinesis_arn)

arn:aws:iam::806570384721:role/DSOAWS_Kinesis


In [8]:
# Restore `kinesis_data_analytics_app_name` from IPython's %store persistence.
%store -r kinesis_data_analytics_app_name

In [9]:
# Show the restored value of `kinesis_data_analytics_app_name`.
print(kinesis_data_analytics_app_name)

dsoaws-kinesis-data-analytics-sql-app


In [10]:
# Restore `lambda_fn_name` from IPython's %store persistence.
%store -r lambda_fn_name

In [11]:
# Show the restored value of `lambda_fn_name`.
print(lambda_fn_name)

DeliverKinesisAnalyticsToCloudWatch


In [12]:
# Ask Firehose for its DirectPut delivery streams, then pretty-print the raw
# response (default=str stringifies anything json cannot serialize natively).
firehoses = firehose.list_delivery_streams(DeliveryStreamType='DirectPut')
firehoses_json = json.dumps(firehoses, default=str, sort_keys=True, indent=4)
print(firehoses_json)

{
    "DeliveryStreamNames": [
        "dsoaws-kinesis-data-firehose"
    ],
    "HasMoreDeliveryStreams": false,
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "87",
            "content-type": "application/x-amz-json-1.1",
            "date": "Thu, 10 Sep 2020 16:49:15 GMT",
            "x-amz-id-2": "oDCNvmcoqwVIQBHBbCSKpI3IkldiHdRU2kYousDlhXl1NVAYSQ2u3WATXeGLZBDuMZdTeMt0YvFor4Fk3+AuIuP4PYvdbo9m",
            "x-amzn-requestid": "dad6e463-ee36-5c14-8d02-bf28a5d9b3e9"
        },
        "HTTPStatusCode": 200,
        "RequestId": "dad6e463-ee36-5c14-8d02-bf28a5d9b3e9",
        "RetryAttempts": 0
    }
}


# Download Dataset

In [13]:
# Copy the gzipped Digital Software reviews TSV from S3 into the local ./data/ directory.
!aws s3 cp 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz to data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz


In [14]:
import csv
import pandas as pd

# Load the downloaded reviews file. It is gzip-compressed and tab-separated;
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
reviews_path = './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'
df = pd.read_csv(reviews_path, sep='\t', quoting=csv.QUOTE_NONE, compression='gzip')
df.shape

(102084, 15)

In [15]:
# Preview the first rows of the reviews frame (head() defaults to five rows).
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,17747349,R2EI7QLPK4LF7U,B00U7LCE6A,106182406,CCleaner Free [Download],Digital_Software,4,0,0,N,Y,Four Stars,So far so good,2015-08-31
1,US,10956619,R1W5OMFK1Q3I3O,B00HRJMOM4,162269768,ResumeMaker Professional Deluxe 18,Digital_Software,3,0,0,N,Y,Three Stars,Needs a little more work.....,2015-08-31
2,US,13132245,RPZWSYWRP92GI,B00P31G9PQ,831433899,Amazon Drive Desktop [PC],Digital_Software,1,1,2,N,Y,One Star,Please cancel.,2015-08-31
3,US,35717248,R2WQWM04XHD9US,B00FGDEPDY,991059534,Norton Internet Security 1 User 3 Licenses,Digital_Software,5,0,0,N,Y,Works as Expected!,Works as Expected!,2015-08-31
4,US,17710652,R1WSPK2RA2PDEF,B00FZ0FK0U,574904556,SecureAnywhere Intermet Security Complete 5 De...,Digital_Software,4,1,2,N,Y,Great antivirus. Worthless customer support,I've had Webroot for a few years. It expired a...,2015-08-31


In [16]:
# Serialize the first review to a headerless, index-free TSV line, keeping only
# the four columns of interest, to preview what a single record looks like.
preview_columns = ['review_id', 'star_rating', 'product_category', 'review_body']
df_star_rating_and_review_body = df[preview_columns][0:1]

df_star_rating_and_review_body.to_csv(sep='\t', header=None, index=False)

'R2EI7QLPK4LF7U\t4\tDigital_Software\tSo far so good\n'

# Simulate Producer Application Writing Records to the Stream

In [17]:
# Look up the delivery stream by name and print the raw describe response.
firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)
print(firehose_response)

{'DeliveryStreamDescription': {'DeliveryStreamName': 'dsoaws-kinesis-data-firehose', 'DeliveryStreamARN': 'arn:aws:firehose:us-east-1:806570384721:deliverystream/dsoaws-kinesis-data-firehose', 'DeliveryStreamStatus': 'ACTIVE', 'DeliveryStreamEncryptionConfiguration': {'Status': 'DISABLED'}, 'DeliveryStreamType': 'DirectPut', 'VersionId': '1', 'CreateTimestamp': datetime.datetime(2020, 9, 10, 12, 55, 8, 543000, tzinfo=tzlocal()), 'Destinations': [{'DestinationId': 'destinationId-000000000001', 'S3DestinationDescription': {'RoleARN': 'arn:aws:iam::806570384721:role/DSOAWS_Kinesis', 'BucketARN': 'arn:aws:s3:::sagemaker-us-east-1-806570384721', 'Prefix': 'kinesis-data-firehose', 'BufferingHints': {'SizeInMBs': 5, 'IntervalInSeconds': 300}, 'CompressionFormat': 'UNCOMPRESSED', 'EncryptionConfiguration': {'NoEncryptionConfig': 'NoEncryption'}, 'CloudWatchLoggingOptions': {'Enabled': False}}, 'ExtendedS3DestinationDescription': {'RoleARN': 'arn:aws:iam::806570384721:role/DSOAWS_Kinesis', 'Buc

## Put Records onto Firehose

In [None]:
# Number of reviews serialized into each Firehose record.
step = 1

# Send at most 10000 reviews, capped at the frame length so a shorter frame
# never yields empty slices (which would be put on the stream as empty records).
num_reviews = min(10000, len(df))

# The column projection does not change between iterations, so do it once.
df_stream_columns = df[['review_id',
                        'star_rating',
                        'product_category',
                        'review_body']]

for start_idx in range(0, num_reviews, step):
    end_idx = start_idx + step

    df_star_rating_and_review_body = df_stream_columns[start_idx:end_idx]

    # Tab-separated, no header row, no index column.
    reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\t',
                                                        header=None,
                                                        index=False)

    # Firehose expects bytes; the TSV text is sent UTF-8 encoded.
    response = firehose.put_record(
        Record={
            'Data': reviews_tsv.encode('utf-8')
        },
        DeliveryStreamName=firehose_name
    )

# Analyze Stream with Kinesis Data Analytics

![](./img/no_rows_in_source_kinesis_firehose_stream.png)

## _If You See `No rows in source stream`, Re-Run the Cell Above to Put More Records on the Stream_

In [23]:
import time

# Put a burst of synthetic "anomalous" reviews (star_rating far outside the
# normal range) on the delivery stream.
num_anomalies = 10000
anomaly_step = 1

# One timestamp, taken before the loop, is used as the review_id of every anomaly record.
timestamp = int(time.time())

# `columns` must list all four fields: listing only a subset would drop the
# others from the frame and make the selection below fail with a KeyError.
anomaly_columns = ['review_id', 'star_rating', 'product_category', 'review_body']

df_anomalies = pd.DataFrame([
    {'review_id': str(timestamp),
     'star_rating': 100,
     'product_category': 'Digital_Software',
     'review_body': 'blahblah'},
], columns=anomaly_columns)

df_star_rating_and_review_body_anomalies = df_anomalies[anomaly_columns]

# The frame holds exactly one row, so it is serialized once and reused.
# (Slicing it by the loop index would yield an empty frame, and therefore an
# empty record, on every iteration after the first.)
reviews_tsv_anomalies = df_star_rating_and_review_body_anomalies.to_csv(sep='\t',
                                                                        header=None,
                                                                        index=False)
anomaly_payload = reviews_tsv_anomalies.encode('utf-8')

for start_idx in range(0, num_anomalies, anomaly_step):
    response = firehose.put_record(
        Record={
            'Data': anomaly_payload
        },
        DeliveryStreamName=firehose_name
    )

# Explore Kinesis Data Analytics App

In [None]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the Kinesis Data Analytics application in the current region.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# Explore Kinesis Firehose

In [None]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the delivery stream's monitoring page in the current region.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/firehose/home?region={}#/details/{}/monitoring"> Firehose</a></b>'.format(region, firehose_name)))


# Explore Custom CloudWatch Metrics

In [None]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the AVGStarRating custom metric graph. The region
# appears twice in the URL (console region and the graph definition's own
# region field), so both are filled from `region` rather than hard-coding one.
display(HTML("""<b>Review <a target="blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#metricsV2:graph=~(metrics~(~(~'kinesis*2fanalytics*2fAVGStarRating~'AVGStarRating~'Product*20Category~'All))~view~'timeSeries~stacked~false~start~'-PT5M~end~'P0D~region~'{}~liveData~true~stat~'Average~period~1~title~'Avg*20Star*20Rating);query=~'*7bkinesis*2fanalytics*2fAVGStarRating*2c*22Product*20Category*22*7d">CloudWatch Metrics</a></b>""".format(region, region)))


# Explore Lambda Code

In [None]:
# Imported here so the cell does not depend on another cell having run first.
from IPython.display import display, HTML

# Render a console link to the Lambda function. The function name comes from
# `lambda_fn_name` instead of being hard-coded in the URL.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/lambda/home?region={}#/functions/{}">Lambda</a></b>'.format(region, lambda_fn_name)))


# Explore Lambda Logs

In [None]:
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a console link to the CloudWatch log group /aws/lambda/<lambda_fn_name>
# (the slashes are pre-encoded as %252F in the URL).
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=%252Faws%252Flambda%252F{}">Lambda Logs</a></b>'.format(region, lambda_fn_name)))


In [None]:
# Optional teardown, left disabled. Uncommented, the %%javascript cell would call
# save_checkpoint() on the notebook and then delete() on its session
# (presumably shutting the kernel down — confirm before enabling).
# When enabling, remove this note: %%javascript must be the first line of the cell.
#%%javascript
#Jupyter.notebook.save_checkpoint();
#Jupyter.notebook.session.delete();