# Kinesis Firehose

![Kinesis Firehose](img/kinesis_firehose_s3_docs.png)

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, plus the default bucket and execution role it provides.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session is shared by every client created below.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name='sagemaker', region_name=region)
kinesis = boto_session.client(service_name='kinesis', region_name=region)
firehose = boto_session.client(service_name='firehose', region_name=region)
sts = boto_session.client(service_name='sts', region_name=region)

# Download Dataset

In [None]:
# Copy the gzipped Digital Software reviews TSV from S3 into the local ./data/ directory.
!aws s3 cp 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

In [None]:
import csv
import pandas as pd

# Load the gzipped, tab-separated reviews file downloaded into ./data/.
# QUOTE_NONE tells the parser not to give quote characters any special meaning.
reviews_path = './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz'
df = pd.read_csv(reviews_path, sep='\t', quoting=csv.QUOTE_NONE, compression='gzip')
df.shape

In [None]:
# Preview the first five rows of the full reviews frame.
df.head(5)

In [None]:
# Keep only the rating and review text columns, limited to the first 100 rows.
selected_columns = ['star_rating', 'review_body']
df_star_rating_and_review_body = df[selected_columns].iloc[:100]
df_star_rating_and_review_body.shape

In [None]:
# Preview the reduced frame (star_rating and review_body only).
df_star_rating_and_review_body.head()

In [None]:
# Serialize the subset to one tab-separated string (no file path is given, so
# to_csv returns the text). header=False is the documented way to omit the
# header row; index=False drops the row index column.
reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\t',
                                                    header=False,
                                                    index=False)

In [None]:
# Display the full serialized TSV string that is later sent to the data stream.
reviews_tsv

In [None]:
# Restore `stream_name` from IPython's %store (presumably saved by the Kinesis Data Streams notebook).
%store -r stream_name

In [None]:
# Show the restored value; a NameError here means it was never stored.
print(stream_name)

# Create Firehose Delivery Stream to S3

In [None]:
# Restore `partition_key` from IPython's %store; it is used as the PartitionKey when writing records.
%store -r partition_key

In [None]:
# Show the restored value; a NameError here means it was never stored.
print(partition_key)

In [None]:
# Restore `data_stream_arn` from IPython's %store; it is the source stream ARN for the delivery stream.
%store -r data_stream_arn

In [None]:
# Show the restored value; a NameError here means it was never stored.
print(data_stream_arn)

In [None]:
# Restore `iam_role_kinesis_arn` from IPython's %store; it is passed as RoleARN for both the source and the S3 destination.
%store -r iam_role_kinesis_arn

In [None]:
# Show the restored value; a NameError here means it was never stored.
print(iam_role_kinesis_arn)

In [None]:
# AWS account ID of the identity whose credentials this notebook is using.
account_id = sts.get_caller_identity()['Account']

In [None]:
# Delivery stream name and the S3 destination it writes to.
firehose_name = 'dsoaws-firehose-stream'
# NOTE(review): there is no trailing '/', so Firehose presumably appends its
# date-based key suffix directly to this text - confirm a folder is not intended.
s3_prefix = 'kinesis-firehose'
s3_bucket_arn = f'arn:aws:s3:::{bucket}'

In [None]:
# Create a delivery stream that reads from the Kinesis data stream and writes
# to S3 under `s3_prefix`, flushing at 1 MB or 60 seconds per BufferingHints.
try:
    firehose_response = firehose.create_delivery_stream(
        DeliveryStreamName=firehose_name,
        DeliveryStreamType='KinesisStreamAsSource',
        KinesisStreamSourceConfiguration={
            'KinesisStreamARN': data_stream_arn,
            'RoleARN': iam_role_kinesis_arn
        },
        ExtendedS3DestinationConfiguration={
            'RoleARN': iam_role_kinesis_arn,
            'BucketARN': s3_bucket_arn,
            'Prefix': s3_prefix,
            'BufferingHints': {
                'SizeInMBs': 1,
                'IntervalInSeconds': 60
            }
        }
    )
except firehose.exceptions.ResourceInUseException:
    # The stream already exists (for example, this cell was re-run). Look it up
    # so firehose_response still exposes 'DeliveryStreamARN' to later cells.
    existing_stream = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)
    firehose_response = {
        'DeliveryStreamARN': existing_stream['DeliveryStreamDescription']['DeliveryStreamARN']
    }

print(firehose_response)

In [None]:
# ARN of the delivery stream, taken from the create response.
# NOTE(review): 'firehouse' looks like a typo for 'firehose'; the name is kept
# because it is persisted with %store and other notebooks may read it - confirm before renaming.
firehouse_delivery_stream_arn = firehose_response['DeliveryStreamARN']
print(firehouse_delivery_stream_arn)

In [28]:
# Persist the delivery stream ARN with IPython's %store so it can be restored with `%store -r` elsewhere.
%store firehouse_delivery_stream_arn

Stored 'firehouse_delivery_stream_arn' (str)


In [None]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# Render a clickable link to the data stream's monitoring page in the AWS console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/kinesis/home?region={}#/streams/details/{}/monitoring"> Stream</a></b>'.format(region, stream_name)))


In [None]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# Render a clickable link to the delivery stream's monitoring page in the AWS console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/firehose/home?region={}#/details/{}/monitoring"> Firehose</a></b>'.format(region, firehose_name)))


In [None]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# Render a clickable link to the delivery prefix of the bucket in the S3 console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/s3/buckets/{}/{}?region={}">Data in S3</a></b>'.format(bucket, s3_prefix, region)))


# Simulate an Application and Write to Data Stream
When you configure a Kinesis Data Firehose delivery stream to use a Kinesis data stream as its source, you must add data with the Kinesis Data Streams `PutRecord` and `PutRecords` operations; Firehose then reads the records from that stream. These are the same APIs that we used in the Kinesis Data Streams notebook.

In [None]:
# Write the serialized reviews to the Kinesis data stream as one record.
data_stream = boto3.Session().client(service_name='kinesis', region_name=region)

response = data_stream.put_records(
    Records=[
        {
            'Data': reviews_tsv.encode('utf-8'),
            'PartitionKey': partition_key
        },
    ],
    StreamName=stream_name
)

# put_records reports rejected records in the response instead of raising,
# so surface any failure here rather than letting it pass silently.
if response['FailedRecordCount'] > 0:
    failed_records = [record for record in response['Records'] if 'ErrorCode' in record]
    raise RuntimeError('put_records rejected {} record(s): {}'.format(
        response['FailedRecordCount'], failed_records))