# Setup the Amazon Kinesis Data Pipeline
https://github.com/aws-samples/aws-ml-data-lake-workshop

In [7]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the notebook's execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build a single boto3 session and reuse it for the region lookup and for
# every client, instead of constructing a new Session each time.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name='sagemaker', region_name=region)
kn_data = boto_session.client(service_name='kinesis', region_name=region)
kn_firehose = boto_session.client(service_name='firehose', region_name=region)

In [8]:
# Show the Kinesis data streams this client can already see, before creating one.
kn_data.list_streams()

{'StreamNames': [],
 'HasMoreStreams': False,
 'ResponseMetadata': {'RequestId': 'ff7b21b0-1cff-8a6f-a827-6ea0d4a798b3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ff7b21b0-1cff-8a6f-a827-6ea0d4a798b3',
   'x-amz-id-2': 'be/RpaGEKW1vW9dg2VhqgfpSxcBTxPJ9wV5kRJ+Qi2YbFF6y41N6+j8eQmU/RcQxrCOMp/qLo0hiTZWp33xulJfj4FC8NZe9',
   'date': 'Thu, 11 Jun 2020 15:25:04 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '41'},
  'RetryAttempts': 0}}

## Step 1: Create a Kinesis Data Stream

In [9]:
# Name and shard count used when creating the Kinesis data stream.
stream_name, shard_count = "dsoaws-data-stream", 1

In [10]:
# create_stream raises ResourceInUseException when the stream name is already
# taken (typically because this cell ran before), so tolerate that case to
# keep the cell safe under Restart & Run All.
try:
    response = kn_data.create_stream(
        StreamName=stream_name,
        ShardCount=shard_count
    )
except kn_data.exceptions.ResourceInUseException as err:
    print("Stream {!r} is already in use; continuing with the existing stream.".format(stream_name))
    # Keep `response` defined so later cells that display it still work.
    response = err.response

# Stream creation is asynchronous: block until describe_stream reports the
# stream as ACTIVE so that the following cells do not race against it.
kn_data.get_waiter('stream_exists').wait(StreamName=stream_name)

In [11]:
# Leave the response as the cell's last expression so the notebook
# pretty-prints the dict instead of printing it on one long line.
response

{'ResponseMetadata': {'RequestId': 'c41539d5-929d-fb44-9349-76b775319cd8', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c41539d5-929d-fb44-9349-76b775319cd8', 'x-amz-id-2': '6yOC+U9TRA53IUCNrp+IWA9+c3JrN7rcFqpDub/CZlRF3RDXfsGTgi9TmozvoDbVSaYZFu1EsEPrqHYTQF99wgdBDCumBNsD', 'date': 'Thu, 11 Jun 2020 15:26:26 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '0'}, 'RetryAttempts': 0}}


In [12]:
# Fetch the stream's description by name. The optional shard-paging
# arguments (Limit, ExclusiveStartShardId) are left at their defaults.
response = kn_data.describe_stream(StreamName=stream_name)

In [13]:
# Leave the description as the cell's last expression so the notebook
# pretty-prints the nested dict instead of printing it on one long line.
response

{'StreamDescription': {'StreamName': 'dsoaws-data-stream', 'StreamARN': 'arn:aws:kinesis:us-east-1:806570384721:stream/dsoaws-data-stream', 'StreamStatus': 'ACTIVE', 'Shards': [{'ShardId': 'shardId-000000000000', 'HashKeyRange': {'StartingHashKey': '0', 'EndingHashKey': '340282366920938463463374607431768211455'}, 'SequenceNumberRange': {'StartingSequenceNumber': '49607862348221712927862777905367152435935770759162494978'}}], 'HasMoreShards': False, 'RetentionPeriodHours': 24, 'StreamCreationTimestamp': datetime.datetime(2020, 6, 11, 15, 26, 25, tzinfo=tzlocal()), 'EnhancedMonitoring': [{'ShardLevelMetrics': []}], 'EncryptionType': 'NONE'}, 'ResponseMetadata': {'RequestId': 'c4af8a78-4c9e-385d-93f3-c5b87951bed5', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c4af8a78-4c9e-385d-93f3-c5b87951bed5', 'x-amz-id-2': 'w8s++bNpqZf93D+3yacXlAgRN0XdNfNrSwrOnu9SKluhcTUzcHZC9b9P4bDxHBaTgJza0cGzrxgMSJ06bJSTfdB2y5aBtTsv', 'date': 'Thu, 11 Jun 2020 15:28:00 GMT', 'content-type': 'applicat

## Step 2: Create a Kinesis Data Firehose delivery stream that reads from the data stream

In [16]:
firehose_name = 'dsoaws-firehose-stream'
# The delivery stream reads from an existing Kinesis data stream.
delivery_stream_type = 'KinesisStreamAsSource'

# Look up the data stream's ARN instead of hardcoding it, so the notebook is
# not tied to one AWS account and region.
data_stream_arn = kn_data.describe_stream(
    StreamName=stream_name
)['StreamDescription']['StreamARN']

# ARN layout is arn:partition:service:region:account-id:resource, so the
# account ID is the fifth colon-separated field.
account_id = data_stream_arn.split(':')[4]

# Destination bucket and the IAM role used by Firehose. Both must already
# exist; the role is looked up by name in the current account.
bucket_arn = 'arn:aws:s3:::dsoaws-streaming-data'
kinesis_role_arn = 'arn:aws:iam::{}:role/DSOAWS_Kinesis'.format(account_id)

In [17]:
# create_delivery_stream raises ResourceInUseException when the delivery
# stream name is already taken (typically because this cell ran before), so
# tolerate that case to keep the cell safe under Restart & Run All.
try:
    firehose_response = kn_firehose.create_delivery_stream(
        DeliveryStreamName=firehose_name,
        DeliveryStreamType=delivery_stream_type,
        # Source: the Kinesis data stream, read using the given role.
        KinesisStreamSourceConfiguration={
            'KinesisStreamARN': data_stream_arn,
            'RoleARN': kinesis_role_arn
        },
        # Destination: the S3 bucket, written using the same role.
        ExtendedS3DestinationConfiguration={
            'RoleARN': kinesis_role_arn,
            'BucketARN': bucket_arn
        }
    )
except kn_firehose.exceptions.ResourceInUseException as err:
    print("Delivery stream {!r} is already in use; continuing with the existing one.".format(firehose_name))
    # Keep `firehose_response` defined so later cells that display it still work.
    firehose_response = err.response

In [18]:
# Leave the response as the cell's last expression so the notebook
# pretty-prints the dict instead of printing it on one long line.
firehose_response

{'DeliveryStreamARN': 'arn:aws:firehose:us-east-1:806570384721:deliverystream/dsoaws-firehose-stream', 'ResponseMetadata': {'RequestId': 'd53a192c-4270-bd41-8266-4d67382e2862', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'd53a192c-4270-bd41-8266-4d67382e2862', 'x-amz-id-2': 'gWNN8HsmxtOpfzAFrKmmkreL9PJbxkbGacAsn9k48H1djXcUGUpos3QTfiaCgJ3dsZBYvS6X2uqcsO7xax3CSBfJUNWPaqXh', 'content-type': 'application/x-amz-json-1.1', 'content-length': '101', 'date': 'Thu, 11 Jun 2020 15:47:23 GMT'}, 'RetryAttempts': 0}}


## Step 3: Put a Record

## Create and trigger the Lambda function in `src/lambda.py`

In [None]:
# Partition key and payload for the test record written to the stream.
partition_key, data = '123', 'testdata'

In [None]:
# Write the record through the boto3 Kinesis client rather than the AWS CLI:
# no unquoted shell interpolation of notebook variables, and the payload is
# sent as the exact bytes given (AWS CLI v2 would treat --data as base64 by
# default).
kn_data.put_record(
    StreamName=stream_name,
    PartitionKey=partition_key,
    Data=data.encode('utf-8')
)

## Step 4: Get the Record

In [None]:
# ID of the shard to read from; matches the ShardId reported by describe_stream above.
shard_id = 'shardId-000000000000'

In [None]:
# Obtain an iterator positioned at the oldest record still retained in the
# shard (TRIM_HORIZON), using the boto3 client instead of the AWS CLI.
shard_iterator = kn_data.get_shard_iterator(
    StreamName=stream_name,
    ShardId=shard_id,
    ShardIteratorType='TRIM_HORIZON'
)['ShardIterator']

# Actually read the records back. get_records may return an empty batch even
# when the shard holds data, so follow NextShardIterator a few times before
# giving up (kept small to stay within the per-shard read rate limit).
records = []
for _ in range(3):
    batch = kn_data.get_records(ShardIterator=shard_iterator, Limit=10)
    records = batch['Records']
    shard_iterator = batch.get('NextShardIterator')
    if records or not shard_iterator:
        break

# Record payloads come back as bytes; decode them for display.
[record['Data'].decode('utf-8', errors='replace') for record in records]