Instructions to create the mock environment, run in terminal:
  SQS:
  
      moto_server sqs -p 4576 -H localhost   
      aws --endpoint-url=http://localhost:4576 sqs create-queue --queue-name sse_queue --region us-east-1   
  
  aws configure:
  
      AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE 
      AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY 
      Default region name [None]: us-east-1 
      Default output format [None]: json
  
  S3:
      
      moto_server s3 -p 4572 -H localhost
      aws --endpoint-url=http://localhost:4572 s3 mb s3://sse-bucket --region us-east-1
  
  read bucket
  
      aws --endpoint-url=http://localhost:4572 s3 ls sse-bucket 

## Data Pipeline with AWS

In [32]:
import boto3
import json
from sseclient import SSEClient as EventSource
from moto import mock_s3
import time
import pandas as pd
import psycopg2
#from moto import mock_ec2
from moto import mock_redshift
from moto.core import ACCOUNT_ID
from botocore.exceptions import ClientError
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

In [33]:
# sqs parameters
sqs_endpoint= 'http://localhost:4576'
region='us-east-1'
queue_name='sse_queue'
events_limit=3
url='https://stream.wikimedia.org/v2/stream/recentchange'
queue_url = sqs_endpoint+'/queue/'+queue_name

# s3 parameters
AWS_ACCESS_KEY_ID = 'AKIAIOSFODNN7EXAMPLE'
AWS_SECRET_ACCESS_KEY = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY '

s3_endpoint="http://localhost:4572" #only for test purposes
bucket='camila-bucket'

#redshift parameters
redshift_user='user'
redshift_password='password'
redshift_db='test'
iam_roles=['arn:aws:iam::123456789012:role/myRedshiftRole']


### SQS Catch Events

In [34]:
sqs = boto3.client('sqs'
    , endpoint_url=sqs_endpoint
    , use_ssl=False #only for test purposes
    , region_name=region)
    
sqs   

    

<botocore.client.SQS at 0x7f11eeb97070>

In [35]:
def catch_events(num, url):
    
    i=0
    for event in EventSource(url):
        if(i>num):
            return
        else:
            if event.event == 'message':
                try:
                    message = json.loads(event.data)
                except ValueError:
                    pass
                else: 
                    enqueue_message( json.dumps(message))
        i=i+1 
    return    

In [36]:
def enqueue_message(message):
    
    response = sqs.send_message(
        QueueUrl = queue_url,
        DelaySeconds=1,
        MessageBody = message
    )
    print('\rMessage %s enqueued' % response['MessageId'], sep=' ', end='', flush=True)
  

In [37]:
catch_events(events_limit,url)

Message 253f3dcc-3757-4baf-73ed-af47234b1173 enqueued

### Read Data Stream and Save in S3

In [38]:
s3 = boto3.resource('s3'
    , endpoint_url=s3_endpoint
    , use_ssl=False #only for test purposes
    , region_name=region)
s3

s3.ServiceResource()

In [39]:
#desired payload
map_keys = ['id','type','namespace','title','comment','timestamp','user','bot', 'ReceiptHandle']
list_msgs = [] 

In [40]:
def read_data(num):
    i=0
    while i < num:
        try:
            response=sqs.receive_message(
                QueueUrl = queue_url,
                MaxNumberOfMessages=5
            )
            process_data( response['Messages'])
        except KeyError:
            print('No message available')
            time.sleep(5)
        i=i+1    

In [41]:
def process_data(messages):
    list_msg=[]
    for message in messages:
        d=json.loads(message['Body'])
        clean_dict={key:(d[key] if key in d else None) for key in map_keys}
        clean_dict['ReceiptHandle']=message['ReceiptHandle']
        list_msg.append(clean_dict)
        print('ready to Data Lake')
        to_data_lake(list_msg)
        list_msg=[]

In [42]:
def remove_messages( df ):
    for receipt_handle in df['ReceiptHandle'].values:
        sqs.delete_message(
            QueueUrl = queue_url,
            ReceiptHandle = receipt_handle
        ) 

In [43]:
def to_data_lake(df):
    fdf=pd.DataFrame(list_msgs)
    csv=fdf.to_csv(index=False)
    filename='file-%s.csv' % df[0]['id']
    #csv to S3
    s3.Bucket(bucket).put_object(Key=filename, Body=csv,ACL='public-read')
    print('\r%s saved into the Data Lake' % filename, sep=' ', end='', flush=True
)
    remove_messages( fdf ) 

In [44]:
read_data(10)

ready to Data Lake
file-1502134215.csv saved into the Data LakeNo message available
ready to Data Lake
file-422889350.csv saved into the Data LakeNo message available
No message available
No message available
No message available
No message available
ready to Data Lake
file-1502134215.csv saved into the Data LakeNo message available
ready to Data Lake
file-422889350.csv saved into the Data LakeNo message available
No message available
No message available


### From S3 to Refshift

#### Create a cluster

In [48]:
@mock_redshift
def create_redshift_cluster(ClusterId):
    redshift_client = boto3.client('redshift', region_name="us-east-1")
    try:
        response = redshift_client.create_cluster(
            ClusterIdentifier=ClusterId,
            DBName=redshift_db,
            NodeType='dc2.large',
            MasterUsername=redshift_user,
            MasterUserPassword=redshift_password,
            IamRoles=iam_roles,
        )
          
        
    except ClientError as e:
        print(f'ERROR: {e}')
        return None
    
    redshift_host=response['Cluster']['Endpoint']['Address']
    redshift_port=response['Cluster']['Endpoint']['Port']      
    return response['Cluster'],redshift_host,redshift_port

response, redshift_host,redshift_port= create_redshift_cluster('redshift-cluster-1')

In [49]:
response

{'ClusterIdentifier': 'redshift-cluster-1',
 'NodeType': 'dc2.large',
 'ClusterStatus': 'creating',
 'MasterUsername': 'user',
 'DBName': 'test',
 'Endpoint': {'Address': 'redshift-cluster-1.cg034hpkmmjt.us-east-1.redshift.amazonaws.com',
  'Port': 5439},
 'ClusterCreateTime': datetime.datetime(2020, 10, 6, 16, 32, 11, 77000, tzinfo=tzutc()),
 'AutomatedSnapshotRetentionPeriod': 1,
 'ClusterSecurityGroups': [{'ClusterSecurityGroupName': 'Default',
   'Status': 'active'}],
 'VpcSecurityGroups': [],
 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
   'ParameterApplyStatus': 'in-sync'}],
 'ClusterSubnetGroupName': '',
 'AvailabilityZone': 'us-east-1a',
 'PreferredMaintenanceWindow': 'Mon:03:00-Mon:03:30',
 'PendingModifiedValues': {},
 'ClusterVersion': '1.0',
 'AllowVersionUpgrade': True,
 'NumberOfNodes': 1,
 'PubliclyAccessible': False,
 'Encrypted': False,
 'Tags': [],
 'EnhancedVpcRouting': False,
 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::123456789012:role/

#### Execute a Query

In [47]:
# just to demonstrate the code because the cluster doesn't really exist in aws
def redshift_query(): 
    connection_string = "redshift+psycopg2://%s:%s@%s:%s/%s" % (redshift_user,redshift_password,redshift_host,str(redshift_port),redshift_db)
    engine = sa.create_engine(connection_string)
    session = sessionmaker()
    session.configure(bind=engine)
    s = session()   
    s.execute('create table test ( id int)')
    s.close()
     
   