## Import Packages and Libraries

In [107]:
import pandas as pd
import boto3
from io import StringIO 
import psycopg2
import time
from dotenv import load_dotenv 
import os

# Getting the Data

- Getting data from: [Cycling Data](https://cycling.data.tfl.gov.uk/)

In [5]:
url = 'https://cycling.data.tfl.gov.uk/usage-stats/384JourneyDataExtract15Nov2023-30Nov2023.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Number,Start date,Start station number,Start station,End date,End station number,End station,Bike number,Bike model,Total duration,Total duration (ms)
0,135292553,2023-10-31 23:59,1009,"Taviton Street, Bloomsbury",2023-11-01 00:20,1153,"Pall Mall East, West End",54429,CLASSIC,20m 24s,1224353
1,135292554,2023-11-01 00:00,300012,"Irene Road, Parsons Green",2023-11-01 00:06,300058,"The Vale, Chelsea",60442,PBSC_EBIKE,6m 18s,378747
2,135292549,2023-10-31 23:58,300080,"Culvert Road, Battersea",2023-11-01 00:09,1135,"Claverton Street, Pimlico",53480,CLASSIC,10m 20s,620273
3,135292550,2023-10-31 23:58,300249,"Westminster Pier, Westminster",2023-11-01 00:03,1219,"Lower Marsh, Waterloo",20962,CLASSIC,4m 4s,244652
4,135292551,2023-10-31 23:59,1228,"Southampton Street, Strand",2023-11-01 00:17,200195,"St Martins Close, Camden Town",57448,CLASSIC,18m 29s,1109695


# Define functions and constants

## constants

In [109]:
load_dotenv()

s3 = boto3.resource('s3')
bucket_name = 'bycle-rental-london'
file_name = 'data.csv'
region_name = 'us-east-1'
cluster_params = {
    'ClusterIdentifier': 'bycle-london-cluster',
    'NodeType': 'dc2.large',
    'MasterUsername': os.environ['MASTER_USER_NAME'],
    'MasterUserPassword': os.environ['MasterUserPassword'],
    'DBName': os.environ['DB_NAME'],
    'ClusterType': 'single-node',
    'PubliclyAccessible': True,    
}


## Functions

In [9]:
def create_bucket(bucket_name):
    response = s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={
            'LocationConstraint': 'us-west-2'  # Change to your desired region
        }
    )
    print("Bucket created:", bucket_name)
    return response

In [10]:
def delete_bucket(bucket_name : str):
    response = s3.delete_bucket(
        Bucket=bucket_name
    )
    print("Bucket deleted:", bucket_name)
    return response

# Put dataframe inside data lake (S3 Bucket)

## Creating bucket

In [16]:
create_bucket(bucket_name=bucket_name)

Bucket created: bycle-rental-london


s3.Bucket(name='bycle-rental-london')

In [17]:
# Check if it was created
for bucket in s3.buckets.all():
    print(bucket.name)

bycle-rental-london


## Put the data

In [20]:
csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '5FRH8HE3NE8DAP72',
  'HostId': 'GyHT7pGVS9CGbfrfy2Gm3b7fH3YjS/bwpbAnfW0alBe3zBt0E6dkJSm/693QjcYGNGYLGdHu7MG81Yiude7JbQ==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'GyHT7pGVS9CGbfrfy2Gm3b7fH3YjS/bwpbAnfW0alBe3zBt0E6dkJSm/693QjcYGNGYLGdHu7MG81Yiude7JbQ==',
   'x-amz-request-id': '5FRH8HE3NE8DAP72',
   'date': 'Fri, 12 Apr 2024 10:21:04 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"5257c98f906ce9f199c4c7706c29c15b"',
   'server': 'AmazonS3',
   'content-length': '0',
   'connection': 'close'},
  'RetryAttempts': 1},
 'ETag': '"5257c98f906ce9f199c4c7706c29c15b"',
 'ServerSideEncryption': 'AES256'}

## Read the data from the Data Lake

In [25]:
response = s3.Object(bucket_name, file_name).get()
data = response['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(data))
df.head()

Unnamed: 0.1,Unnamed: 0,Number,Start date,Start station number,Start station,End date,End station number,End station,Bike number,Bike model,Total duration,Total duration (ms)
0,0,135292553,2023-10-31 23:59,1009,"Taviton Street, Bloomsbury",2023-11-01 00:20,1153,"Pall Mall East, West End",54429,CLASSIC,20m 24s,1224353
1,1,135292554,2023-11-01 00:00,300012,"Irene Road, Parsons Green",2023-11-01 00:06,300058,"The Vale, Chelsea",60442,PBSC_EBIKE,6m 18s,378747
2,2,135292549,2023-10-31 23:58,300080,"Culvert Road, Battersea",2023-11-01 00:09,1135,"Claverton Street, Pimlico",53480,CLASSIC,10m 20s,620273
3,3,135292550,2023-10-31 23:58,300249,"Westminster Pier, Westminster",2023-11-01 00:03,1219,"Lower Marsh, Waterloo",20962,CLASSIC,4m 4s,244652
4,4,135292551,2023-10-31 23:59,1228,"Southampton Street, Strand",2023-11-01 00:17,200195,"St Martins Close, Camden Town",57448,CLASSIC,18m 29s,1109695


# Moving the data from the Data Lake to Data Warehouse

## Creating Data Warehouse

In [81]:
redshift = boto3.client('redshift',  region_name=region_name)
response = redshift.create_cluster(**cluster_params)
print("Cluster created:", response['Cluster']['ClusterIdentifier'])

Cluster created: bycle-london-cluster


In [95]:
time.sleep(90) # Wait for Creating of cluster
cluster_id = response['Cluster']['ClusterIdentifier']
cluster_details = redshift.describe_clusters(ClusterIdentifier=cluster_id)

In [111]:
endpoint = cluster_details['Clusters'][0]['Endpoint']
port = endpoint['Port']
host = endpoint['Address']
username = cluster_params['MasterUsername']
password = cluster_params['MasterUserPassword']
database = cluster_params['DBName']

## Put this information into the DBT
print("\nConnection Settings:")
print("Host:", host)
print("Port:", port)
print("Database:", database)
print("Username:", username)
print("Password:", password)


Connection Settings:
Port: 5439


In [103]:
# Permit inbound traffic to the dataset
security_group_id = cluster_details['Clusters'][0]['VpcSecurityGroups'][0]['VpcSecurityGroupId']
try:
    ec2 = boto3.client('ec2', region_name=region_name)
    ec2.authorize_security_group_ingress(
        GroupId=security_group_id,
        IpProtocol='TCP',
        FromPort=5439,
        ToPort=5439,
        CidrIp='0.0.0.0/0'
    )
    print("Inbound traffic on port 5439 is now allowed from any IP address.")
except Exception as e:
    print("An error occurred while authorizing ingress:", e)

Inbound traffic on port 5439 is now allowed from any IP address.


In [104]:
try:
    conn = psycopg2.connect(
        host=host, 
        port=port, 
        database=database, 
        user = username, 
        password = password
    )
    print("Connected to Redshift!")
except Exception as e:
    print("Unable to connect to Redshift:", e)

# Close the connection
conn.close()

Connected to Redshift!
