# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain a SparkContext via getOrCreate() so re-running this cell does not construct a second one.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext; later cells use it to read and write DynamicFrames.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Job object bound to this GlueContext.
job = Job(glueContext)

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# Read a Data Catalog table into a DynamicFrame.
# 'database_name' and 'table_name' are placeholders; substitute real catalog names.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database='database_name',
    table_name='table_name',
)

# Print the schema of the loaded DynamicFrame.
dyf.printSchema()

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# Convert the DynamicFrame created earlier in the notebook into a Spark DataFrame.
df = dyf.toDF()
# Print a sample of rows to the cell output.
df.show()

#### Example: Visualize data with matplotlib


In [None]:
import matplotlib.pyplot as plt

# Set X-axis and Y-axis values
x = [5, 2, 8, 4, 9]
y = [10, 4, 8, 5, 2]
  
# Create a bar chart 
plt.bar(x, y)
  
# Show the plot
%matplot plt

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# Configure an S3 sink that also creates/updates a table in the Glue Data Catalog.
# The bucket path, database and table names below are placeholders to replace.
s3output = glueContext.getSink(
  path="s3://bucket_name/folder_name",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output",
)
s3output.setCatalogInfo(
  catalogDatabase="demo", catalogTableName="populations"
)
s3output.setFormat("glueparquet")
# Write the DynamicFrame created earlier in the notebook. Python names are
# case-sensitive: the frame is `dyf`; `DyF` was never defined and raised NameError.
s3output.writeFrame(dyf)

In [None]:
import sys
import boto3
import pandas as pd


def _build_dimension(source_df, key_name, columns, renames=None):
    """Build a de-duplicated dimension table with a sequential surrogate key.

    Args:
        source_df: DataFrame holding the raw source rows.
        key_name: Name of the surrogate-key column to add as the first column.
        columns: Source columns to keep; duplicates are detected on these.
        renames: Optional mapping of source column name -> output column name.

    Returns:
        A new DataFrame with `key_name` (1..N) followed by the selected
        (and optionally renamed) columns, with duplicate rows removed.
    """
    # De-duplicate BEFORE assigning the key: a per-row unique key makes every
    # row distinct, so drop_duplicates() afterwards would never remove anything.
    dimension = source_df[columns].drop_duplicates().reset_index(drop=True)
    if renames:
        dimension = dimension.rename(columns=renames)
    dimension.insert(0, key_name, range(1, len(dimension) + 1))
    return dimension


# Input arguments for the Glue job (not read in this cell; kept in case other cells use it)
args = sys.argv
source_bucket_name = "team11project"  # Source S3 bucket
target_bucket_name = "team11projectdw"  # Target S3 bucket
oltp_prefix = "oltp/"  # Folder containing source data
olap_prefix = "olap/"  # Folder for transformed data in the target bucket

# File paths for source data
order_items_file = f"s3://{source_bucket_name}/{oltp_prefix}order.csv"
order_reviews_file = f"s3://{source_bucket_name}/{oltp_prefix}order_reviews.csv"
products_file = f"s3://{source_bucket_name}/{oltp_prefix}products.csv"
sellers_file = f"s3://{source_bucket_name}/{oltp_prefix}sellers.parquet"

# --- Step 1: Load the data into Pandas DataFrames ---
print("Loading data from S3...")
df_order_items = pd.read_csv(order_items_file)
df_order_reviews = pd.read_csv(order_reviews_file)
df_products = pd.read_csv(products_file)
df_sellers = pd.read_parquet(sellers_file)

# --- Step 2: Create Order Dimension ---
print("Creating Order Dimension...")
print("Checking for duplicates in Order Dimension...")
order_dimension = _build_dimension(
    df_order_items,
    'Orderkey',
    ['order_id', 'product_id', 'seller_id', 'price', 'freight_value', 'shipping_limit_date'],
)

# --- Step 3: Create Review Dimension ---
print("Creating Review Dimension...")
print("Checking for duplicates in Review Dimension...")
review_dimension = _build_dimension(
    df_order_reviews,
    'Reviewkey',
    ['order_id', 'review_score'],
    renames={'review_score': 'score'},
)

# --- Step 4: Create Product Dimension ---
print("Creating Product Dimension...")
print("Checking for duplicates in Product Dimension...")
product_dimension = _build_dimension(
    df_products,
    'Productkey',
    ['product_id', 'product_category_name',
     'product_weight_g', 'product_length_cm',
     'product_height_cm', 'product_width_cm'],
    renames={
        'product_weight_g': 'product_weight',
        'product_length_cm': 'product_length',
        'product_height_cm': 'product_height',
        'product_width_cm': 'product_width',
    },
)

# --- Step 5: Create Seller Dimension ---
print("Creating Seller Dimension...")
print("Checking for duplicates in Seller Dimension...")
seller_dimension = _build_dimension(
    df_sellers,
    'Sellerkey',
    ['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state'],
    renames={'seller_zip_code_prefix': 'seller_zipcode'},
)

# Output name -> dimension table; the name drives both the local file name
# and the object key in the target bucket.
dimension_tables = {
    "order_dimension": order_dimension,
    "review_dimension": review_dimension,
    "product_dimension": product_dimension,
    "seller_dimension": seller_dimension,
}

# --- Step 6: Save Transformed Data Locally ---
print("Saving transformed data locally...")
for dimension_name, dimension_df in dimension_tables.items():
    dimension_df.to_csv(f"/tmp/{dimension_name}.csv", index=False)

# --- Step 7: Upload Transformed Data to the New Target S3 Bucket ---
print("Uploading transformed data to the target S3 bucket...")
s3 = boto3.client('s3')

# Upload each dimension to "<olap_prefix><name>/<name>.csv" in the target bucket
for dimension_name in dimension_tables:
    s3.upload_file(
        f"/tmp/{dimension_name}.csv",
        target_bucket_name,
        f"{olap_prefix}{dimension_name}/{dimension_name}.csv",
    )

print("Transformation and upload to the target bucket complete!")
