In [None]:
# Schema Validator

import boto3
import csv
import io
import os

# Module-level S3 client, created once when the module is imported and shared
# by lambda_handler and move_file below.
s3 = boto3.client("s3")

# Column name -> expected type, in the exact order the CSV header must use
# (the validator compares the header to this key order). Only "int" columns are
# value-checked; every other type label is accepted as-is.
# Configure according to your expected schema.
EXPECTED_SCHEMA = dict(
    Employee_ID="int",
    Employee_Name="str",
    College_Degree="str",
    Department="str",
    Job_Role="str",
    DMC_Campus="str",
    Email="str",
    Phone_Number="str",
    Performance_Rating="int",
)

def _check_csv(text, schema):
    """Validate CSV ``text`` against ``schema``.

    Args:
        text: Decoded CSV content, header row first.
        schema: Ordered mapping of column name -> type label. Only the label
            ``"int"`` triggers a value check.

    Returns:
        ``(is_valid, verdict)`` where ``verdict`` is the message to log.
    """
    reader = csv.DictReader(io.StringIO(text))
    headers = reader.fieldnames

    # The header must match the schema's column names exactly, order included.
    if not headers or headers != list(schema):
        return False, "❌ Invalid schema."

    int_columns = [col for col, dtype in schema.items() if dtype == "int"]
    for row in reader:
        # DictReader stores surplus fields under the key None and pads missing
        # fields with the value None; either way the row does not fit the header.
        if None in row or None in row.values():
            return False, "❌ Malformed row (unexpected number of fields)."
        for col in int_columns:
            value = row[col].strip()
            # Blank cells are tolerated; anything else must be all digits.
            if value and not value.isdigit():
                return False, "❌ Invalid data types."

    return True, "✅ Validation passed."


def lambda_handler(event, context):
    """Validate CSV files uploaded under ``raw/`` and route them by outcome.

    For every S3 record in ``event`` the object is downloaded, checked against
    ``EXPECTED_SCHEMA`` and then moved to ``valid/`` or ``invalid/`` via
    ``move_file``. One log line per file reports the outcome.

    Args:
        event: S3 event notification payload. Events without a ``Records``
            list (such as the test event S3 sends when a notification is
            configured) are ignored.
        context: Lambda context object (unused).

    Returns:
        None.

    Raises:
        Exception: whatever ``move_file`` raises. A failed move is not
            swallowed, so the invocation is reported as failed instead of
            leaving the file in an unknown location.
    """
    # Function-scope import keeps the handler self-contained.
    from urllib.parse import unquote_plus

    for record in event.get("Records", []):
        bucket = record["s3"]["bucket"]["name"]
        # S3 URL-encodes object keys in event notifications (a space arrives
        # as "+"), so decode before using the key against the API.
        key = unquote_plus(record["s3"]["object"]["key"])

        # Skip objects outside raw/ and zero-length "folder" placeholder keys.
        if not key.startswith("raw/") or key.endswith("/"):
            continue

        try:
            obj = s3.get_object(Bucket=bucket, Key=key)
            # utf-8-sig drops a leading byte-order mark, which would otherwise
            # become part of the first header name and fail the schema check.
            body = obj["Body"].read().decode("utf-8-sig").strip()
            is_valid, verdict = _check_csv(body, EXPECTED_SCHEMA)
        except Exception as exc:
            # Unreadable or undecodable objects are quarantined; keep the cause
            # in the log so the failure can be diagnosed.
            is_valid = False
            verdict = f"⚠️ Validation failed due to an error: {exc!r}"

        # The move happens outside the try block so a failed move is never
        # retried as a second move of the same object.
        move_file(bucket, key, "valid/" if is_valid else "invalid/")
        print(f"{key}: {verdict}")

def move_file(bucket, source_key, dest_prefix):
    """Move an object within ``bucket`` under ``dest_prefix``, keeping its file name.

    S3 has no rename operation, so the object is copied to the new key and the
    original is deleted afterwards.

    Args:
        bucket: Name of the bucket that holds the object.
        source_key: Current key of the object (e.g. ``raw/report.csv``).
        dest_prefix: Prefix, ending in ``/``, to place the file under
            (e.g. ``valid/``). Any directory part of ``source_key`` is dropped.

    Raises:
        Whatever the S3 client raises if the copy or the delete fails. The
        delete is only attempted after a successful copy.
    """
    filename = os.path.basename(source_key)
    dest_key = f"{dest_prefix}{filename}"
    # Dict form of CopySource, as boto3 recommends: bucket and key are named
    # explicitly, so nothing in the key can be mistaken for the string form's
    # optional "?versionId=" suffix.
    s3.copy_object(
        Bucket=bucket,
        CopySource={"Bucket": bucket, "Key": source_key},
        Key=dest_key,
    )
    s3.delete_object(Bucket=bucket, Key=source_key)

In [None]:
# Python Transformation Logic

import pandas as pd

# Load the validated export and drop rows that are exact duplicates.
df = pd.read_csv(r"YOUR_FILE")
df = df.drop_duplicates()

# Normalise names, then strip honorifics and credentials.
# Title-casing turns "PhD"/"DVM"/"MD"/"DDS" into "Phd"/"Dvm"/"Md"/"Dds", so the
# affix pattern has to be case-insensitive or those credentials are never
# removed. A comma or whitespace is required in front of a suffix so that a
# surname merely ending in the same letters is left intact.
df['Employee_Name'] = (
    df['Employee_Name']
    .str.title()
    .str.strip()
    .str.replace(
        r'(?i)^(Dr\.|Mr\.|Ms\.|Mrs\.)\s*|(\s*,\s*|\s+)(DVM|MD|PhD|DDS|Esq\.|Jr\.|Sr\.)$',
        '',
        regex=True,
    )
    .str.strip()
)

df['Job_Role'] = df['Job_Role'].str.title().str.strip()

# Swap the placeholder "example" for "gmail", then lower-case everything after
# the "@" while leaving the local part untouched.
df['Email'] = df['Email'].str.replace('(?i)example', 'gmail', regex=True)
df['Email'] = df['Email'].str.replace(r'(@)(.*)', lambda m: m.group(1) + m.group(2).lower(), regex=True)

# Prefix the country code. A plain astype(str) would render missing numbers as
# "+63 nan" and, when blanks forced pandas to infer float64, every number as
# "….0". The nullable Int64/string dtypes keep missing values missing.
phone = df['Phone_Number']
if pd.api.types.is_float_dtype(phone):
    phone = phone.astype('Int64')
df['Phone_Number'] = '+63 ' + phone.astype('string')
df

# Converted to PySpark

import sys
import boto3
from datetime import datetime
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
import pyspark.sql.functions as F

# Resolve the job arguments from the command line; only JOB_NAME is requested.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# Build the Spark and Glue contexts; `spark` is the session owned by the GlueContext.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Initialise the Glue job; the matching job.commit() is the last statement of this script.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read the validated CSV files (first row is the header) from the valid/ prefix.
# NOTE(review): this path uses the placeholder YOUR_BUCKET while the archive
# step further down uses YOUR_BUCKET_NAME; they presumably refer to the same
# bucket, so fill both in consistently.
dyf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://YOUR_BUCKET/valid/"]},
    format="csv",
    format_options={"withHeader": True}
)
# Convert to a Spark DataFrame and drop rows that are duplicated across all columns.
df = dyf.toDF().dropDuplicates()

# Clean columns
# Employee_Name: trim, capitalise each word, then strip honorifics/credentials.
#   initcap() lower-cases everything after the first letter of a word
#   ("PhD" -> "Phd", "DVM" -> "Dvm"), so the affix pattern must be
#   case-insensitive or those credentials are never removed. A comma or
#   whitespace is required in front of a suffix so a surname that merely ends
#   in the same letters is left intact.
# Job_Role: trim and capitalise each word.
# Email: replace "example" with "gmail", then rebuild the address with the part
#   after "@" lower-cased and the local part unchanged.
# Phone_Number: prefix the "+63 " country code.
# Finally, drop rows whose Email or Phone_Number is null.
df = df.withColumn(
    "Employee_Name",
    F.trim(
        F.regexp_replace(
            F.initcap(F.trim(F.col("Employee_Name"))),
            r'(?i)^(Dr\.|Mr\.|Ms\.|Mrs\.)\s*|(\s*,\s*|\s+)(DVM|MD|PhD|DDS|Esq\.|Jr\.|Sr\.)$',
            ''
        )
    )
).withColumn(
    "Job_Role", F.initcap(F.trim(F.col("Job_Role")))
).withColumn(
    "Email", F.regexp_replace(F.col("Email"), '(?i)example', 'gmail')
).withColumn(
    "Email",
    F.concat(
        F.regexp_extract(F.col("Email"), r'^[^@]+', 0),
        F.lit('@'),
        F.lower(F.regexp_extract(F.col("Email"), r'@(.+)', 1))
    )
).withColumn(
    "Phone_Number", F.concat(F.lit("+63 "), F.col("Phone_Number").cast("string"))
).filter(
    F.col("Email").isNotNull() & F.col("Phone_Number").isNotNull()
)

# Write cleaned data
# Wrap the cleaned DataFrame back into a DynamicFrame so it can go through a Glue sink.
cleaned_dyf = DynamicFrame.fromDF(df, glueContext, "cleaned_dyf")
# S3 sink under transformed/, configured with snappy compression and with
# catalog updating enabled (enableUpdateCatalog / UPDATE_IN_DATABASE).
sink = glueContext.getSink(
    path="s3://YOUR_BUCKET/transformed/",
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="sink"
)
# Catalog database/table the sink is associated with; the Athena view defined
# later in this notebook selects from this same table placeholder.
sink.setCatalogInfo(catalogDatabase="YOUR_DATABASE_NAME", catalogTableName="YOUR_CLEAN_TABLE_NAME")
sink.setFormat("glueparquet")
sink.writeFrame(cleaned_dyf)

# Archive
# Copy every object under transformed/ to archive/<YYYY-MM-DD>/ in the
# DEEP_ARCHIVE storage class. The originals are left in place.
s3 = boto3.client("s3")
bucket = "YOUR_BUCKET_NAME"
source_prefix = "transformed/"
today = datetime.now().strftime("%Y-%m-%d")
archive_prefix = f"archive/{today}/"

# list_objects_v2 returns at most 1,000 keys per call, so walk every page;
# a single call would silently skip the rest of a large output.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=source_prefix):
    # Pages with no matching objects carry no "Contents" entry.
    for obj in page.get("Contents", []):
        # Swap only the leading prefix; str.replace would also rewrite any
        # later "transformed/" segment inside the key.
        archive_key = archive_prefix + obj["Key"][len(source_prefix):]
        s3.copy_object(
            CopySource={"Bucket": bucket, "Key": obj["Key"]},
            Bucket=bucket,
            Key=archive_key,
            StorageClass="DEEP_ARCHIVE"
        )

print("✅ Glue transformation and archiving complete.")
job.commit()

In [None]:
# LAMBDA-ATHENA GLOBAL DEDUPLICATION

import boto3

def lambda_handler(event, context):
    """Submit the Athena query that (re)defines the deduplicated view.

    The view keeps one row per ``employee_id`` from the clean table by
    numbering the rows inside each ``employee_id`` partition and keeping
    ``rn = 1``. The window has no ORDER BY, so which duplicate is kept is not
    defined by the query, and the helper column ``rn`` is part of the view
    because the outer query selects ``*``.

    ``start_query_execution`` only submits the query; this handler does not
    wait for it to finish, so the log line reports submission rather than
    success.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).

    Returns:
        None.
    """
    athena = boto3.client('athena')

    query = """
    CREATE OR REPLACE VIEW "YOUR_DATABASE_NAME"."YOUR_DEDUPLICATED_NAME" AS
    SELECT *
    FROM (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY employee_id) AS rn
        FROM "YOUR_DATABASE_NAME"."YOUR_CLEAN_TABLE_NAME"
    )
    WHERE rn = 1;
    """

    response = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Catalog': 'AwsDataCatalog',
            'Database': 'YOUR_DATABASE_NAME'
        },
        ResultConfiguration={
            'OutputLocation': 's3://YOUR_ATHENA_QUERY_LOCATION/'
        }
    )

    # The query runs asynchronously: log that it was submitted, not that the
    # view already exists. Use the query ID to look up its final state.
    print(f"Athena deduplication view query submitted. Query ID: {response['QueryExecutionId']}")

In [None]:
# Python File Exporter. You can automate this with any scheduler you prefer (e.g., the Python `schedule` library, Windows Task Scheduler, etc.).

import pandas as pd
import boto3
import io
from datetime import datetime

import os

# Credentials come from the environment so no secret is ever saved in the
# notebook. When a variable is unset the value is None and boto3 falls back to
# its normal credential lookup (shared credentials file, instance role, ...).
ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
REGION = os.environ.get("AWS_DEFAULT_REGION", "YOUR_REGION")
BUCKET_NAME = "YOUR_BUCKET_NAME"

# CSV export link of the Google Sheet that collects the responses.
sheet_url = "YOUR_DOCS_LINK/export?format=csv"

s3 = boto3.client(
    "s3",
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    region_name=REGION
)

# Read every cell as text and keep blanks as empty strings, so the upload is a
# faithful copy of the sheet. With pandas' default type inference an integer
# column containing a blank is written back as "12.0" and leading zeros are
# dropped, which the downstream schema validator then rejects.
df = pd.read_csv(sheet_url, dtype=str, keep_default_na=False)

# The form's bookkeeping column is not part of the expected schema.
df = df.drop(columns=["Submission ID"], errors="ignore")

# One object per run, named by local timestamp, under the raw/ prefix that the
# schema validator watches.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"google_sheet_responses_{timestamp}.csv"
s3_key = f"raw/{file_name}"

# to_csv() without a target returns the CSV text directly.
csv_body = df.to_csv(index=False).encode("utf-8")

s3.put_object(Bucket=BUCKET_NAME, Key=s3_key, Body=csv_body, ContentType="text/csv")

print(f"✅ Successfully uploaded full sheet to s3://{BUCKET_NAME}/{s3_key}")