# AWS Glue Testing Notebook

This notebook demonstrates how to test and run your AWS Glue Python scripts.

In [None]:
# Test PySpark and Glue Context
from pyspark.sql import SparkSession
from awsglue.context import GlueContext
from awsglue.job import Job

# Create (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("TestGlueJob").getOrCreate()

# GlueContext is documented to be constructed from a SparkContext, not a
# SparkSession, so pass the session's underlying context explicitly.
glueContext = GlueContext(spark.sparkContext)
job = Job(glueContext)

print("✅ Spark and Glue contexts created successfully!")
print(f"Spark version: {spark.version}")

In [None]:
# Method 1: Run your Python file directly using exec
import sys
import os
import traceback

workspace_dir = '/home/hadoop/workspace'
script_path = os.path.join(workspace_dir, 'sample.py')

# Add the workspace to the import path so the script can import sibling
# modules. Guarded so that re-running this cell does not add duplicates.
if workspace_dir not in sys.path:
    sys.path.append(workspace_dir)

# Set command line arguments for the script, remembering the kernel's own
# argv so it can be restored once the script has finished.
kernel_argv = sys.argv
sys.argv = ['sample.py', '--JOB_NAME', 'my-test-job']

# Read and execute the Python file
try:
    with open(script_path, 'r') as f:
        script_content = f.read()

    # Compile with the real file name so tracebacks point at sample.py
    # rather than "<string>".
    # NOTE(review): exec runs the file inside this notebook's namespace with
    # full kernel privileges -- only use it on scripts you trust.
    exec(compile(script_content, script_path, 'exec'))
    print("✅ Script executed successfully!")
except Exception as e:
    print(f"❌ Error running script: {e}")
    # Show where the failure happened, not just the message.
    traceback.print_exc()
finally:
    # Always undo the argv override, even if the script raised or exited.
    sys.argv = kernel_argv

In [None]:
# Method 2: Import your script as a module (if it has functions)
# First, let's see which Python files are available to import
import os

workspace_dir = '/home/hadoop/workspace'

print("Files in workspace:")
try:
    # os.listdir returns entries in arbitrary order; sort for stable output.
    workspace_scripts = sorted(
        name for name in os.listdir(workspace_dir) if name.endswith('.py')
    )
except OSError as e:
    # A missing or unreadable workspace should be reported, not abort the
    # whole notebook run.
    workspace_scripts = []
    print(f"❌ Could not list workspace: {e}")

for script_name in workspace_scripts:
    print(f"  📄 {script_name}")

In [None]:
# Method 3: Launch the script in a separate Python process, the same way a
# terminal invocation would
import subprocess
import sys

# Use the interpreter that is running this kernel.
python_command = [sys.executable, 'sample.py', '--JOB_NAME', 'my-test-job']

try:
    # Run from inside the workspace and collect both streams as text.
    result = subprocess.run(
        python_command,
        cwd='/home/hadoop/workspace',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    print("STDOUT:", result.stdout, sep="\n")

    # Only show the stderr section when the process wrote something to it.
    if result.stderr:
        print("STDERR:", result.stderr, sep="\n")

    print(f"Return code: {result.returncode}")

except Exception as launch_error:
    print(f"Error: {launch_error}")

In [None]:
# Method 4: Hand the script to spark-submit (recommended for Spark jobs)
import subprocess

spark_submit_command = ['spark-submit', 'sample.py', '--JOB_NAME', 'my-test-job']

try:
    # Run from inside the workspace and collect both streams as text.
    result = subprocess.run(
        spark_submit_command,
        cwd='/home/hadoop/workspace',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    print("STDOUT:", result.stdout, sep="\n")

    # Only show the stderr section when the process wrote something to it.
    if result.stderr:
        print("STDERR:", result.stderr, sep="\n")

    print(f"Return code: {result.returncode}")

except Exception as launch_error:
    print(f"Error: {launch_error}")

In [None]:
# Sanity-check that the sample data can be loaded
try:
    # Load the employees parquet data from the workspace, if present
    df = spark.read.parquet('/home/hadoop/workspace/data/employees.parquet')
    row_count = df.count()
    print(f"✅ Successfully read parquet file with {row_count} rows")

    # Inspect the column layout and the first five records
    df.printSchema()
    df.show(5)

except Exception as read_error:
    print(
        f"❌ Could not read parquet file: {read_error}",
        "You may need to create sample data first by running create_sample_data.py",
        sep="\n",
    )

## Running Scripts from Terminal

You can also open a terminal in JupyterLab, change to the workspace directory (`cd /home/hadoop/workspace`), and run:

```bash
# Direct Python execution
python3 sample.py --JOB_NAME my-test-job

# Using spark-submit (recommended)
spark-submit sample.py --JOB_NAME my-test-job

# Make executable and run (sample.py must start with a shebang line, e.g. #!/usr/bin/env python3)
chmod +x sample.py
./sample.py --JOB_NAME my-test-job
```