# In [None]:  (notebook cell prompt left over from export)
import os
import sys
import urllib.parse

import boto3


# Single EMR client, built once when the module is imported and shared by
# every lambda_handler call. Both credential variables must be set; otherwise
# the os.environ lookups raise KeyError while the module is loading.
emr_provisioning = boto3.client(
    'emr',
    region_name='us-east-1',
    aws_access_key_id=os.environ['aws_access_keys'],
    aws_secret_access_key=os.environ['aws_secret_key'],
)

def lambda_handler(event, context):
    """Start an EMR cluster that runs the PySpark job on the uploaded S3 object.

    Reads the bucket name and object key from the first record of the S3
    notification, then calls ``run_job_flow`` with a single ``spark-submit``
    step whose input path is that object. The cluster is created with
    ``KeepJobFlowAliveWhenNoSteps=False`` and the step uses
    ``ActionOnFailure='TERMINATE_CLUSTER'``.

    Args:
        event: S3 notification event; only ``event['Records'][0]['s3']`` is
            read, so any additional records in the same event are ignored.
        context: Lambda context object (unused).

    Returns:
        The ``JobFlowId`` string from the ``run_job_flow`` response.

    Raises:
        KeyError: If the event does not have the expected S3 record layout.
    """
    s3_record = event['Records'][0]['s3']
    bucket_name = s3_record['bucket']['name']
    # The key lives under 's3' -> 'object' (not 'bucket'), and S3 URL-encodes
    # it in the notification, so decode it before building a path from it.
    file_name = urllib.parse.unquote_plus(s3_record['object']['key'])
    print("FileName:", file_name)
    print("BucketName:", bucket_name)

    backend_code = "s3://codebucketpsspark/LambdaPysparkEMR.py"
    # Process the object that triggered this invocation rather than a fixed file.
    input_path = "s3://{}/{}".format(bucket_name, file_name)
    output_path = "s3://boto3practticebucket/aggregatedData.csv"

    spark_submit = [
        "spark-submit",
        "--deploy-mode", "cluster",
        "--executor-memory", "6G",
        "--num-executors", "1",
        "--executor-cores", "2",
        # NOTE(review): --class normally names a JVM entry point, but the
        # application below is a .py file -- confirm this option is needed.
        "--class", "com.aws.emr.ProfitCalc",
        backend_code,   # application script
        input_path,     # file to be processed
        output_path,    # output location for the result
    ]

    response = emr_provisioning.run_job_flow(
        Name='spark_job_cluster',
        LogUri='s3://athena-bucket1/QueryResults/',
        ReleaseLabel='emr-6.5.0',
        Instances={
            'MasterInstanceType': 'm5.xlarge',
            'SlaveInstanceType': 'm5.large',
            'InstanceCount': 3,
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False,
            # NOTE(review): looks like a placeholder -- EC2 subnet ids
            # normally have the form 'subnet-...'; confirm before deploying.
            'Ec2SubnetId': 'my-subnet-public-02',
        },
        Applications=[{'Name': 'Spark'}, {'Name': 'Hadoop'}],
        Configurations=[
            {
                'Classification': 'spark-hive-site',
                'Properties': {
                    'hive.metastore.client.factory.class':
                        'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory',
                },
            },
        ],
        VisibleToAllUsers=True,
        # Passed exactly once: a repeated keyword argument is a SyntaxError.
        Tags=[
            {'Key': 'Project', 'Value': 'MysparkProject'},
            {'Key': 'Service', 'Value': 'sparkserv'},
            {'Key': 'Environment', 'Value': 'Development'},
        ],
        JobFlowRole='EMRLambda-EMREC2InstanceProfile-XXXXXXXXX',
        ServiceRole='EMRLambda-EMRRole-XXXXXXXXX',
        Steps=[
            {
                'Name': 'flow-log-analysis',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': spark_submit,
                },
            },
        ],
    )

    cluster_id = response['JobFlowId']
    print("ClusterId:", cluster_id)
    return cluster_id
    