# Sequência para processamento em Cluster

Todo o código foi baseado na biblioteca boto3.
Para executar, é necessário ter as credenciais da AWS configuradas na máquina, conforme descrito em https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html.

In [1]:
# Install the pinned boto3 release into the interpreter that runs this code.
# Going through `sys.executable -m pip` (instead of the IPython-only `!pip`
# syntax) keeps this cell valid both inside Jupyter and when the notebook is
# exported and executed as a plain .py file.
import subprocess
import sys

try:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "boto3==1.13.1", "--quiet"],
        check=True,
    )
except (subprocess.CalledProcessError, OSError) as install_error:
    # Best effort: boto3 may already be available, so report and carry on.
    print("Could not install boto3 with pip:", install_error)

In [2]:
import json
import logging
import os

import boto3

In [3]:
# Working directory of the notebook; the CSV inputs live in the sibling "data" folder.
dirpath = os.getcwd()
dataPath = "/".join([dirpath, "..", "data"])

## Preparando os serviços AWS S3

In [4]:
# High-level S3 resource handle, used later to list buckets and to empty the bucket.
# No explicit credentials are passed here.
s3 = boto3.resource('s3')

### Criação do bucket chamado <span style="color:red">tembici-fk</span>

In [5]:
# Name of the bucket that holds the input data, the Spark script, logs and output.
my_bucket = "tembici-fk"
s3_client = boto3.client('s3')

# S3 only accepts a create_bucket call without a location constraint in
# us-east-1; in every other region the constraint must name the client's region.
create_bucket_kwargs = {'Bucket': my_bucket}
client_region = s3_client.meta.region_name
if client_region and client_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': client_region
    }

# Last expression of the cell: the API response is shown as the cell output.
s3_client.create_bucket(**create_bucket_kwargs)

{'ResponseMetadata': {'RequestId': '1105F1D34F0183B3',
  'HostId': 'C6MVkxyxuoHshzEAynwDCmsQiyPWqv63h0VH8ua5YgcgWdQ8Y4vhXMKuiDCSk/QElv7+DpT7tbo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'C6MVkxyxuoHshzEAynwDCmsQiyPWqv63h0VH8ua5YgcgWdQ8Y4vhXMKuiDCSk/QElv7+DpT7tbo=',
   'x-amz-request-id': '1105F1D34F0183B3',
   'date': 'Mon, 04 May 2020 02:37:55 GMT',
   'location': '/tembici-fk',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Location': '/tembici-fk'}

### Criando uma TAG chamada <span style="color:red">tembici-test-fk</span> no bucket criado.

In [6]:
# Single tag with an empty value; my_tag is reused later to build the
# resource-group tag filter.
my_tag = [dict(Key='tembici-test-fk', Value='')]
tag = dict(TagSet=my_tag)

# Attach the tag set to the bucket; the API response is the cell output.
s3_client.put_bucket_tagging(
    Bucket=my_bucket,
    Tagging=tag,
)

{'ResponseMetadata': {'RequestId': '757EF9B402DDB944',
  'HostId': 'InNn2fwKEZ4HnAAGJ0R/WnrZaH4DhirYjhPY54Ud4Vcx0rXxyNozhdrLUyWntV2Ys5fdxcPnBnY=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'InNn2fwKEZ4HnAAGJ0R/WnrZaH4DhirYjhPY54Ud4Vcx0rXxyNozhdrLUyWntV2Ys5fdxcPnBnY=',
   'x-amz-request-id': '757EF9B402DDB944',
   'date': 'Mon, 04 May 2020 02:37:55 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

### Exibindo os buckets existentes

In [7]:
# Print the name of every bucket returned by the S3 resource, one per line.
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket.name)

elasticbeanstalk-us-east-1-032594213725
estudo-aws-fabio
estudo-aws-fk
tembici-fk


### Upload de data files para o bucket

In [8]:
# CSV inputs uploaded under the "data/" prefix of the bucket.
files = ['station.csv', 'weather.csv', 'trip.csv']

# upload_file reports service failures as boto3's S3UploadFailedError rather
# than ClientError, and a missing or unreadable local file would typically
# raise OSError. All of them are logged so the remaining files are still tried.
upload_errors = (
    s3_client.exceptions.ClientError,
    boto3.exceptions.S3UploadFailedError,
    OSError,
)

for file in files:
    file_name = dataPath + "/" + file
    try:
        # upload_file returns None, so there is no response to keep.
        s3_client.upload_file(file_name, my_bucket, "data/" + file)
        print("It was uploaded the file", "'" + file + "'", ".")
    except upload_errors as e:
        logging.error(e)

It was uploaded the file 'station.csv' .
It was uploaded the file 'weather.csv' .
It was uploaded the file 'trip.csv' .


### Upload Zeppelin file (model) to bucket 

In [9]:
# Zeppelin note and the exported Spark script, uploaded under the "model/" prefix.
files = ['zeppelin-tembici-test-fk.json', 'zeppelin-tembici-test-fk.py']

# upload_file reports service failures as boto3's S3UploadFailedError rather
# than ClientError, and a missing or unreadable local file would typically
# raise OSError. All of them are logged so the remaining files are still tried.
upload_errors = (
    s3_client.exceptions.ClientError,
    boto3.exceptions.S3UploadFailedError,
    OSError,
)

for file in files:
    file_name = dirpath + "/" + file
    try:
        # upload_file returns None, so there is no response to keep.
        s3_client.upload_file(file_name, my_bucket, "model/" + file)
        print("It was uploaded the file", "'" + file + "'", ".")
    except upload_errors as e:
        logging.error(e)

It was uploaded the file 'zeppelin-tembici-test-fk.json' .
It was uploaded the file 'zeppelin-tembici-test-fk.py' .


## Configurando um Resource Group com o nome de <span style="color:red">rg-tembici-test-fk</span>

In [10]:
# Create a tag-based resource group that collects every supported resource
# carrying the tag defined in my_tag.
my_resource_group = "rg-tembici-test-fk"
RG_client = boto3.client('resource-groups')
# Alias kept so any cell that refers to `client` still works, without opening
# a second, identical resource-groups client.
client = RG_client

# "AWS::AllSupported" matches every supported resource type; use
# "AWS::S3::Bucket" instead to restrict the group to buckets.
# Key and value are both read from my_tag so the filter cannot drift from the
# tag that was actually applied.
query = {
    "ResourceTypeFilters": ["AWS::AllSupported"],
    "TagFilters": [{
        "Key": my_tag[0].get("Key"),
        "Values": [my_tag[0].get("Value")]
    }]
}
# The Resource Groups API takes the query as a JSON-encoded string.
resource_query = {
    'Type': 'TAG_FILTERS_1_0',
    'Query': json.dumps(query)
}

try:
    resp = RG_client.create_group(Name=my_resource_group, ResourceQuery=resource_query)
    print("Resource Group was created.")
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print(e)


Resource Group was created.


### Criação de um cluster EMR chamado <span style="color:red">spark-tembici-test-fk</span>

Foi escolhida a máquina m4.large, pois tem um dos menores custos.

In [11]:
# Launch a transient EMR cluster that runs the uploaded Spark script as its
# only step. KeepJobFlowAliveWhenNoSteps is False, so the cluster is not kept
# alive once there are no steps left.
emr_client = boto3.client('emr')  # pass region_name='us-east-1' to pin the region
my_emr_cluster = "spark-tembici-test-fk"

# S3 locations are derived from my_bucket so they always point at the bucket
# created and populated by the earlier cells.
emr_log_uri = 's3://' + my_bucket + '/log/'
spark_script_uri = 's3://' + my_bucket + '/model/zeppelin-tembici-test-fk.py'

# NOTE: despite its name, cluster_id holds the whole run_job_flow response.
# NOTE(review): the cleanup cells further down delete the bucket this job
# reads from and writes to; run them only after the step has finished.
cluster_id = emr_client.run_job_flow(
    Name=my_emr_cluster,
    ReleaseLabel='emr-5.29.0',
    LogUri=emr_log_uri,
    Applications=[
        {
            'Name': 'Spark'
        },
    ],
    Instances={
        'InstanceGroups': [
            {
                'Name': "Master",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm4.large',
                'InstanceCount': 1,
            },
            {
                'Name': "Slave",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'CORE',
                'InstanceType': 'm4.large',
                'InstanceCount': 2,
            }
        ],
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
    },
    Steps=[
        {
            'Name': 'Spark application',
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ["spark-submit", "--deploy-mode", "cluster", spark_script_uri]
            }
        }
    ],
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
    # Same tag as the bucket, so the cluster matches the resource-group filter.
    Tags=my_tag,
)


# Verificação da Saída
Verificar no bucket S3 "tembici-fk" a existência de um arquivo do tipo parquet na pasta "output/trips/", com cerca de 7.4 MB.

Este é o arquivo esperado pelo desafio 2 do processo Tembici.

# Removendo as configurações de S3 e Resource Groups
### Remoção do Resource Group

In [12]:
# Delete the resource group named by my_resource_group; the API response is the cell output.
RG_client.delete_group(GroupName=my_resource_group)

{'ResponseMetadata': {'RequestId': 'f3bbfd44-2713-4684-b812-99d2dcde90c3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 04 May 2020 02:56:04 GMT',
   'content-type': 'application/json',
   'content-length': '149',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'f3bbfd44-2713-4684-b812-99d2dcde90c3',
   'x-amz-apigw-id': 'L_GawHyxoAMF--w=',
   'x-amzn-trace-id': 'Root=1-5eaf8444-e05724181240570048912fd0;Sampled=1'},
  'RetryAttempts': 0},
 'Group': {'GroupArn': 'arn:aws:resource-groups:us-east-1:032594213725:group/rg-tembici-test-fk',
  'Name': 'rg-tembici-test-fk'}}

### Remoção de todos os arquivos do Bucket

In [13]:
# Remove every object stored in the bucket; the delete responses are the cell output.
bucket = s3.Bucket(my_bucket)
bucket_objects = bucket.objects.all()
bucket_objects.delete()

[{'ResponseMetadata': {'RequestId': 'B45EFCE91581FCF7',
   'HostId': 'Liz8JWsAo37cg+6ZXE4dEdkA6i+ZC0qpchIDFO3ZRpX5lE7JLKqCDYW6tFAjQ8Fl6AEQvt8+y+w=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'Liz8JWsAo37cg+6ZXE4dEdkA6i+ZC0qpchIDFO3ZRpX5lE7JLKqCDYW6tFAjQ8Fl6AEQvt8+y+w=',
    'x-amz-request-id': 'B45EFCE91581FCF7',
    'date': 'Mon, 04 May 2020 02:56:10 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'log/j-QT7175VZF6OL/node/i-03803a99bc4c3a78b/applications/hadoop-hdfs/hadoop-hdfs-datanode-ip-172-31-87-36.log.gz'},
   {'Key': 'log/j-QT7175VZF6OL/node/i-040f0ab09d99030b1/daemons/instance-state/console.log-2020-05-04-02-44.gz'},
   {'Key': 'log/j-QT7175VZF6OL/node/i-03803a99bc4c3a78b/setup-devices/setup_var_log_dir.log.gz'},
   {'Key': 'log/j-QT7175VZF6OL/node/i-0b917cec94ffb68ca/provision-node/reports/0/f27c9bfd-fc1c-47f3-98c5-7627e3d2e92c/i

### Remoção do bucket

In [14]:
# Delete the bucket named by my_bucket; the API response is the cell output.
# NOTE(review): S3 is expected to reject this call unless the bucket is already
# empty — confirm the object-removal cell ran first.
s3_client.delete_bucket(Bucket=my_bucket)

{'ResponseMetadata': {'RequestId': 'A2277F30CB36165F',
  'HostId': 'yJokDWgKLSF4HKV77ceA3yePQRMY6OS8Y+738Yv3kVCpEXrM4SCdys4bD7MHmAB9ECUEMNXUfhc=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'yJokDWgKLSF4HKV77ceA3yePQRMY6OS8Y+738Yv3kVCpEXrM4SCdys4bD7MHmAB9ECUEMNXUfhc=',
   'x-amz-request-id': 'A2277F30CB36165F',
   'date': 'Mon, 04 May 2020 02:56:15 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}