# Proceso GLUE Transactions

## 1. Cargamos las librerias

In [1]:
%%capture
!pip install -q awswrangler

In [2]:
import numpy as np
import pandas as pd
import boto3
import ast
from datetime import datetime, timedelta
import awswrangler as wr
from itertools import chain
import gc
import sys
import time

glue = boto3.client('glue')
s3 = boto3.resource('s3')
ssm = boto3.client('ssm') 
lakeformation = boto3.client('lakeformation')

### 2. Armamos el proceso en "GLUE GetData Transactions"

In [57]:
%%writefile get_data_transactions.py


import sys
import pyspark.sql.functions as func
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.sql.types import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql import SparkSession
from awsglue.job import Job
import pyspark.sql.functions as F
import json
import boto3
import ast
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import gc
from pyspark.conf import SparkConf
import pandas as pd
import os
from io import BytesIO
import awswrangler as wr

os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=com.amazonaws:aws-java-sdk-bundle:1.11.271,org.apache.hadoop:hadoop-aws:3.1.2 pyspark-shell"


print('Lectura de parámetros')

# ----------------------------------------------------------------------------------
print('NOW:', datetime.now())

args = getResolvedOptions(sys.argv,
                          ['bucket_transactions_data', 
                           'today', 
                           'kms_key_arn', 
                           'recommendations_bucket'])

bucket_transactions_data = args['bucket_transactions_data']
recommendations_bucket = args['recommendations_bucket']
kms_key_id = args['kms_key_arn']
today = args['today']

#--------------------------------------------------------------------------------------------------------------


#https://stackoverflow.com/questions/52932459/accessing-s3-bucket-from-local-pyspark-using-assume-role


print('Crear objetos S3-ssm')
# ----------------------------------------------------------------------------------
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
ssm = boto3.client('ssm')

#--------------------------------------------------------------------------------------------------------------
print('Parámetros:')
path_key_transactions = 'ar/tb_ar_core_transactions/'
path_key_authorizations = 'ar/tb_ar_core_authorizations/'
path_key_cotizaciones = 's3://test-datascience-adquirencia-fraude/data/auxiliar/df_cotizacion.parquet'
#path_key_params = 's3a://uala-arg-datalake-stage-prod/ar/configs/tb_ar_configs_params/be46f255d9c44e59a70edbbf0b815874.snappy.parquet'
## FECHAS INTERVALO
#print('1. CALCULO DE FECHAS')
##Today llevado al primero del mes menos 1 día
#today = datetime.strptime(today, '%Y-%m-%d').date().replace(day=1)
#last_day=(today-pd.offsets.DateOffset(days=1)).date()
##
#first_day=(last_day-pd.offsets.DateOffset(days=365)).date()
#
#print('2. Intevalo de fechas analizada: ',first_day,'y',last_day)

def first_and_last(today):
    fecha=datetime.strptime(today, '%Y-%m-%d').date()
    first_day=fecha.replace(day=1)
    next_month = fecha.replace(day=28) + timedelta(days=4)
    last_day_of_month = next_month - timedelta(days=next_month.day)
    return first_day,last_day_of_month

print('Declaración de funciones')
def list_objects_function(buckets_, first_day, last_day, keys_, retrieve_last=False):
    
    sts = boto3.client('sts')
    response = sts.assume_role(
        RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
        RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
        DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
    )

    s3 = boto3.client(
        's3',
        aws_access_key_id=response['Credentials']['AccessKeyId'],
        aws_secret_access_key=response['Credentials']['SecretAccessKey'],
        aws_session_token=response['Credentials']['SessionToken']
    )


    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=buckets_, Prefix=keys_)
    files_in_bucket=[]
    for page in pages:
        files_page=[key['Key'] for key in page['Contents']]
        files_in_bucket+=files_page
        files_objets = [f"s3a://{buckets_}/" + i for i in files_in_bucket if
                            (keys_ in i)  and (i.find('.parquet') >= 0)]
        df_bucket_files = pd.DataFrame({
                'key': [i[:(i.find('dt=') + 14)] for i in files_objets],
                'path': files_objets,
                'date': pd.to_datetime([i[(i.find('dt=') + 3):(i.find('dt=') + 13)] for i in files_objets])
            })
        files=list(df_bucket_files.loc[df_bucket_files['date'].between(str(first_day),str(last_day)),'path'].values)
    return files

#-----------------------------------------------------------------------------------------------------------------

########## INICIO CONFIG SPARK ###############
sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-spark', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)
print('Spark Configuración')
spark_conf = SparkConf().setAll([
  ("spark.hadoop.fs.s3.enableServerSideEncryption", "true"),
  ("spark.hadoop.fs.s3.serverSideEncryption.kms.keyId", kms_key_id)
])
sc = SparkContext(conf=spark_conf) 
glueContext = GlueContext(sc)
spark = glueContext.spark_session
logger = glueContext.get_logger()
spark._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider")
spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", response["Credentials"]["AccessKeyId"])
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", response["Credentials"]["SecretAccessKey"])
spark._jsc.hadoopConfiguration().set("fs.s3a.session.token",  response["Credentials"]["SessionToken"])
print(f"Hadoop version = {sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()}")
########## FIN CONFIG SPARK ###############

#-----------------------------------------------------------------------------------------------------------------

first_day,last_day = first_and_last(today)
print('Primer dia',first_day)
print('Ultimo dia',last_day)

files_objets_transactions= list_objects_function(bucket_transactions_data, first_day, last_day ,path_key_transactions)
print(f'Hay {len(files_objets_transactions)} archivos de transactions en la carpeta')

files_objets_authorizations= list_objects_function(bucket_transactions_data, first_day, last_day ,path_key_authorizations)
print(f'Hay {len(files_objets_authorizations)} archivos de transactions en la carpeta')

df_transactions = spark.read.parquet(*files_objets_transactions).dropDuplicates(['transaction_id']).select(["dt","transaction_id","account_from","amount","currency_code","transaction_type","status","authorization_id"])

df_authorizations = spark.read.parquet(*files_objets_authorizations).dropDuplicates(['authorization_id']).select(['authorization_id','metadata'])

s3=boto3.resource('s3')
df_param_mcc=pd.read_csv('s3://test-datascience-recommendations/data/param/param_mcc_category.csv')
df_param_mcc.mcc=df_param_mcc.mcc.astype(str)
df_param_mcc.rename(columns={'mcc':'mcc_param'},inplace=True)

df_transactions = df_transactions.filter(F.col("status").isin(['AUTHORIZED']))

df_cotizaciones=spark.createDataFrame(wr.s3.read_parquet(path_key_cotizaciones))

df_param_mcc=spark.createDataFrame(df_param_mcc)

df_transactions = df_transactions.join(df_cotizaciones, df_transactions['dt']==df_cotizaciones['fecha'], 'inner')

df_transactions = df_transactions.withColumn('amount_usd', F.col("amount")/F.col("venta_uala"))

df_transactions=df_transactions.join(df_authorizations, df_transactions["authorization_id"] == df_authorizations["authorization_id"], "left")

df_transactions = df_transactions.select("dt","account_from","amount_usd","transaction_type",F.get_json_object(df_transactions.metadata,'$.mcc').alias('mcc'))

df_transactions = df_transactions.withColumn('year_month', F.date_format(df_transactions.dt,'YYYY-MM'))

df_transactions = df_transactions.drop("dt")

df_transactions=df_transactions.join(df_param_mcc, df_transactions["mcc"] == df_param_mcc["mcc_param"], "left")#.na.fill('otros')

df_transactions.fillna('otros',subset=['category'])


##### ARMAMOS COMPRAS POR CATEGORIA USANDO MCC #######
df_transactions = df_transactions.withColumn('new_transaction_type',F.concat(F.when(F.col('transaction_type').isin(['AUTOMATIC_DEBIT','CONSUMPTION_POS']), F.lit('PURCHASE_')).otherwise(F.col('transaction_type')),F.when(F.col('transaction_type').isin(['AUTOMATIC_DEBIT','CONSUMPTION_POS']), F.upper(F.col('category'))).otherwise(F.lit(''))))
##### ARMAMOS COMPRAS POR CATEGORIA USANDO MCC #######

df_transactions = df_transactions.select("year_month","account_from","amount_usd","new_transaction_type")




df_transactions_nu = (df_transactions    
      .groupBy(['year_month', 'new_transaction_type','account_from'])
      .agg(F.count('new_transaction_type').alias('nu'))
      .groupBy(['account_from','year_month'])
      .pivot("new_transaction_type")
      .agg(F.sum('nu'))
      .na.fill(0)
      )

oldColumns = df_transactions_nu.schema.names
nonNuValues=["account_from","year_month"]
oldColumns=["NU_" + x for x in oldColumns if not str(x) in ["year_month","account_from","amount_usd"]]
newColumns=nonNuValues+oldColumns

df_transactions_nu=df_transactions_nu.toDF(*newColumns)



df_transactions_vl = (df_transactions    
      .groupBy(['year_month', 'new_transaction_type','account_from'])
      .agg(F.sum('amount_usd').alias('vl'))
      .groupBy(['account_from','year_month'])
      .pivot("new_transaction_type")
      .agg(F.sum('vl'))
      .na.fill(0)
      )

oldColumns = df_transactions_vl.schema.names
oldColumns=["VL_" + x for x in oldColumns]
df_transactions_vl=df_transactions_vl.toDF(*oldColumns)
join_condition = [df_transactions_nu["account_from"] == df_transactions_vl["VL_account_from"], df_transactions_nu["year_month"] == df_transactions_vl["VL_year_month"]]
df_transactions=df_transactions_nu.join(df_transactions_vl, join_condition, "inner").drop('VL_account_from','VL_year_month','VL_null','NU_null')




print(spark.sparkContext.getConf().getAll())

#### NUEVA INSTANCIA boto3 para usar buckets en stage #####
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
ssm = boto3.client('ssm')

df_pandas=df_transactions.toPandas()
df_pandas['dt'] = first_day
wr.s3.to_parquet(df_pandas,
                        path='s3://{}/data/raw/transactions/'.format(recommendations_bucket),
                        dataset=True,
                        partition_cols=['dt'],
                        mode="append",
                        concurrent_partitioning=True,
                        index=False)

print('Ubicación files', f's3://{recommendations_bucket}/data/raw/transactions/dt={str(first_day)}')

gc.collect()
print('TRANSACTIONS')
#print(df_transactions.show())
print((df_transactions.count(), len(df_transactions.columns)))
print(df_transactions.dtypes)
print(df_transactions.show())

Overwriting get_data_transactions.py


In [58]:
job_name='test-job_recommendations_transactions'

In [59]:
 #borrar job
glue.delete_job(
    JobName=job_name
)

{'JobName': 'test-job_recommendations_transactions',
 'ResponseMetadata': {'RequestId': 'd4bfb355-4848-457f-a451-f0330c134b52',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Wed, 21 Jul 2021 19:19:20 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '51',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'd4bfb355-4848-457f-a451-f0330c134b52'},
  'RetryAttempts': 0}}

## 3. Generamos los parametros

In [60]:
today = '2021-01-10'
bucket_transactions_data='uala-arg-datalake-analytics-prod'  ## AFIP, GP, etc
#s3://test-datascience-recommendations/data/
recommendations_bucket='test-datascience-recommendations'  # Para outputs
kms_key_arn='arn:aws:kms:us-east-1:322149183112:key/9cc44b23-c5e9-46cb-9987-0982d21f8d00' ## key para desencriptar

In [61]:
s3 = boto3.resource('s3')

# Guardar el archivo .py
s3.meta.client.upload_file('get_data_transactions.py', 
                           recommendations_bucket, #bucket
                           'artifacts/code/transactions/get_data_transactions.py' #key+filename
)
print('.py uploaded')

.py uploaded


## 4. Creamos el job de GLUE

In [62]:
job = glue.create_job(Name=job_name, 
                      GlueVersion='2.0',
                      Role='iam_r_uala_arg_datalake_stage_glue',
                      Command={'Name': 'glueetl',
                               'ScriptLocation': f's3://{recommendations_bucket}/artifacts/code/transactions/get_data_transactions.py'},
                      DefaultArguments={
                        '--additional-python-modules': 'pip,setuptools,pyarrow==2,awswrangler==2.8.0,numpy==1.19.1,fsspec==0.8.2'},
                      MaxCapacity=1
                      )

In [63]:
job_run = glue.start_job_run(
    JobName = job_name,
    Arguments = {
        '--today':today,
        '--bucket_transactions_data': bucket_transactions_data,
        '--recommendations_bucket': recommendations_bucket,
        '--kms_key_arn': kms_key_arn
    } 
)

In [64]:
print(job_run)

{'JobRunId': 'jr_2a3b265bf14221a26c50e9161a4ad564e902f35048f1ac82596c0e63398db004', 'ResponseMetadata': {'RequestId': 'cb1b1ca1-57cd-4761-9c66-d7fff1473b35', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 21 Jul 2021 19:19:32 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '82', 'connection': 'keep-alive', 'x-amzn-requestid': 'cb1b1ca1-57cd-4761-9c66-d7fff1473b35'}, 'RetryAttempts': 0}}


In [65]:
MAX_WAIT_TIME=time.time() + 60*10 # 1 hour
max_time = time.time() + MAX_WAIT_TIME
while time.time() < max_time:
    response=glue.get_job_run(JobName=job_name, RunId=job_run['JobRunId'])
    status = response['JobRun']['JobRunState']
    print('Job run: {}'.format(status))
    
    if status == 'SUCCEEDED' or status == 'FAILED':
        break
        
    time.sleep(45)

Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED


## 5. Controlamos la carga de datos en el bucket


In [123]:
df=wr.s3.read_parquet(f's3://test-datascience-adquirencia-fraude/data/auxiliar/df_cotizacion.parquet')

In [129]:
oldColumns = df.columns
oldColumns=["NU_" + x for x in oldColumns if not str(x) in ["nan","asd","fecha"]]
oldColumns

['NU_venta_uala']

In [7]:
print(f'primer fecha coti {df.fecha.min()} ultima fecha coti {df.fecha.max()}')

primer fecha coti 2002-06-14 00:00:00 ultima fecha coti 2021-07-13 00:00:00


In [8]:
df.head()

Unnamed: 0,fecha,venta_uala
0,2002-06-14,3.869
1,2002-06-15,3.869
2,2002-06-16,3.869
3,2002-06-17,3.869
4,2002-06-18,3.9008


In [9]:
df.dtypes

fecha         datetime64[ns]
venta_uala           float64
dtype: object

## 6. Carga histórica


In [66]:
bucket_transactions_data='uala-arg-datalake-analytics-prod'  ## AFIP, GP, etc
recommendations_bucket='test-datascience-recommendations'  # Para outputs
kms_key_arn='arn:aws:kms:us-east-1:322149183112:key/9cc44b23-c5e9-46cb-9987-0982d21f8d00' ## key para desencriptar
list_fechas=['2020-06-01','2020-07-01','2020-08-01','2020-09-01','2020-10-01','2020-11-01','2020-12-01',
            '2021-01-01','2021-02-01','2021-03-01','2021-04-01','2021-05-01']
job_name='test-job_recommendations_transactions'


In [67]:
for value in list_fechas:
    print("Procesando:",value[:7])
    job_run = glue.start_job_run(
        JobName = job_name,
        Arguments = {
            '--today':value,
            '--bucket_transactions_data': bucket_transactions_data,
            '--recommendations_bucket': recommendations_bucket,
            '--kms_key_arn': kms_key_arn
        } 
    )
    MAX_WAIT_TIME= 3600 # 1 hour
    max_time = time.time() + MAX_WAIT_TIME
    while time.time() < max_time:
        response=glue.get_job_run(JobName=job_name, RunId=job_run['JobRunId'])
        status = response['JobRun']['JobRunState']
        print('Job run: {}'.format(status))

        if status == 'SUCCEEDED':
            time.sleep(60)
            break
        elif status == 'FAILED':
            print ("Error para fecha:",value[:7]," \n" )
            sys.exit(1)
        time.sleep(120)

Procesando: 2020-06
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-07
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-08
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-09
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-10
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-11
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2020-12
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2021-01
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: SUCCEEDED
Procesando: 2021-02
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job run: RUNNING
Job r

In [26]:
#pre
112085647+91896875+107893286+74785390+79832664+80947544+60897141+72981539+86242174+84235181+86334578

938132019

In [27]:
#post
1028539+942427+935155+1134141+862378+860491+429003+640624+828413+828413+882853

9372437

In [None]:
pre limpieza 60897141  colummas 31
post limpieza 429003 colummas 7
pre limpieza 72981539 columnas 31
post limpieza 640624 columnas 9
pre limpieza 86242174 columnas 31
post limpieza 828413 columnas 10
pre limpieza 84235181 columnas 31
post limpieza 828413 columnas 10
pre limpieza 86334578 columnas 31
post limpieza 882853 columnas 10

In [55]:
sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)

s3 = boto3.client(
    's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)


# Get the path to the file
s3_response_object = s3.get_object(Bucket='uala-arg-datalake-stage-prod', Key='ar/afip/tb_ar_afip_actividades/dt=2021-03-16/')

# Read your file, i.e. convert it from a stream to bytes using .read()
df = s3_response_object['Body'].read()

# Read your file using BytesIO
df = pd.read_parquet(BytesIO(df))


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

In [None]:
import io
import pandas as pd


sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)

buffer = io.BytesIO()
s3 = boto3.resource( 's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)

objeto = s3.Object('ACA EL BUCKET', 'aca/va/el/path.parquet')
objeto.download_fileobj(buffer)
df = pd.read_parquet(buffer)

In [80]:
s3=boto3.resource('s3')
df=pd.read_csv('s3://test-datascience-recommendations/data/param/param_mcc_category.csv')
df.mcc=df.mcc.astype(str)

In [81]:
df.dtypes

mcc         object
category    object
dtype: object

In [83]:
df.mcc.value_counts().sort_values()

5231    1
7338    1
7012    1
7342    1
5697    1
       ..
5945    1
5976    1
5172    1
3035    1
5947    1
Name: mcc, Length: 344, dtype: int64

In [None]:
today = '2021-01-10'
def first_and_last(today):
    fecha=datetime.strptime(today, '%Y-%m-%d').date()
    first_day=fecha.replace(day=1)
    next_month = fecha.replace(day=28) + timedelta(days=4)
    last_day_of_month = next_month - timedelta(days=next_month.day)
    return first_day,last_day_of_month
first_day,last_day = first_and_last(today)

keys_='ar/transactions/tb_ar_transactions_events_stage/'

sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)

s3 = boto3.client(
    's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)


paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_transactions_data, Prefix='ar/transactions/tb_ar_transactions_events_stage/')
files_in_bucket=[]
for page in pages:
    files_page=[key['Key'] for key in page['Contents']]
    files_in_bucket+=files_page
files_objets = [f"s3://{bucket_transactions_data}/" + i for i in files_in_bucket if
                    (keys_ in i)  and (i.find('.parquet') >= 0)]
df_bucket_files = pd.DataFrame({
        'key': [i[:(i.find('dt=') + 14)] for i in files_objets],
        'path': files_objets,
        'date': pd.to_datetime([i[(i.find('dt=') + 3):(i.find('dt=') + 13)] for i in files_objets])
    })
#files=list(df_bucket_files.loc[df_bucket_files['date'].between(str(first_day),str(last_day)),'path'].values)
files=list(df_bucket_files.loc[:,'path'].values)
files

In [18]:
sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)

s3 = boto3.client(
    's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)
files_bucket=[key['Key'] for key in s3.list_objects(Bucket=bucket_transactions_data)['Contents']]

In [22]:
keys_

'ar/amplitude/tb_ar_amplitude_events_stage/'

In [17]:
bucket_transactions_data

'uala-arg-datalake-stage-prod'

In [218]:
sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)
def get_matching_s3_keys(bucket, prefix, suffix):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    s3 = boto3.client('s3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
                     )
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:

        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects.
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next response, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

In [223]:

for key in get_matching_s3_keys(bucket='bukkit', prefix='images/', suffix='.parquet'):
    print('hola')

ClientError: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied

In [221]:
asd

<generator object get_matching_s3_keys at 0x7f5cbb6900f8>

In [214]:
asd

<generator object keys at 0x7f5cb95baba0>

In [None]:
files_in_bucket=[key['Key'] for key in s3.list_objects(Bucket=bucket_transactions_data)['Contents']]
files_in_bucket

In [188]:
keys_

'ar/amplitude/tb_ar_amplitude_events_stage/'

In [9]:

sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)
s3 = boto3.client(
    's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_transactions_data)
files_in_bucket=[]
for page in pages:
    files_page=[key['Key'] for key in s3.list_objects(Bucket=bucket_transactions_data)['Contents']]
    files_in_bucket+=files_page

ClientError: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied

In [180]:
keys_

'ar/amplitude/tb_ar_amplitude_events_stage/'

In [168]:
keys_

'ar/amplitude/tb_ar_amplitude_events_stage/'

In [47]:
df_bucket_files.date.value_counts().sort_index()


2019-11-20     7
2019-11-21    13
2019-11-22    18
2019-11-23    14
2019-11-24    12
              ..
2021-07-04    24
2021-07-05    28
2021-07-06    24
2021-07-07    24
2021-07-08    14
Name: date, Length: 597, dtype: int64

In [45]:

df_bucket_files[(df_bucket_files.date>='2021-01-01')&(df_bucket_files.date<='2021-01-31')].date.value_counts().sort_index()
#df_bucket_files.date.value_counts().sort_index()

2021-01-01    24
2021-01-02    24
2021-01-03    24
2021-01-04    24
2021-01-05    24
2021-01-06    24
2021-01-07    24
2021-01-08    24
2021-01-09    24
2021-01-10    24
2021-01-11    24
2021-01-12    24
2021-01-13    24
2021-01-14    24
2021-01-15    24
2021-01-16    24
2021-01-17    24
2021-01-18    24
2021-01-19    24
2021-01-20    24
2021-01-21    24
2021-01-22    24
2021-01-23    24
2021-01-24    24
2021-01-25    24
2021-01-26    24
2021-01-27     8
2021-01-28    24
2021-01-29    24
2021-01-30    24
2021-01-31    24
Name: date, dtype: int64

In [37]:
df_bucket_files.dtypes

key             object
path            object
date    datetime64[ns]
dtype: object

In [11]:
paginator = s3.get_paginator('list_objects')
pages = paginator.paginate(Bucket=bucket_transactions_data, Prefix=keys_)
files_in_bucket=[]
for page in pages:
    files_page=[key['Key'] for key in s3.list_objects(Bucket=bucket_transactions_data)['Contents']]
    files_in_bucket+=files_page

ClientError: An error occurred (AccessDenied) when calling the ListObjects operation: Access Denied

In [55]:
pd.set_option('display.max_columns', 500)  # or 1000
pd.set_option('display.max_rows', 500)  # or 1000
pd.set_option('display.max_colwidth', 199)  # or 199
df_bucket_files.path

0        s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2019-11-20/1e4fdbb9488148c68c9d29622a49edc1.snappy.parquet
1        s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2019-11-20/375adef7d335448889605a3f7f7614b6.snappy.parquet
2        s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2019-11-20/8952a93f527449aa8bcc72c015f72c2b.snappy.parquet
3        s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2019-11-20/ac377b2e101346e6be51fef2bb0e4bb3.snappy.parquet
4        s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2019-11-20/be8c30ddb4244c9c9854dd230a8d655a.snappy.parquet
                                                                           ...                                                                    
14234    s3://uala-arg-datalake-stage-prod/ar/amplitude/tb_ar_amplitude_events_stage/dt=2021-07-08/ae0624f3786f440e820

In [67]:
import io
import pandas as pd

sts = boto3.client('sts')
response = sts.assume_role(
    RoleArn='arn:aws:iam::514405401387:role/aws-rol-ml-read-stage-prod', #es el rol que existe en produccion por el cual "nos hacemos pasar" para acceder a los buckets de s3
    RoleSessionName='sesion-dsr-recomendaciones', # nombre que le damos a la sesión
    DurationSeconds=3600 # es el tiempo que dura la sesion por default si no especificamos este parámetro.
)

buffer = io.BytesIO()
s3 = boto3.resource( 's3',
    aws_access_key_id=response['Credentials']['AccessKeyId'],
    aws_secret_access_key=response['Credentials']['SecretAccessKey'],
    aws_session_token=response['Credentials']['SessionToken']
)

objeto = s3.Object('uala-arg-datalake-stage-prod', 'ar/configs/tb_ar_configs_params/be46f255d9c44e59a70edbbf0b815874.snappy.parquet')
objeto.download_fileobj(buffer)
df = pd.read_parquet(buffer)

In [80]:
df.param_type.unique()

<StringArray>
[      'account_status', 'card_delivery_status',          'card_status',
              'country',             'currency',  'customer_occupation',
         'general_type',       'marital_status',                  'mcc',
               'origin',             'province',             'rejected',
               'states', 'telerecargas_service',   'transaction_status',
     'transaction_type']
Length: 16, dtype: string

In [79]:
pd.set_option('display.max_rows', 500)
df[df.param_type == 'mcc'].head(500)

Unnamed: 0,param_type,id,value_1,value_2,value_3,value_4,value_5
505,mcc,3531,ASTIR HOTELS,ASTIR HOTELS,ASTIR HOTELS,Hotels/Motels/Inns/Resorts,Yes
506,mcc,8041,Chiropractors,Chiropractors,Chiropractors,Chiropractors,Yes
507,mcc,5960,Direct Marketing- Insurance Service,Direct Marketing- Insurance Service,Direct Marketing- Insurance Service,Direct Marketing - Insurance Services,Yes
508,mcc,3399,Car Rental,Car Rental,,Car Rental,Yes
509,mcc,3089,TRANSAERO,TRANSAERO,TRANSAERO,Airlines,Yes
510,mcc,9402,Postal Services – Government Only,Postal Services – Government Only,Postal Services – Government Only,Postal Services - Government Only,No1.6041-3(p)(3)
511,mcc,3265,Airlines,Airlines,,Airlines,Yes
512,mcc,3163,Airlines,Airlines,,Airlines,Yes
513,mcc,3221,TRANSPORTES AEROS MILITARES ECCUATORANOS,TRANSPORTES AEROS MILITARES ECCUATORANOS,TRANSPORTES AEROS MILITARES ECCUATORANOS,Airlines,Yes
514,mcc,5978,"Typewriter Stores – Sales, Rental, Service","Typewriter Stores – Sales, Rental, Service","Typewriter Stores – Sales, Rental, Service",Typewriter Stores,No1.6041-3(c)
