In [1]:
import os
import json
import boto3
import requests
import pandas as pd
import pyarrow as pa
import awswrangler as wr
import pyarrow.parquet as pq

from io import BytesIO
from datetime import datetime

In [2]:
# Show full cell contents in DataFrame displays instead of truncating long values.
pd.options.display.max_colwidth = None

In [3]:
# Build an AWS Secrets Manager client from the named local AWS profile.
client = boto3.session.Session(profile_name='CRISTIAN_AWS').client('secretsmanager', region_name='us-east-1')

# Fetch the secret and parse its payload (a JSON object stored as a string).
response = client.get_secret_value(SecretId='env/openweather')
secret_data = response['SecretString']
secrets_dict = json.loads(secret_data)

# Read the API token. The key spelling must match the key stored in the secret.
open_weather_token = secrets_dict.get('OPEN_WEATHER_SECRECT', '')

# Fail fast: an empty token would only surface later as an API error response.
if not open_weather_token:
    raise ValueError("Secret 'env/openweather' has no non-empty 'OPEN_WEATHER_SECRECT' value")

In [4]:
# Current-weather endpoint for Novo Hamburgo (BR); the token is sent as the APPID query parameter.
API_LINK = (
    "https://api.openweathermap.org/data/2.5/weather"
    f"?q=novo hamburgo,br&APPID={open_weather_token}"
)

In [5]:
# Call the API with a timeout so the cell cannot hang indefinitely on a stalled connection.
requisicao = requests.get(API_LINK, timeout=30)

# Stop on HTTP errors (e.g. 401 for a bad token) so an error payload is never
# parsed and stored in the data lake as if it were weather data.
requisicao.raise_for_status()

In [6]:
# Decode the JSON body of the API response.
wheater_json = requisicao.json()

In [7]:
# Display the parsed API response for inspection.
wheater_json

{'coord': {'lon': -51.1306, 'lat': -29.6783},
 'weather': [{'id': 801,
   'main': 'Clouds',
   'description': 'few clouds',
   'icon': '02d'}],
 'base': 'stations',
 'main': {'temp': 302.3,
  'feels_like': 305.28,
  'temp_min': 299.86,
  'temp_max': 305.23,
  'pressure': 1010,
  'humidity': 65,
  'sea_level': 1010,
  'grnd_level': 1006},
 'visibility': 10000,
 'wind': {'speed': 2.77, 'deg': 116, 'gust': 3.42},
 'clouds': {'all': 22},
 'dt': 1704481549,
 'sys': {'type': 2,
  'id': 2020383,
  'country': 'BR',
  'sunrise': 1704443410,
  'sunset': 1704493729},
 'timezone': -10800,
 'id': 3456068,
 'name': 'Novo Hamburgo',
 'cod': 200}

In [8]:
# Serialize the parsed response back into a JSON string (used as the S3 object body).
weather_json_string = json.dumps(wheater_json)

Save raw JSON to the data lake

In [9]:
# S3 client built from the named local AWS profile (no credentials are written in the notebook).
profile_name = 'CRISTIAN_AWS'
bucket_name = 'data-integration-projects'
s3 = boto3.session.Session(profile_name=profile_name).client('s3')

# Execution timestamp: used both for the date partition and for the file name.
# NOTE(review): datetime.now() is local time, while the payload's 'dt' is converted as UTC later on.
current_date = datetime.now()
dia_da_execucao = current_date.strftime('%Y-%m-%d-%H-%M-%S')

# Zero-padded date parts for the 'year/month/day/' partition.
year = f'{current_date.year}'
month = f'{current_date.month}'.zfill(2)
day = f'{current_date.day}'.zfill(2)

# Object location: <prefix>/<year>/<month>/<day>/weather_data_<timestamp>.json
directory_path = 'openweather-data-lake/raw-data/' + f'{year}/{month}/{day}/'
file_name = f'weather_data_{dia_da_execucao}.json'

# Upload the raw JSON string as a single S3 object.
s3.put_object(
    Bucket=bucket_name,
    Key=f'{directory_path}{file_name}',
    Body=weather_json_string,
)

print(f'Arquivo salvo em: {bucket_name}/{directory_path}{file_name}')

Arquivo salvo em: data-integration-projects/openweather-data-lake/raw-data/2024/01/05/weather_data_2024-01-05-16-05-49.json


Import data from the data lake

In [10]:
# boto3 Session for the named AWS profile; it is handed to awswrangler as boto3_session.
# Note that, unlike in the upload cells, `s3` here is a Session and not an S3 client.
profile_name = 'CRISTIAN_AWS'
s3 = boto3.session.Session(profile_name=profile_name)

# "bucket/prefix" of the raw JSON files, plus the (hard-coded) date partition to load.
bucket_name = 'data-integration-projects/openweather-data-lake/raw-data'
year, month, day = "2024", "01", "05"

# Read every object under that day's prefix as line-delimited JSON into one DataFrame.
df = wr.s3.read_json(f's3://{bucket_name}/{year}/{month}/{day}/*', lines=True, boto3_session=s3)

In [12]:
# Preview the first rows of the raw frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,coord,weather,base,main,visibility,wind,clouds,dt,sys,timezone,id,name,cod
0,"{'lon': -51.1306, 'lat': -29.6783}","[{'id': 801, 'main': 'Clouds', 'description': 'few clouds', 'icon': '02d'}]",stations,"{'temp': 302.48, 'feels_like': 305.45, 'temp_min': 300.41, 'temp_max': 305.23, 'pressure': 1010, 'humidity': 64, 'sea_level': 1010, 'grnd_level': 1006}",10000,"{'speed': 2.77, 'deg': 116, 'gust': 3.42}",{'all': 22},1704479982,"{'type': 2, 'id': 2020383, 'country': 'BR', 'sunrise': 1704443410, 'sunset': 1704493729}",-10800,3456068,Novo Hamburgo,200
0,"{'lon': -51.1306, 'lat': -29.6783}","[{'id': 801, 'main': 'Clouds', 'description': 'few clouds', 'icon': '02d'}]",stations,"{'temp': 302.48, 'feels_like': 305.45, 'temp_min': 300.41, 'temp_max': 305.23, 'pressure': 1010, 'humidity': 64, 'sea_level': 1010, 'grnd_level': 1006}",10000,"{'speed': 2.77, 'deg': 116, 'gust': 3.42}",{'all': 22},1704479982,"{'type': 2, 'id': 2020383, 'country': 'BR', 'sunrise': 1704443410, 'sunset': 1704493729}",-10800,3456068,Novo Hamburgo,200
0,"{'lon': -51.1306, 'lat': -29.6783}","[{'id': 801, 'main': 'Clouds', 'description': 'few clouds', 'icon': '02d'}]",stations,"{'temp': 302.3, 'feels_like': 305.28, 'temp_min': 299.86, 'temp_max': 305.23, 'pressure': 1010, 'humidity': 65, 'sea_level': 1010, 'grnd_level': 1006}",10000,"{'speed': 2.77, 'deg': 116, 'gust': 3.42}",{'all': 22},1704481549,"{'type': 2, 'id': 2020383, 'country': 'BR', 'sunrise': 1704443410, 'sunset': 1704493729}",-10800,3456068,Novo Hamburgo,200


In [13]:
# Flatten the nested 'coord' dicts into one column per coordinate.
for axis in ('lon', 'lat'):
    df[axis] = df['coord'].map(lambda point, key=axis: point[key])

In [14]:
def _first_description(entries):
    """Return the 'description' of the first weather entry, or None if it is unavailable."""
    if not entries:
        return None
    if not isinstance(entries, list):
        return None
    if 'description' not in entries[0]:
        return None
    return entries[0]['description']


# 'weather' holds a list of condition dicts; keep only the first one's description.
df['clouds_description'] = df['weather'].apply(_first_description)

In [15]:
# Flatten the nested 'main' dicts: one column per measurement, in the same order as before.
for field in ('temp', 'feels_like', 'temp_min', 'temp_max', 'pressure', 'humidity'):
    df[field] = df['main'].map(lambda readings, key=field: readings[key])

In [16]:
# Take the 'all' value out of each nested 'clouds' dict.
df['clouds_number'] = df['clouds'].map(lambda cover: cover['all'])

In [17]:
# Take the country code out of each nested 'sys' dict.
df['country'] = df['sys'].map(lambda meta: meta['country'])

In [18]:
# Make sure the epoch column is numeric before converting it.
df['dt'] = pd.to_numeric(df['dt'])

# Interpret 'dt' as seconds since the Unix epoch and produce timezone-aware UTC timestamps in one step.
df['dt_utc'] = pd.to_datetime(df['dt'], unit='s', utc=True)

In [19]:
# Remove the nested source columns (already flattened above) along with 'wind', 'dt' and 'timezone'.
nested_columns = ["coord", "weather", "main", "wind", "clouds", "sys", "dt", "timezone"]
df = df.drop(columns=nested_columns)

In [20]:
# Each file read from S3 contributes a row labelled 0, so the index is duplicated.
# reset_index() returns a new frame; assign it back so the renumbering persists.
df = df.reset_index(drop=True)
df

Unnamed: 0,base,visibility,id,name,cod,lon,lat,clouds_description,temp,feels_like,temp_min,temp_max,pressure,humidity,clouds_number,country,dt_utc
0,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.48,305.45,300.41,305.23,1010,64,22,BR,2024-01-05 18:39:42+00:00
1,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.48,305.45,300.41,305.23,1010,64,22,BR,2024-01-05 18:39:42+00:00
2,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.3,305.28,299.86,305.23,1010,65,22,BR,2024-01-05 19:05:49+00:00


In [21]:
# Preview the first rows of the flattened frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,base,visibility,id,name,cod,lon,lat,clouds_description,temp,feels_like,temp_min,temp_max,pressure,humidity,clouds_number,country,dt_utc
0,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.48,305.45,300.41,305.23,1010,64,22,BR,2024-01-05 18:39:42+00:00
0,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.48,305.45,300.41,305.23,1010,64,22,BR,2024-01-05 18:39:42+00:00
0,stations,10000,3456068,Novo Hamburgo,200,-51.1306,-29.6783,few clouds,302.3,305.28,299.86,305.23,1010,65,22,BR,2024-01-05 19:05:49+00:00


Save to S3

In [22]:
# Upload the processed DataFrame to the data lake as a date-partitioned parquet object.
# The S3 client is built from the named local AWS profile.
profile_name = 'CRISTIAN_AWS'
s3 = boto3.session.Session(profile_name=profile_name).client('s3')
bucket_name = 'data-integration-projects'
directory_path = 'openweather-data-lake/processed-data/'

# Execution timestamp: used both for the date partition and for the file name.
current_date = datetime.now()
dia_da_execucao = current_date.strftime('%Y-%m-%d-%H-%M-%S')

# Zero-padded date parts for the 'year/month/day/' partition.
year = str(current_date.year)
month = str(current_date.month).zfill(2)
day = str(current_date.day).zfill(2)

# Object location: <prefix>/<year>/<month>/<day>/df_<timestamp>.parquet
directory_path = f'{directory_path}{year}/{month}/{day}/'
file_name = f'df_{dia_da_execucao}.parquet'

# Serialize the DataFrame built by the cells above. It must not be replaced by a
# placeholder frame here, otherwise the uploaded file holds no weather data.
# Writing to an in-memory buffer avoids leaving a parquet file in the working directory.
parquet_buffer = BytesIO()
df.to_parquet(parquet_buffer)

# Upload the parquet bytes as a single S3 object.
s3.put_object(Bucket=bucket_name, Key=directory_path + file_name, Body=parquet_buffer.getvalue())

print(f'Arquivo salvo em: {bucket_name}/{directory_path}{file_name}')


Arquivo salvo em: data-integration-projects/openweather-data-lake/processed-data/2024/01/05/df_2024-01-05-16-05-53.parquet


In [23]:
## Disabled code: parquet upload variant that writes directly under 'processed-data/'
## (no year/month/day partition in the key). Nothing in this cell executes.
#profile_name = 'CRISTIAN_AWS'
#s3 = boto3.session.Session(profile_name=profile_name).client('s3')
#bucket_name = 'data-integration-projects'
#directory_path = 'openweather-data-lake/processed-data/'
#dia_da_exucucao = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
#file_name = f'df{dia_da_exucucao}.parquet'
#df.to_parquet(file_name)
#s3.upload_file(file_name, bucket_name, directory_path + file_name)