# 1. Extracción

In [None]:
import pandas as pd
import configparser
import boto3

Liga para el dataset de [consumo de agua](https://datos.cdmx.gob.mx/dataset/consumo-agua)

In [None]:
consumo_agua = pd.read_csv('https://datos.cdmx.gob.mx/dataset/eb38823c-488a-49e8-a2cf-62e628fa246f/resource/2263bf74-c0ed-4e7c-bb9c-73f0624ac1a9/download/consumo-agua.csv')
consumo_agua.head(2)

# 2. Carga a S3

In [None]:
consumo_agua.to_csv('../../data/consumo_agua.csv',index=False)

In [None]:
# Load the aws_boto_credentials values
parser = configparser.ConfigParser()
parser.read("../../pipeline.conf")
access_key = parser.get("aws_boto_credentials","access_key")
secret_key = parser.get("aws_boto_credentials","secret_key")
bucket_name = parser.get("aws_boto_credentials","bucket_name")

s3_client = boto3.client('s3',aws_access_key_id = access_key, aws_secret_access_key = secret_key)
consumo_agua_path = "../../data/consumo_agua.csv"
s3_file_name  = "data_raw/consumo_agua.csv"

In [None]:
s3_client.upload_file(
    consumo_agua_path,
    bucket_name,
    s3_file_name)

# 3. Extracción desde S3

In [None]:
# downloading file
    
with open('consumo_agua.csv', 'wb') as f:
    s3_client.download_fileobj(bucket_name, s3_file_name, f)
    
consumo_agua = pd.read_csv('consumo_agua.csv')
consumo_agua.head(2)

In [None]:
# Without downloading file
response = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    consumo_agua = pd.read_csv(response.get("Body"))
    #print(consumo_agua)
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")
    
consumo_agua.head(2)

# 4. EDA

In [None]:
consumo_agua.columns

In [None]:
len(consumo_agua.columns)

In [None]:
consumo_agua.shape

In [None]:
consumo_agua.dtypes

In [None]:
pd.DataFrame(consumo_agua.dtypes).value_counts()

In [17]:
consumo_agua['id'].nunique()

71102

In [18]:
# Número de variables únicas por cada columna
num_unique_vars = pd.DataFrame(consumo_agua.nunique())
num_unique_vars.columns = ['count_of_unique_variables']
num_unique_vars

Unnamed: 0,count_of_unique_variables
id,71102
geo_point_2d,22930
geo_shape,22922
consumo_total_mixto,24339
anio,1
nomgeo,17
consumo_prom_dom,52060
consumo_total_dom,47051
alcaldia,16
colonia,1340


In [23]:
# Conteo de valores nulos
consumo_agua.isna().sum()

id                         0
geo_point_2d               0
geo_shape                 24
consumo_total_mixto     8327
anio                       0
nomgeo                     0
consumo_prom_dom        4820
consumo_total_dom       4820
alcaldia                   0
colonia                    0
consumo_prom_mixto      8327
consumo_total              0
consumo_prom               0
consumo_prom_no_dom        0
bimestre                   0
consumo_total_no_dom       0
gid                        0
indice_des                 0
dtype: int64

In [20]:
consumo_agua['alcaldia'].unique()

array(['GUSTAVO A. MADERO', 'AZCAPOTZALCO', 'COYOACAN', 'TLALPAN',
       'MILPA ALTA', 'ALVARO OBREGON', 'CUAJIMALPA', 'TLAHUAC',
       'MAGDALENA CONTRERAS', 'XOCHIMILCO', 'IZTAPALAPA', 'IZTACALCO',
       'BENITO JUAREZ', 'MIGUEL HIDALGO', 'CUAUHTEMOC',
       'VENUSTIANO CARRANZA'], dtype=object)

In [19]:
consumo_agua['nomgeo'].unique()

array(['Gustavo A. Madero', 'Azcapotzalco', 'Coyoacán', 'Talpan',
       'Milpa Alta', 'Álvaro Obregón', 'Cuajimalpa de Morelos', 'Tláhuac',
       'La Magdalena Contreras', 'Xochimilco', 'Iztapalapa', 'Iztacalco',
       'Benito Juárez', 'Miguel Hidalgo', 'Cuauhtémoc',
       'Venustiano Carranza', 'Tlalpan'], dtype=object)

In [24]:
consumo_agua['indice_des'].unique()

array(['ALTO', 'MEDIO', 'POPULAR', 'BAJO'], dtype=object)

# 5. Transformaciones

In [None]:
consumo_agua.drop(columns = 'id')

# 6. Carga a PostgreSQL

In [26]:
with open('/extract_to_JSON/extraction_16_06_2003.json', 'r') as f:
    s3_client.download_fileobj(bucket_name, s3_file_name, f)
    
# consumo_agua = pd.read_csv('consumo_agua.csv')
# consumo_agua.head(2)

FileNotFoundError: [Errno 2] No such file or directory: '/extract_to_JSON/extraction_16_06_2003.json'

In [30]:
s3_file_name = 'extract_to_JSON/extraction_16-06-2003.json'

In [52]:
# Without downloading file
response = s3_client.get_object(Bucket=bucket_name, Key=s3_file_name)

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

import json
data = json.load(response.get("Body"))
for line in data['results']:
    print(line.get('FOLIO'))
# if status == 200:
#     print(f"Successful S3 get_object response. Status - {status}")
#     consumo_agua = pd.read_csv(response.get("Body"))
#     #print(consumo_agua)
# else:
#     print(f"Unsuccessful S3 get_object response. Status - {status}")
    
# consumo_agua.head(2)

'0673800000103
'0673800000203
'0608400000103
'0000700000103
'0000700000203
'0000700000303
'1850000000103
'0001100000103
'1857600000103
'0002100000103
'0000600000103
'0002700000103
'1236000000103
'2042100000103
'0912000000103
'2042100000203
'0000400000103
'0001700000103
'1857200000103
'0001300000103
'0673800000303
'0001700000203
'0001600000103
'0210000000103
'0000900000103
'0000900000203
'1616100000103
'0001000000103
'0002700000203
'0001000000203
'1612100000103
'1510000000103
'1510000000203
'0002000000103
'0002700000303
'1850000000203
'1850000000303
'0002000000203
'0001100000203
'0000400000203
'0000600000203
'1850000000403
'1850000000503
'1850000000603
'1850000000703
'0000600000303
'1850000000803
'1850000000903
'1850000001003
'1850000001103
'2210300000103
'0917900000103
'1816400000103
'0001400000103
'0001400000203
'0001400000303
'1127900000103
'1127900000203
'1127900000303
'1127900000503
'1127900000403
'0613100000103
'1127900000603
'1857500000103
'2031200000103
'1850000001203
'000120000

In [33]:
status

200

In [34]:
response.get("Body").

<botocore.response.StreamingBody at 0x256a8850a88>

In [47]:
import json
data = json.load(response.get("Body"))
for line in data['results']:
    line.items()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)