# Phase 1

### 1. Identification of the data problem or objective and dataset selection
All the background information for the project will be in a PDF file in the resources folder.

### 2. Data extraction or collection


#### 2.1 Read the data from different sources (CSV files)

In [1]:
import os
from dotenv import load_dotenv
import psycopg2
from sqlalchemy import create_engine, text
import pandas as pd

# Load environment variables
load_dotenv()

db_staging_user = os.getenv("DB_STAGING_USER")
db_staging_password = os.getenv("DB_STAGING_PASSWORD")
db_staging_host = os.getenv("DB_STAGING_HOST")
db_staging_port = int(os.getenv("DB_STAGING_PORT", 5432))
db_staging_name = os.getenv("DB_STAGING_NAME")

resources_path = os.getenv("RESOURCES_PATH")
mascotas_propietarios_filename = "Mascotas_Propietarios_despensaAnimal_Generated.csv"
propietarios_transacciones_filename = "Propietarios_Transacciones_despensaAnimal_Generated.csv"

if db_staging_user is None:
    raise ValueError("DB_STAGING_USER is not set")
if db_staging_password is None:
    raise ValueError("DB_STAGING_PASSWORD is not set")
if db_staging_host is None:
    raise ValueError("DB_STAGING_HOST is not set")
if db_staging_port is None:
    raise ValueError("DB_STAGING_PORT is not set")
if db_staging_name is None:
    raise ValueError("DB_STAGING_NAME is not set")
if resources_path is None:
    raise ValueError("RESOURCES_PATH is not set")

connStaging = psycopg2.connect(
    dbname=db_staging_name,
    user=db_staging_user,
    password=db_staging_password,
    host=db_staging_host,
    port=db_staging_port
)
connStaging.autocommit = True   


#### 2.2 Create the database engine for the staging data

In [2]:
db_staging_engine = create_engine(f"postgresql://{db_staging_user}:{db_staging_password}@{db_staging_host}:{db_staging_port}/{db_staging_name}")

#### 2.3 Read the file "Mascotas_Propietarios_despensaAnimal_Generated.csv" and store it in a database table

In [3]:
df_mascotas_propietarios = pd.read_csv(f"{resources_path}/{mascotas_propietarios_filename}", delimiter=",", skiprows=1, low_memory=False)

# Rename columns after loading the data
df_mascotas_propietarios.columns = [
    "nombre_mascota",
    "raza",
    "peso",
    "fecha_nacimiento",
    "sexo",
    "temperamento",
    "numero_carnet",
    "estado_reproductivo",
    "numero_partos",
    "color",
    "fecha_fallecimiento",
    "motivo_fallecimiento",
    "comentarios_fallecimiento",
    "nombre_propietario",
    "ciudad",
    "direccion",
    "telefono",
    "whatsapp",
    "email",
    "tipo_documento",
    "numero_documento",
    "profesion",
    "estado",
    "notificaciones_whatsapp"
    ]

# Just print the first 20 rows to test
df_mascotas_propietarios.head(20)

# Store the CSV file into a database 
with db_staging_engine.connect() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS mascotas_propietarios_staging (
            nombre_mascota CHARACTER VARYING,
            raza CHARACTER VARYING,
            peso CHARACTER VARYING,
            fecha_nacimiento DATE,
            sexo CHARACTER VARYING,
            temperamento CHARACTER VARYING,
            numero_carnet CHARACTER VARYING,
            estado_reproductivo CHARACTER VARYING,
            numero_partos CHARACTER VARYING,
            color CHARACTER VARYING,
            fecha_fallecimiento DATE,
            motivo_fallecimiento CHARACTER VARYING,
            comentarios_fallecimiento CHARACTER VARYING,
            nombre_propietario CHARACTER VARYING,
            ciudad CHARACTER VARYING,
            direccion CHARACTER VARYING,
            telefono CHARACTER VARYING,
            whatsapp CHARACTER VARYING,
            email CHARACTER VARYING,
            tipo_documento CHARACTER VARYING,
            numero_documento CHARACTER VARYING,
            profesion CHARACTER VARYING,
            estado CHARACTER VARYING,
            notificaciones_whatsapp CHARACTER VARYING
        );
    """))
    conn.commit()
    print("Table 'mascotas_propietarios_staging' created successfully.")

# Store DataFrame into PostgreSQL table named 'mascotas_propietarios_staging'
df_mascotas_propietarios.to_sql("mascotas_propietarios_staging", db_staging_engine, if_exists="append", index=False)

print("The CSV file data was inserted successfully!")

Table 'mascotas_propietarios_staging' created successfully.
The CSV file data was inserted successfully!


#### 2.4 Get data from the "mascotas_propietarios_staging" table to test

In [4]:
# Just print the first 20 rows to test
df_mascotas_propietarios_verification = pd.read_sql("SELECT * FROM mascotas_propietarios_staging LIMIT 20;", db_staging_engine)
df_mascotas_propietarios_verification

Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,numero_carnet,estado_reproductivo,numero_partos,color,...,ciudad,direccion,telefono,whatsapp,email,tipo_documento,numero_documento,profesion,estado,notificaciones_whatsapp
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,1631,Entero,,,...,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,,3207201137,lizethurrego1990@gmail.com,CC,1036637677,,Activo,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,1650,,,,...,Santiago de Cali,cra 85 c # 33- 40 casa 54,3058147632.0,3157929392,pa_ordonez@hotmail.com,CC,66987417,,Activo,Activo
2,SIMON VACCA,PUG,,2021-08-01,Macho,social,1184,,,,...,,Cra98B #45-200 SAN MIGUEL,,3234191060,,CC,,,Activo,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.6,2016-10-07,Hembra,social,1359,ENTERA,NINGUNO,,...,,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437.0,3146096191,,CC,1006107262,,Activo,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,1632,,,,...,Cali,"Calle 45#98B-65 Apto 403, torre 8",,3216066041,jazmin.dag@gmail.com,CC,1061750508,,Activo,Activo
5,DULCE RODRIGUEZ,PINSCHER,,2020-04-01,Macho,social,1628,Esterilizado,,,...,Cali,CRA98B#48-127 VALLE DEL LILI,,3175175266,leidyjoroma@gmail.com,CC,2569816,,Activo,Activo
6,EURO LONDOÑO,SPHYNX,,,Macho,social,1634,,,,...,CALI,Calle 13 #98 -10 Multicentro 2 Apto 801 torre 1,,3225529091,,CC,,,Activo,Activo
7,MOLLY GIRALDO,Mestizo,,2018-10-15,Hembra,social,1636,,,,...,,CLL 28 # 96-186 APTO 301 TORRE D PORTAL DEL LI...,,3164403237,,CC,1116275102,,Activo,Activo
8,MAXIMILIANO BOHORQUEZ,ANGORA,,,Macho,social,1657,,,,...,,Cra 98b # 34-53 Guadalquivir casa 76,,3186156608,Memis0914@gmail.com,CC,1.023.865.148,,Activo,Activo
9,LUPE GIRALDO,Poodle,,2018-04-14,Hembra,social,1638,,,,...,,CLL 28 # 96-186 APTO 301 TORRE D PORTAL DEL LI...,,3164403237,,CC,1116275102,,Activo,Activo


#### 2.5 Read the file "Propietarios_Transacciones_despensaAnimal_Generated.csv" and store it in a database table

In [5]:
df_propietarios_transacciones = pd.read_csv(f"{resources_path}/{propietarios_transacciones_filename}", delimiter=",", skiprows=1, dtype={"numero_documento": str}, low_memory=False)

# Rename columns after loading the data
df_propietarios_transacciones.columns = [
    "nombre_propietario",
    "tipo_documento",
    "numero_documento",
    "nombre_mascota",
    "servico_prestado",
    "valor_servicio",
    "fecha_servicio"
    ]

# Just print the first 20 rows to test
df_propietarios_transacciones.head(20)

# Store the CSV file into a database 
with db_staging_engine.connect() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS propietarios_transacciones_staging (
            nombre_propietario CHARACTER VARYING,
            tipo_documento CHARACTER VARYING,
            numero_documento CHARACTER VARYING,
            nombre_mascota CHARACTER VARYING,
            servico_prestado CHARACTER VARYING,
            valor_servicio NUMERIC,
            fecha_servicio DATE            
        );
    """))
    conn.commit()
    print("Table 'propietarios_transacciones_staging' created successfully.")

# Store DataFrame into PostgreSQL table named 'propietarios_transacciones_staging'
df_propietarios_transacciones.to_sql("propietarios_transacciones_staging", db_staging_engine, if_exists="append", index=False)

print("The CSV file data was inserted successfully!")

Table 'propietarios_transacciones_staging' created successfully.
The CSV file data was inserted successfully!


#### 2.6 Get data from the "propietarios_transacciones_staging" table to test

In [6]:
# Just print the first 20 rows to test
df_propietarios_transacciones = pd.read_sql("SELECT * FROM propietarios_transacciones_staging LIMIT 20;", db_staging_engine)
df_propietarios_transacciones

Unnamed: 0,nombre_propietario,tipo_documento,numero_documento,nombre_mascota,servico_prestado,valor_servicio,fecha_servicio
0,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Vacunación,1406023.0,2017-11-22
1,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Otros,125698.0,2019-10-06
2,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Peluquería,1815514.0,2019-05-18
3,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Otros,1592900.0,2023-03-20
4,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Vacunación,147163.0,2022-10-08
5,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Peluquería,1230361.0,2024-05-19
6,LIZETH URREGO,CC,1036637677,THANOS URREGO,Eutanasia,105455.0,2025-03-10
7,PAOLA ORDONEZ,CC,66987417,TINENK ORDONEZ,Desparacitada,911422.0,2017-10-30
8,PAOLA ORDONEZ,CC,66987417,TINENK ORDONEZ,Desparacitada,1061883.0,2020-07-12
9,PAOLA ORDONEZ,CC,66987417,TINENK ORDONEZ,Venta Alimentos,1019826.0,2017-05-15
