# Phase 2

### Data transformation

### 1. Read the data from the staging area and find outliers, null or empty data, and more.

In [1]:
import os
from dotenv import load_dotenv
import psycopg2
from sqlalchemy import create_engine, text
import pandas as pd

# Load environment variables
load_dotenv()

db_staging_user = os.getenv('DB_STAGING_USER')
db_staging_password = os.getenv('DB_STAGING_PASSWORD')
db_staging_host = os.getenv('DB_STAGING_HOST')
db_staging_port = int(os.getenv('DB_STAGING_PORT', 5432))
db_staging_name = os.getenv('DB_STAGING_NAME')

resources_path = os.getenv('RESOURCES_PATH')
mascotas_propietarios_filename = 'Mascotas_Propietarios_despensaAnimal_Generated.csv'
propietarios_transacciones_filename = 'Propietarios_Transacciones_despensaAnimal_Generated.csv'

if db_staging_user is None:
    raise ValueError('DB_STAGING_USER is not set')
if db_staging_password is None:
    raise ValueError('DB_STAGING_PASSWORD is not set')
if db_staging_host is None:
    raise ValueError('DB_STAGING_HOST is not set')
if db_staging_port is None:
    raise ValueError('DB_STAGING_PORT is not set')
if db_staging_name is None:
    raise ValueError('DB_STAGING_NAME is not set')
if resources_path is None:
    raise ValueError('RESOURCES_PATH is not set')

connStaging = psycopg2.connect(
    dbname=db_staging_name,
    user=db_staging_user,
    password=db_staging_password,
    host=db_staging_host,
    port=db_staging_port
)
connStaging.autocommit = True   

#### 1.1 Create the database engine for the staging data

In [2]:
db_staging_engine = create_engine(f'postgresql://{db_staging_user}:{db_staging_password}@{db_staging_host}:{db_staging_port}/{db_staging_name}')

df_mascotas_propietarios_staging = pd.read_sql('SELECT * FROM mascotas_propietarios_staging;', db_staging_engine)
df_mascotas_propietarios_staging

Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,numero_carnet,estado_reproductivo,numero_partos,color,...,ciudad,direccion,telefono,whatsapp,email,tipo_documento,numero_documento,profesion,estado,notificaciones_whatsapp
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,1631,Entero,,,...,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,,3207201137,lizethurrego1990@gmail.com,CC,1036637677,,Activo,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,1650,,,,...,Santiago de Cali,cra 85 c # 33- 40 casa 54,3058147632,3157929392,pa_ordonez@hotmail.com,CC,66987417,,Activo,Activo
2,SIMON VACCA,PUG,,2021-08-01,Macho,social,1184,,,,...,,Cra98B #45-200 SAN MIGUEL,,3234191060,,CC,,,Activo,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.6,2016-10-07,Hembra,social,1359,ENTERA,NINGUNO,,...,,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,3146096191,,CC,1006107262,,Activo,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,1632,,,,...,Cali,"Calle 45#98B-65 Apto 403, torre 8",,3216066041,jazmin.dag@gmail.com,CC,1061750508,,Activo,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11994,Jennifer,Golden Retriever,46.1,2024-07-17,Hembra,Agresivo,FB13AD2F,Esterilizado,1,Negro,...,Cali,2256 Daniel Camp,3139761188,,jacob96@example.com,NIT,176580896,Panadero,Inactivo,Si
11995,Timothy,German Shepherd,41.79,2014-03-10,Macho,Agresivo,2361BF30,No Esterilizado,0,Blanco,...,Cali,194 Kimberly Tunnel Apt. 031,3184965612,,shelleyparker@example.com,NIT,130286470,Ingeniero,Activo,No
11996,Marcus,Golden Retriever,32.76,2014-09-16,Hembra,Sociable,85D44DE5,Esterilizado,0,Marrón,...,Cali,237 Castro Meadow,3168512890,,ojohnson@example.org,CE,194581954,Ingeniero,Activo,No
11997,James,Poodle,35.41,2005-08-23,Hembra,Sociable,4C95A084,No Esterilizado,5,Gris,...,Cali,6386 Miller Place,3201628333,,jasminecarr@example.com,NIT,174849231,Arquitecta,Inactivo,No


### 2. Remove data that not add value to the transacctions scenario

1.	comentarios_fallecimiento → Not useful for purchases or analysis of services.
2.	motivo_fallecimiento → Not related to transactions.
3.	fecha_fallecimiento → If the pet has died, it will no longer generate new purchases.
4.	numero_carnet → Not relevant to link purchases with clients.
5.	estado_reproductivo → Does not influence the purchase of services.
6.	numero_partos → Does not seem to affect the purchase of veterinary services.
7.	profesion → Not directly related to service purchases.
8.	notificaciones_whatsapp → Not used to analyze transactions.

In [3]:
# Function to drop unnecessary columns
def drop_unnecessary_columns(df: pd.DataFrame) -> pd.DataFrame:
    columns_to_drop = [
        'comentarios_fallecimiento',
        'motivo_fallecimiento',
        'fecha_fallecimiento',
        'numero_carnet',
        'estado_reproductivo',
        'numero_partos',
        'profesion',
        'notificaciones_whatsapp'
    ]

    for col in columns_to_drop:
        if col in df.columns:
            print(f'Removing column: {col}')
            df.drop(columns=[col], inplace=True)
        else:
            print(f'Warning: Column {col} not found in the dataset.')
    
    print(f'Columns removed. New shape: {df.shape}')
    return df

# Apply the function
df_mascotas_propietarios_cleaned = drop_unnecessary_columns(df_mascotas_propietarios_staging)
df_mascotas_propietarios_cleaned

Removing column: comentarios_fallecimiento
Removing column: motivo_fallecimiento
Removing column: fecha_fallecimiento
Removing column: numero_carnet
Removing column: estado_reproductivo
Removing column: numero_partos
Removing column: profesion
Removing column: notificaciones_whatsapp
Columns removed. New shape: (11999, 16)


Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario,ciudad,direccion,telefono,whatsapp,email,tipo_documento,numero_documento,estado
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,,3207201137,lizethurrego1990@gmail.com,CC,1036637677,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Santiago de Cali,cra 85 c # 33- 40 casa 54,3058147632,3157929392,pa_ordonez@hotmail.com,CC,66987417,Activo
2,SIMON VACCA,PUG,,2021-08-01,Macho,social,,ANDREA VACCA,,Cra98B #45-200 SAN MIGUEL,,3234191060,,CC,,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.6,2016-10-07,Hembra,social,,MARIA CAMILA PUERTA,,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,3146096191,,CC,1006107262,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,,YAZMIN ANACONA,Cali,"Calle 45#98B-65 Apto 403, torre 8",,3216066041,jazmin.dag@gmail.com,CC,1061750508,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11994,Jennifer,Golden Retriever,46.1,2024-07-17,Hembra,Agresivo,Negro,Steven Keller,Cali,2256 Daniel Camp,3139761188,,jacob96@example.com,NIT,176580896,Inactivo
11995,Timothy,German Shepherd,41.79,2014-03-10,Macho,Agresivo,Blanco,Christopher Clark,Cali,194 Kimberly Tunnel Apt. 031,3184965612,,shelleyparker@example.com,NIT,130286470,Activo
11996,Marcus,Golden Retriever,32.76,2014-09-16,Hembra,Sociable,Marrón,Lisa Reyes,Cali,237 Castro Meadow,3168512890,,ojohnson@example.org,CE,194581954,Activo
11997,James,Poodle,35.41,2005-08-23,Hembra,Sociable,Gris,Paul White,Cali,6386 Miller Place,3201628333,,jasminecarr@example.com,NIT,174849231,Inactivo


### 2.1 Analyze the field: ciudad and its different values

In [4]:
df_ciudad = df_mascotas_propietarios_cleaned.groupby("ciudad").size().reset_index(name="total_registros")
df_ciudad

Unnamed: 0,ciudad,total_registros
0,CAI,1
1,CALI,1692
2,CALUI,1
3,CRA 64 # 14-24,1
4,Cali,9259
5,Calle 60 b # 119 - 47 Torre 2 apto 602 Laurel,1
6,EEUU,5
7,JAMUNDI,8
8,Santiago de Cali,555
9,cali,54


We can see that the field: 'ciudad' doesn't have the same value and, also has information relates with countries, others towns and addresses, we must clean this data.

In [5]:
# Update all values in 'ciudad' column to 'Cali'
df_mascotas_propietarios_cleaned["ciudad"] = "Cali"

df_ciudad = df_mascotas_propietarios_cleaned.groupby("ciudad").size().reset_index(name="total_registros")
df_ciudad

Unnamed: 0,ciudad,total_registros
0,Cali,11999


### 2.2 Analyze the field: peso and its different values

In [6]:
df_mascotas_propietarios_cleaned

Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario,ciudad,direccion,telefono,whatsapp,email,tipo_documento,numero_documento,estado
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,,3207201137,lizethurrego1990@gmail.com,CC,1036637677,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,3058147632,3157929392,pa_ordonez@hotmail.com,CC,66987417,Activo
2,SIMON VACCA,PUG,,2021-08-01,Macho,social,,ANDREA VACCA,Cali,Cra98B #45-200 SAN MIGUEL,,3234191060,,CC,,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.6,2016-10-07,Hembra,social,,MARIA CAMILA PUERTA,Cali,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,3146096191,,CC,1006107262,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,,YAZMIN ANACONA,Cali,"Calle 45#98B-65 Apto 403, torre 8",,3216066041,jazmin.dag@gmail.com,CC,1061750508,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11994,Jennifer,Golden Retriever,46.1,2024-07-17,Hembra,Agresivo,Negro,Steven Keller,Cali,2256 Daniel Camp,3139761188,,jacob96@example.com,NIT,176580896,Inactivo
11995,Timothy,German Shepherd,41.79,2014-03-10,Macho,Agresivo,Blanco,Christopher Clark,Cali,194 Kimberly Tunnel Apt. 031,3184965612,,shelleyparker@example.com,NIT,130286470,Activo
11996,Marcus,Golden Retriever,32.76,2014-09-16,Hembra,Sociable,Marrón,Lisa Reyes,Cali,237 Castro Meadow,3168512890,,ojohnson@example.org,CE,194581954,Activo
11997,James,Poodle,35.41,2005-08-23,Hembra,Sociable,Gris,Paul White,Cali,6386 Miller Place,3201628333,,jasminecarr@example.com,NIT,174849231,Inactivo


We can see that the field: 'peso' mix numeric and string values like 9.2Kg. We must change that to only numeric values.

In [7]:
df_mascotas_propietarios_cleaned.groupby("peso").size().reset_index(name="total_registros")

Unnamed: 0,peso,total_registros
0,0.4,1
1,0.53,1
2,0.6,3
3,0.6 Kg,1
4,0.7,1
...,...,...
4899,9KG,1
4900,9Kg,1
4901,9kg,1
4902,9kg,1


In [8]:
# Convert all values in 'peso' column to float, if possible (otherwise, set to NaN)
df_mascotas_propietarios_cleaned["peso"] = pd.to_numeric(df_mascotas_propietarios_cleaned["peso"], errors='coerce')
df_mascotas_propietarios_cleaned.groupby("peso").size().reset_index(name="total_registros")


Unnamed: 0,peso,total_registros
0,0.40,1
1,0.53,1
2,0.60,3
3,0.70,1
4,1.00,3
...,...,...
4668,59.98,2
4669,59.99,1
4670,60.00,2
4671,64.00,1


### 2.3 Analyze the field: 'date' and its different values

In [9]:
df_mascotas_propietarios_cleaned.groupby("fecha_nacimiento").size().reset_index(name="total_registros")

Unnamed: 0,fecha_nacimiento,total_registros
0,0204-02-21,1
1,2002-05-18,1
2,2002-11-09,1
3,2003-02-23,1
4,2003-09-26,1
...,...,...
5728,2025-03-03,1
5729,2025-03-04,4
5730,2025-03-05,2
5731,2025-03-07,3


We can see that the field: 'fecha_nacimiento' has wrong date values like 0204-02-21. That date is wrong because the business
was created in the year 2016.

In [10]:
# Convert all values in 'fecha_nacimiento' column to a datetime greater than 2016-01-01
df_mascotas_propietarios_cleaned["fecha_nacimiento"] = pd.to_datetime(df_mascotas_propietarios_cleaned["fecha_nacimiento"], errors='coerce')
df_mascotas_propietarios_cleaned = df_mascotas_propietarios_cleaned[df_mascotas_propietarios_cleaned["fecha_nacimiento"] >= '2016-01-01']
df_mascotas_propietarios_cleaned.groupby("fecha_nacimiento").size().reset_index(name="total_registros")

Unnamed: 0,fecha_nacimiento,total_registros
0,2016-01-01,7
1,2016-01-02,3
2,2016-01-03,1
3,2016-01-04,2
4,2016-01-05,1
...,...,...
2646,2025-03-03,1
2647,2025-03-04,4
2648,2025-03-05,2
2649,2025-03-07,3


### 2.4 Analyze the field: 'telefono' and 'whatsapp'

In [11]:
# Ensure 'telefono' column only contains numeric values (otherwise, set to None)
df_mascotas_propietarios_cleaned.loc[:, "telefono"] = df_mascotas_propietarios_cleaned["telefono"].apply(
    lambda x: x if str(x).isnumeric() else None
)

# Fill missing 'telefono' values with 'whatsapp' numbers
df_mascotas_propietarios_cleaned.loc[:, "telefono"] = df_mascotas_propietarios_cleaned["telefono"].fillna(
    df_mascotas_propietarios_cleaned["whatsapp"]
)

# Display the updated DataFrame
df_mascotas_propietarios_cleaned[["telefono", "whatsapp"]]

# Drop the 'whatsapp' column
df_mascotas_propietarios_cleaned.drop(columns=["whatsapp"], inplace=True)
df_mascotas_propietarios_cleaned

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mascotas_propietarios_cleaned.drop(columns=["whatsapp"], inplace=True)


Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario,ciudad,direccion,telefono,email,tipo_documento,numero_documento,estado
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,3207201137,lizethurrego1990@gmail.com,CC,1036637677,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,3058147632,pa_ordonez@hotmail.com,CC,66987417,Activo
2,SIMON VACCA,PUG,,2021-08-01,Macho,social,,ANDREA VACCA,Cali,Cra98B #45-200 SAN MIGUEL,3234191060,,CC,,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.60,2016-10-07,Hembra,social,,MARIA CAMILA PUERTA,Cali,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,,CC,1006107262,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,,YAZMIN ANACONA,Cali,"Calle 45#98B-65 Apto 403, torre 8",3216066041,jazmin.dag@gmail.com,CC,1061750508,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11987,William,Poodle,11.05,2017-04-12,Hembra,Juguetón,Negro,Angela Moore,Cali,0106 Dudley Route,3143111203,hardingkenneth@example.com,NIT,125606839,Inactivo
11988,Anna,Golden Retriever,5.08,2024-10-28,Macho,Tranquilo,Negro,Steven Douglas,Cali,6458 Rogers Courts Suite 080,3191768598,jeremyhuber@example.net,CE,176912006,Activo
11990,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,3118659393,brandon31@example.net,CC,195453083,Inactivo
11991,Charles,Poodle,47.89,2017-12-10,Hembra,Tímido,Gris,Jose Reid,Cali,7585 Rodriguez Bypass,3024903661,huanglatoya@example.net,CE,162723607,Inactivo


### 2.5 Analyze the field: 'tipo_documento' and 'numero_documento'

In [12]:
# Remove every row where 'numero_documento' is NaN or empty
df_mascotas_propietarios_cleaned = df_mascotas_propietarios_cleaned.dropna(subset=["numero_documento"])
df_mascotas_propietarios_cleaned = df_mascotas_propietarios_cleaned[df_mascotas_propietarios_cleaned["numero_documento"] != ""]
df_mascotas_propietarios_cleaned

Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario,ciudad,direccion,telefono,email,tipo_documento,numero_documento,estado
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,3207201137,lizethurrego1990@gmail.com,CC,1036637677,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,3058147632,pa_ordonez@hotmail.com,CC,66987417,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.60,2016-10-07,Hembra,social,,MARIA CAMILA PUERTA,Cali,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,,CC,1006107262,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,,YAZMIN ANACONA,Cali,"Calle 45#98B-65 Apto 403, torre 8",3216066041,jazmin.dag@gmail.com,CC,1061750508,Activo
5,DULCE RODRIGUEZ,PINSCHER,,2020-04-01,Macho,social,,JAIME RODRIGUEZ,Cali,CRA98B#48-127 VALLE DEL LILI,3175175266,leidyjoroma@gmail.com,CC,2569816,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11987,William,Poodle,11.05,2017-04-12,Hembra,Juguetón,Negro,Angela Moore,Cali,0106 Dudley Route,3143111203,hardingkenneth@example.com,NIT,125606839,Inactivo
11988,Anna,Golden Retriever,5.08,2024-10-28,Macho,Tranquilo,Negro,Steven Douglas,Cali,6458 Rogers Courts Suite 080,3191768598,jeremyhuber@example.net,CE,176912006,Activo
11990,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,3118659393,brandon31@example.net,CC,195453083,Inactivo
11991,Charles,Poodle,47.89,2017-12-10,Hembra,Tímido,Gris,Jose Reid,Cali,7585 Rodriguez Bypass,3024903661,huanglatoya@example.net,CE,162723607,Inactivo


In [13]:
# Show all the different values in 'tipo_documento'
df_mascotas_propietarios_cleaned["tipo_documento"].unique()


array(['CC', 'DII', 'Pasaporte', 'NIT', 'CE'], dtype=object)

For this analysis, we are only interested in the overage people with 'tipo documento' equals to 'CC'.

In [14]:
# Remove every row where 'tipo_documento' is not 'CC' and overage people so, fecha_nacimiento is less than 18 years
df_mascotas_propietarios_cleaned = df_mascotas_propietarios_cleaned[df_mascotas_propietarios_cleaned["tipo_documento"] == "CC"]
df_mascotas_propietarios_cleaned

Unnamed: 0,nombre_mascota,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario,ciudad,direccion,telefono,email,tipo_documento,numero_documento,estado
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,3207201137,lizethurrego1990@gmail.com,CC,1036637677,Activo
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,3058147632,pa_ordonez@hotmail.com,CC,66987417,Activo
3,SUSSY PUERTA,YORK SHIRE TERRIER,3.60,2016-10-07,Hembra,social,,MARIA CAMILA PUERTA,Cali,Cra. 98b #34-53 GUADALQUIVIR CASA 64,3134024437,,CC,1006107262,Activo
4,EEVEE ANACONA,Poodle,,2017-09-03,Hembra,social,,YAZMIN ANACONA,Cali,"Calle 45#98B-65 Apto 403, torre 8",3216066041,jazmin.dag@gmail.com,CC,1061750508,Activo
5,DULCE RODRIGUEZ,PINSCHER,,2020-04-01,Macho,social,,JAIME RODRIGUEZ,Cali,CRA98B#48-127 VALLE DEL LILI,3175175266,leidyjoroma@gmail.com,CC,2569816,Activo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,Jeffrey,Golden Retriever,3.81,2016-06-29,Macho,Agresivo,Blanco,Shannon Bean,Cali,509 Fox Throughway Apt. 035,3185732421,pcampos@example.com,CC,183253281,Activo
11961,Steven,Bulldog,20.44,2018-11-02,Macho,Tranquilo,Beige,Stephanie Morrison,Cali,48357 Neal Hollow Suite 405,3072605930,zgonzalez@example.com,CC,161320236,Inactivo
11966,Amy,German Shepherd,40.07,2018-12-26,Hembra,Tranquilo,Negro,Mr. Timothy Ayala,Cali,18058 Richmond Fields,3156669785,hooperomar@example.net,CC,124462449,Activo
11974,Ashlee,Golden Retriever,9.78,2016-01-01,Hembra,Agresivo,Negro,Glenda Castro,Cali,41155 Curry Alley,3146524137,matthew86@example.org,CC,142758190,Inactivo


#### 3. Create a merge between the new df_mascotas_propietarios_cleaned and the table: propietarios_transacciones_staging

In [15]:
# Create a merge between the new df_mascotas_propietarios_cleaned and the table: propietarios_transacciones_staging
df_propietarios_transacciones_staging = pd.read_sql('SELECT * FROM propietarios_transacciones_staging;', db_staging_engine)
df_propietarios_transacciones_staging

Unnamed: 0,nombre_propietario,tipo_documento,numero_documento,nombre_mascota,servico_prestado,valor_servicio,fecha_servicio
0,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Vacunación,1406023.0,2017-11-22
1,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Otros,125698.0,2019-10-06
2,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Peluquería,1815514.0,2019-05-18
3,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Otros,1592900.0,2023-03-20
4,NICOLAS LOPEZ CANO,CC,1116733904,TOMAS LOPEZ CANO,Vacunación,147163.0,2022-10-08
...,...,...,...,...,...,...,...
182975,Deborah Lopez,CC,146255830,Timothy,Baño,876549.0,2023-04-23
182976,Deborah Lopez,CC,146255830,Timothy,Venta Alimentos,1991466.0,2024-06-08
182977,Deborah Lopez,CC,146255830,Timothy,Peluquería,1983390.0,2023-09-11
182978,Deborah Lopez,CC,146255830,Timothy,Cita Veterinario(a),530959.0,2024-03-10


In [18]:
# Merge the DataFrames
df_merged = pd.merge(
    df_mascotas_propietarios_cleaned,
    df_propietarios_transacciones_staging,
    how='inner',
    left_on='numero_documento',
    right_on='numero_documento'
)
df_merged

Unnamed: 0,nombre_mascota_x,raza,peso,fecha_nacimiento,sexo,temperamento,color,nombre_propietario_x,ciudad,direccion,...,email,tipo_documento_x,numero_documento,estado,nombre_propietario_y,tipo_documento_y,nombre_mascota_y,servico_prestado,valor_servicio,fecha_servicio
0,THANOS URREGO,Poodle,,2022-09-01,Macho,social,,LIZETH URREGO,Cali,CRA100 #28-68 VALLE DEL LILI MADEIRO TORRE 1 ...,...,lizethurrego1990@gmail.com,CC,1036637677,Activo,LIZETH URREGO,CC,THANOS URREGO,Eutanasia,105455.0,2025-03-10
1,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,...,pa_ordonez@hotmail.com,CC,66987417,Activo,PAOLA ORDONEZ,CC,TINENK ORDONEZ,Desparacitada,911422.0,2017-10-30
2,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,...,pa_ordonez@hotmail.com,CC,66987417,Activo,PAOLA ORDONEZ,CC,TINENK ORDONEZ,Desparacitada,1061883.0,2020-07-12
3,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,...,pa_ordonez@hotmail.com,CC,66987417,Activo,PAOLA ORDONEZ,CC,TINENK ORDONEZ,Venta Alimentos,1019826.0,2017-05-15
4,TINENK ORDONEZ,Shit-zu,,2017-02-17,Hembra,social,,PAOLA ORDONEZ,Cali,cra 85 c # 33- 40 casa 54,...,pa_ordonez@hotmail.com,CC,66987417,Activo,PAOLA ORDONEZ,CC,TINENK ORDONEZ,Vacunación,1824815.0,2018-11-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33397,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,...,brandon31@example.net,CC,195453083,Inactivo,Debbie Smith,CC,Andrew,Venta Alimentos,447797.0,2018-08-11
33398,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,...,brandon31@example.net,CC,195453083,Inactivo,Debbie Smith,CC,Andrew,Vacunación,277567.0,2022-11-23
33399,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,...,brandon31@example.net,CC,195453083,Inactivo,Debbie Smith,CC,Andrew,Desparacitada,1177547.0,2022-04-05
33400,Andrew,German Shepherd,25.12,2016-08-23,Macho,Tranquilo,Gris,Debbie Smith,Cali,80840 Sellers Junctions,...,brandon31@example.net,CC,195453083,Inactivo,Debbie Smith,CC,Andrew,Otros,813519.0,2017-08-24


## Note: Now, we are going to work with a new data frame with 33402 rows with better data, instead that with 182980 rows.