In [29]:
#Important libraries
import pandas as pd
import numpy as np
import random
import duckdb
import yaml
from datetime import datetime
import logging

### Extracting relevant info to build the contract

In [None]:
# Dataframe to use: the working dataset, read from mejorado.csv in the
# notebook's working directory.
df = pd.read_csv("mejorado.csv")
# Bare last expression so the notebook renders the frame.
df

An age column (`ep_edad`) is added to the data as an example numeric field for the contract. Valid range: 18 to 100 (inclusive).

In [None]:
# Adds the example age column ep_edad.
# Safe to re-run: ages are only generated when the column is missing, so ages
# already present in the frame are never replaced by a fresh random draw.
if 'ep_edad' not in df.columns:
    # One age per row (18-100, both ends included). The row count comes from
    # the frame itself: a hard-coded 1000 makes assign() raise a length
    # mismatch error whenever the frame has a different number of rows.
    age = [random.randint(18, 100) for _ in range(len(df))]
    df = df.assign(ep_edad=age)
df

In [None]:
# Overwrites mejorado.csv in place with the current frame (originally marked
# DO NOT RUN AGAIN).
# index=False keeps the row index out of the file; with the default, every
# save/reload round-trip adds another "Unnamed: 0" column to the data.
df.to_csv('mejorado.csv', index=False)

In [None]:
# Sorted distinct values present in ep_tipo_exp (inspected while building the contract).
np.unique(df['ep_tipo_exp'])

In [None]:
# Sorted distinct values present in ep_estado (inspected while building the contract).
print(np.unique(df['ep_estado']))

In [None]:
# Sorted distinct values present in ep_ubicacion (inspected while building the contract).
print(np.unique(df['ep_ubicacion']))

In [None]:
# Summary statistics (count, mean, min, max, quartiles) of the ep_edad column.
df['ep_edad'].describe()

### Contract example using the four variables described earlier

In [54]:
# Log file setup: records go to ContractFiles.log, which is truncated on each
# configuration (filemode='w'); every line is "<timestamp> <message>".
logging.basicConfig(
    filename="ContractFiles.log",
    filemode='w',
    format='%(asctime)s %(message)s',
)

# Root logger at DEBUG so info, warning and error records are all emitted
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Establishing connection with the DB (DuckDB database file.db)
conn = duckdb.connect('file.db')

In [40]:
# Creating the results table (only when it is missing) and viewing it.
# IF NOT EXISTS makes the cell re-runnable without a bare `except:`, which
# reported every failure (closed connection, typo in the DDL, ...) as
# "table already exists" and hid the real error.
conn.sql("""CREATE TABLE IF NOT EXISTS resultados(fecha_val_contrato DATETIME, 
                table_name VARCHAR(50), 
                categorical VARCHAR(50), 
                numerical VARCHAR(50), 
                nulls VARCHAR(50),
                doesnt_exist VARCHAR(50))""")

conn.table('resultados').show()

┌────────────────────┬────────────┬─────────────┬───────────┬─────────┬──────────────┐
│ fecha_val_contrato │ table_name │ categorical │ numerical │  nulls  │ doesnt_exist │
│     timestamp      │  varchar   │   varchar   │  varchar  │ varchar │   varchar    │
├────────────────────────────────────────────────────────────────────────────────────┤
│                                       0 rows                                       │
└────────────────────────────────────────────────────────────────────────────────────┘



In [None]:
# The CSV that will be passed to the validator, loaded through DuckDB into a
# pandas DataFrame; the first five rows are shown as a preview.
csv_query = 'SELECT * FROM "mejorado.csv"'
result = duckdb.query(csv_query).to_df()
result.head()

In [None]:
# Read the contract definition from Contract.yml; safe_load parses the open
# binary stream directly.
with open('Contract.yml', 'rb') as contract_file:
    conf = yaml.safe_load(contract_file)

In [48]:
# Contract enforcer
def enforcerSQL(yaml):
    """Validate the table named in a data contract against its column rules.

    For every entry in ``yaml['columns']`` the function queries
    ``yaml['tableName']`` through DuckDB and logs the outcome with the
    module-level ``logger``:

    * categorical columns (``isCategorical`` truthy): every distinct value
      found in the column must be listed in the entry's ``values``;
    * all other columns: every value must satisfy
      ``values[0] <= value <= values[1]``;
    * columns whose validation ran without error are also checked for NULLs.

    Any exception raised while validating a column is recorded as
    "column doesnt exist" (the historical behaviour); the underlying error
    message is additionally logged at DEBUG level so the real cause is not lost.

    Parameters
    ----------
    yaml : dict
        Parsed contract with the keys ``tableName`` and ``columns``; each
        column entry provides ``column``, ``values`` and ``isCategorical``.
        The parameter name shadows the ``yaml`` module inside this function;
        it is kept unchanged so existing callers keep working.

    Returns
    -------
    tuple
        ``(timestamp, table_name, categorical, numerical, nulls, doesnt_exist)``.
        The last four items are strings listing the flagged columns, each name
        followed by ``", "``, or ``"All good"`` when nothing was flagged.
    """

    def _summary(columns):
        # Same text format as before (every name followed by ", ").
        return "".join(f"{name}, " for name in columns) or "All good"

    now = datetime.now()
    table_name = yaml["tableName"]
    categ_cols = []
    numer_cols = []
    null_cols = []
    missing_cols = []

    logger.info("Table %s", table_name)

    # Cycle through all the columns that the data contract defines
    for rule in yaml['columns']:

        columna = rule['column']
        valores = rule['values']
        is_categorical = rule['isCategorical']
        # Per-column flag: the NULL check only runs when validation succeeded
        validated = True

        try:
            if is_categorical:
                # Validator for categorical columns
                rows = duckdb.query(f'SELECT DISTINCT {columna} FROM {table_name}').fetchall()
                unknown = list({row[0] for row in rows}.difference(valores))
                if unknown:
                    # Data is passed as a logging argument, never embedded in
                    # the format string: a '%' inside a value would otherwise
                    # break the formatting of the record.
                    logger.warning("Col %s unknown values: %s", columna, unknown)
                    categ_cols.append(columna)
                else:
                    logger.info("Column %s correct", columna)
            else:
                # Validator for non categorical columns
                out_of_range = duckdb.query(f'''SELECT {columna} FROM {table_name}
                                    WHERE {columna} < {valores[0]} OR {columna} > {valores[1]}''').fetchall()
                if out_of_range:
                    logger.warning("Col %s wrong vals: %s", columna, out_of_range)
                    numer_cols.append(columna)
                else:
                    logger.info("Column %s correct", columna)
        except Exception as exc:
            # `Exception` instead of a bare except so KeyboardInterrupt and
            # SystemExit still stop the run.
            logger.error("Column %s doesnt exist", columna)
            logger.debug("Validation of column %s failed with: %s", columna, exc)
            missing_cols.append(columna)
            validated = False

        # Checking for nulls
        if validated:
            nulls = duckdb.query(f'''select {columna} from {table_name} 
                        WHERE {columna} IS NULL''').fetchall()
            if nulls:
                logger.warning("Column %s have nulls", columna)
                null_cols.append(columna)
            else:
                logger.info("No Null values in %s", columna)

    logger.info("---------------------------------------------------------")

    return (now, table_name, _summary(categ_cols), _summary(numer_cols),
            _summary(null_cols), _summary(missing_cols))

In [55]:
# Run the contract validation and store its summary row in `resultados`.
lista = enforcerSQL(conf)

# The six values are bound as parameters instead of being formatted into the
# SQL text: a single quote inside any value (table or column name) would
# otherwise break the statement or change its meaning.
insert_query = """
    INSERT INTO resultados
    (fecha_val_contrato, table_name, categorical, numerical, nulls, doesnt_exist)
    VALUES
    (?, ?, ?, ?, ?, ?)
"""

conn.execute(insert_query, list(lista))
conn.table('resultados').show()

┌──────────────────────┬──────────────┬─────────────────────────┬───────────┬───────────────────────────┬──────────────┐
│  fecha_val_contrato  │  table_name  │       categorical       │ numerical │           nulls           │ doesnt_exist │
│      timestamp       │   varchar    │         varchar         │  varchar  │          varchar          │   varchar    │
├──────────────────────┼──────────────┼─────────────────────────┼───────────┼───────────────────────────┼──────────────┤
│ 2023-09-27 11:09:0…  │ tester.csv   │ ep_tipo_exp, ep_estad…  │ ep_edad,  │ ep_estado, ep_ubicacion,  │ All good     │
│ 2023-09-27 11:17:2…  │ mejorado.csv │ All good                │ All good  │ All good                  │ All good     │
│ 2023-09-27 11:26:4…  │ tester.csv   │ ep_tipo_exp, ep_estad…  │ ep_edad,  │ ep_estado, ep_ubicacion,  │ All good     │
└──────────────────────┴──────────────┴─────────────────────────┴───────────┴───────────────────────────┴──────────────┘



In [52]:
# Close the DuckDB connection opened on file.db; cells that use `conn` need it
# to be reopened before they can run again.
conn.close()