# Using DataQuality Library
This demo shows how to use the DataQuality library.

In [1]:
# Load the file to be analyzed into a pandas DataFrame
import pandas as pd

encoding = 'iso-8859-1'

# people.csv is semicolon-delimited
people = pd.read_csv('people.csv', sep=';', encoding=encoding)
people.head(5)

Unnamed: 0,name,job,sex,age,salary,project,email
0,Inés,Front Developer,F,35,72000,Project B,Maritza@domain.com
1,Toño,Software Developer,M,33,72000,Project A,Deividomain.com
2,Carmiña,Software Developer,M,27,72000,Project B,Manuela@domain.com
3,Antony,Software Developer,M,24,72000,Project A,Antony@domain.com
4,Martha,Front Developer,F,30,74000,Project A,Martha@domain.com


In [2]:
# Summary of the people DataFrame: index range, columns, non-null counts and dtypes
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 7 columns):
name       168 non-null object
job        167 non-null object
sex        168 non-null object
age        167 non-null object
salary     168 non-null object
project    165 non-null object
email      164 non-null object
dtypes: object(7)
memory usage: 9.3+ KB


# Importing library
The data quality library receives three parameters:
1. The file to be analyzed
2. The delimiter
3. A boolean flag that selects whether to retrieve the records that do not comply with the rule (False) or the records that do comply with the rule (True).

For this demo we are going to check for bad records.

In [3]:
import Data_Quality as DQ

# Build the rule engine over the semicolon-delimited people.csv.
# The last argument is False, so each rule returns only the bad records.
dataWork = DQ.Rules('people.csv', ';', False)

# Rule for checking specific words
This rule checks whether the values in a given column contain a specific word. Because we are retrieving bad records, in this case the rule returns all records whose `job` does not contain the word "Intern".

In [4]:
# Bad records for this rule: rows whose job does not contain "Intern"
without_intern = dataWork.checkContains('job', 'Intern')
print(without_intern.head())


   index     name                 job sex age salary    project  \
0      0     Inés     Front Developer   F  35  72000  Project B   
1      1     Toño  Software Developer   M  33  72000  Project A   
2      2  Carmiña  Software Developer   M  27  72000  Project B   
3      3   Antony  Software Developer   M  24  72000  Project A   
4      4   Martha     Front Developer   F  30  74000  Project A   

                email  
0  Maritza@domain.com  
1     Deividomain.com  
2  Manuela@domain.com  
3   Antony@domain.com  
4   Martha@domain.com  


# Rule for checking length
This rule checks the length of the values in a given column. In this case the rule returns all records whose `job` value is longer than 12 characters.

In [5]:
# Bad records for this rule: rows whose job is longer than 12 characters
too_long_jobs = dataWork.checkMaxLength('job', 12)
print(too_long_jobs.head())

   index     name                 job sex age salary    project  \
0      0     Inés     Front Developer   F  35  72000  Project B   
1      1     Toño  Software Developer   M  33  72000  Project A   
2      2  Carmiña  Software Developer   M  27  72000  Project B   
3      3   Antony  Software Developer   M  24  72000  Project A   
4      4   Martha     Front Developer   F  30  74000  Project A   

                email  
0  Maritza@domain.com  
1     Deividomain.com  
2  Manuela@domain.com  
3   Antony@domain.com  
4   Martha@domain.com  


# Rule for list
This rule checks data against a reference list.

In [6]:
# Reference list used by the rule: the job values that count as valid.
pd.read_csv('jobs.csv')

Unnamed: 0,job
0,Front Developer
1,Software Developer
2,Data Engineer
3,Manager


In [7]:
# Records whose "job" value is not in the reference list (jobs.csv).
# NaN and "Intern" are not in the reference list, so those rows are reported.
# NOTE(review): the two 'job' arguments presumably name the column in the data
# and the column in the reference file - confirm against the library.
not_in_reference = dataWork.checkListReference('jobs.csv', 'job', 'job')
print(not_in_reference[['name', 'job']])

    name     job
0  David     NaN
1    Ana  Intern
2  Johan  Intern


# Rule for email
This rule checks for email structure.

In [8]:
# Bad records in the "email" column: emails with a wrong structure
bad_emails = dataWork.checkEmail('email')
print(bad_emails[['name', 'email']].head())

     name               email
0    Toño     Deividomain.com
1   Lucia                 NaN
2     Ana     Ana@@domain.com
3  Andrew  Andrew@domain..com
4  Andrea                   A


# Rule for null values
This rule checks for missing values.

In [9]:
# Records with a null value in the "project" column
missing_project = dataWork.checkNull('project')
print(missing_project[['name', 'project']])

         name project
0      Justin     NaN
1      Stella     NaN
2  Margarette     NaN


# Rule for numbers
This rule checks that values are numeric. Non-numeric values are marked as wrong.

In [10]:
# Records whose "salary" value is not a number
non_numeric_salary = dataWork.checkNumber('salary')
print(non_numeric_salary[['name', 'salary']])


      name  salary
0    Johan   97a00
1  Joaquin  1020o0


# Rule for name
This rule validates names. Only records that contain nothing but letters and spaces pass the rule.

In [11]:
# Records whose "name" contains something other than letters and spaces
invalid_names = dataWork.checkName('name')
print(invalid_names[['name']])

      name
0  Ant0nio
1    J3nny
2    Ang3l
3   Ne/son
4    Juan0
5   Pedro_


# Checking for Generic Pattern

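This rule checks values against a specific pattern, given as a regular expression. In this example the pattern `^[A-Za-z]+$` accepts only unaccented letters, so names such as Inés or Toño are also reported as bad records.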

In [12]:
# The pattern accepts ASCII letters only, so accented names are reported as well
pattern_mismatches = dataWork.checkPattern('name', '^[A-Za-z]+$')
print(pattern_mismatches[['name']])

      name
0     Inés
1     Toño
2  Carmiña
3  Ant0nio
4    J3nny
5    Ang3l
6   Ne/son
7    Juan0
8   Pedro_


# Closing 
Removes all files generated during the data quality process.

In [13]:
# Close the rule engine; this removes the files generated during the data quality process.
dataWork.close()