<a href="https://colab.research.google.com/github/claredavies/InterviewTaskDataCleaning/blob/main/InterviewDataPrepTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [6]:
import numpy as np
from google.colab import files
import pandas as pd
from matplotlib import pyplot as plt
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype
import re
import socket
import datetime
from datetime import datetime

In [7]:
pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
pd.set_option('display.max_rows', None)

# Read in files

In [8]:
!git clone https://github.com/claredavies/InterviewTaskDataCleaning.git

Cloning into 'InterviewTaskDataCleaning'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 22 (delta 8), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (22/22), done.


In [9]:
%cd InterviewTaskDataCleaning

/content/InterviewTaskDataCleaning


In [10]:
df = pd.read_csv('InputData.csv')
df.head(3)

Unnamed: 0,Ref,First_Name,Last_Name,Email,Gender,IP_Address,Date_of_Birth
0,1,Germayne,Simmons,gsimmons0@cdbaby.com,Male,24.48.223.251,18/11/1959
1,2,Cosmo,Pleass,cpleass1@netscape.com,Male,108.87.136.28,08/11/1935
2,3,Henrik,Boag,hboag2@bizjournals.com,Male,221.105.60.101,07/10/1989


# Functions

In [11]:
def boolCheckLengthOverLimitString(value, limit):
  lengthFound = len(value)
  if(lengthFound > limit):
    return True
  else:
    return False

In [12]:
def boolCheckLengthOverLimitInt(value, limit):
  stringValue = str(value)
  return boolCheckLengthOverLimitString(stringValue, limit)

In [52]:
def checkFirstLetterCapitalised(string):
  lengthWord = len(string)
  return string[0].isupper()

In [158]:
def containsNumber(string):
    return any(char.isdigit() for char in string)

In [15]:
def checkIfSpacesBetweenWords(string):
  res = bool(re.search(r"\s", string))
  return res

In [16]:
def checkMaleOrFemale(string):
  lowercaseString = string.lower()
  if lowercaseString == 'female':
    return True
  elif lowercaseString == 'male':
    return True
  else:
    return False

In [17]:
def checkIfIPV4Valid(string):
  try:
      socket.inet_aton(string)
      return True
      # legal
  except socket.error:
    return False

In [18]:
def checkFormatDate(string):
  format = '%d/%m/%Y'
  try:
    datetime.strptime(string, format)
    return True
  except ValueError:
    return False 

In [19]:
def check18OrOverBy1stDec2020(startDate):
  endDate = '01/12/2020'
  endDate = datetime.strptime(endDate, '%d/%m/%Y')
  startDate = datetime.strptime(startDate, '%d/%m/%Y')
  yearsOld = endDate.year - startDate.year - ((endDate.month, endDate.day) < (startDate.month, startDate.day))
  if yearsOld >= 18:
    return True
  else:
    return False

In [113]:
def createHTMLOfDf(df, name):
  html = df.to_html()
  
  # write html to file
  text_file = open(name, "w")
  text_file.write(html)
  text_file.close()

In [162]:
messageErrorNA = "Value is NaN"
messageErrorOverLength = "Length is over the limit"
messageErrorCaseIncorrect = "Case is incorrect"
messageErrorContainsUnwantedNumbers = "Unwanted numbers contained"
messageErrorUnwantedSpace = "Unwanted space"
messageErrorNotMaleOrFemale = "Not Male or Female"
messageErrorNotString = "Not a string"
messageErrorNotValidIPAddress = "Not a valid IPv4"
messageErrorNot18OrOverBy1stDec2020 = "Not 18 or over by 1st Dec 2020"
messageDateOfBirthIncorrectFormat = "Date of birth not in format dd/mm/yyyy"

# Check Duplicates

In [21]:
dfWithoutRef = df.drop('Ref', axis=1)
dfWithoutRef.head(3)

Unnamed: 0,First_Name,Last_Name,Email,Gender,IP_Address,Date_of_Birth
0,Germayne,Simmons,gsimmons0@cdbaby.com,Male,24.48.223.251,18/11/1959
1,Cosmo,Pleass,cpleass1@netscape.com,Male,108.87.136.28,08/11/1935
2,Henrik,Boag,hboag2@bizjournals.com,Male,221.105.60.101,07/10/1989


In [22]:
duplicate = df[dfWithoutRef.duplicated()]
print("Duplicate Rows :")
print(duplicate)

Duplicate Rows :
Empty DataFrame
Columns: [Ref, First_Name, Last_Name, Email, Gender, IP_Address, Date_of_Birth]
Index: []


# Check size - should be 1000

In [23]:
print(df.shape)

(1000, 7)


# Check Ref Column

In [24]:
# want to add in check for unique
def checkRefColumn(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if boolCheckLengthOverLimitInt(input, 8) == True:
    issueMessage = messageErrorOverLength
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  elif pd.isna(input):
    issueMessage = messageErrorNA
    dfFlaws = dfFlaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  return dfFlaws

# Check First Name

In [166]:
def checkColumnFirstName(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if pd.isna(input):
    issueMessage = messageErrorNA
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  else:
    if boolCheckLengthOverLimitInt(input, 20) == True:
      issueMessage = messageErrorOverLength
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)


    if(checkFirstLetterCapitalised(input) == False):
      issueMessage = messageErrorCaseIncorrect
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

    if checkIfSpacesBetweenWords(input) == True:
      issueMessage = messageErrorUnwantedSpace
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

    # need to search for numbers within
    if containsNumber(input) == True:
      issueMessage = messageErrorContainsUnwantedNumbers
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  return dfFlaws

# Check Second Name

In [167]:
def checkColumnSecondName(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if pd.isna(input):
    issueMessage = messageErrorNA
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  else:
    if boolCheckLengthOverLimitInt(input, 40) == True:
      issueMessage = messageErrorOverLength
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

    if(checkFirstLetterCapitalised(input) == False):
      issueMessage = messageErrorCaseIncorrect
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)


    # need to search for numbers within
    if containsNumber(input) == True:
      issueMessage = messageErrorContainsUnwantedNumbers
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

    if checkIfSpacesBetweenWords(input) == True:
      issueMessage = messageErrorUnwantedSpace
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  
  return dfFlaws

# Check Email 

In [27]:
def checkColumnEmail(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if boolCheckLengthOverLimitInt(input, 254) == True:
    issueMessage = messageErrorOverLength
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  return dfFlaws

# Check Gender

In [99]:
def checkColumnGender(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if pd.notna(input):
    if type(input) == str:
      if boolCheckLengthOverLimitInt(input, 6) == True:
        issueMessage = messageErrorOverLength
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

      if checkMaleOrFemale(input) == False:
        issueMessage = messageErrorNotMaleOrFemale
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
    
      if checkIfSpacesBetweenWords(input) == True:
        issueMessage = messageErrorUnwantedSpace
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  
    else:
      issueMessage = messageErrorNotString
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  else:
      issueMessage = messageErrorNA
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  
  return dfFlaws

# Check IP Address

In [29]:
def checkColumnIPAddress(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if pd.notna(input):
    if type(input) == str:

      if boolCheckLengthOverLimitInt(input, 15) == True:
        issueMessage = messageErrorOverLength
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
      
      if checkIfIPV4Valid(input) == False:
        issueMessage = messageErrorNotValidIPAddress 
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

    else:
      issueMessage = messageErrorNotString
      dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  else:
    issueMessage = messageErrorNA
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  
  return dfFlaws

# Check Date of Birth

In [30]:
def checkColumnDateOfBirth(input, columnName, ref):
  dfFlaws = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  if pd.notna(input):
    if type(input) == str:

      if boolCheckLengthOverLimitInt(input, 10) == True:
        issueMessage = messageErrorOverLength
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
      
      if checkFormatDate(input) == False:
        issueMessage = messageDateOfBirthIncorrectFormat
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
      
      else:
        if check18OrOverBy1stDec2020(input) == False:
           issueMessage = messageErrorNot18OrOverBy1stDec2020
           dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
    else:
        issueMessage = messageErrorNotString
        dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)

  else:
    issueMessage = messageErrorNA
    dfFlaws = dfFlaws.append({'Ref': ref,'ColumnName': columnName, 'ColumnValue': input, 'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'}, ignore_index=True)
  
  return dfFlaws

# Find List of Errors

In [211]:
def findErrorsData(dataframe):

  # check all values unique
  print("Are ref unique: " + str(dataframe['Ref'].is_unique))

  # check each row
  dfErrorsFound = pd.DataFrame(columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue','Fixed',"FixedResult"])

  for index, row in dataframe.iterrows():
    dfErrorsFoundRefColumn = checkRefColumn(row["Ref"], "Ref", row["Ref"])
    if dfErrorsFoundRefColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundRefColumn, ignore_index=True)

    dfErrorsFoundFirstNameColumn = checkColumnFirstName(row["First_Name"], "First_Name", row["Ref"])
    if dfErrorsFoundFirstNameColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundFirstNameColumn, ignore_index=True)

    dfErrorsFoundLastNameColumn = checkColumnSecondName(row["Last_Name"], "Last_Name", row["Ref"])
    if dfErrorsFoundLastNameColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundLastNameColumn, ignore_index=True)

    dfErrorsFoundEmailColumn = checkColumnEmail(row["Email"], "Email", row["Ref"])
    if dfErrorsFoundEmailColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundEmailColumn, ignore_index=True)

    dfErrorsFoundGenderColumn = checkColumnGender(row["Gender"], "Gender", row["Ref"])
    if dfErrorsFoundGenderColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundGenderColumn, ignore_index=True)

    dfErrorsFoundIPAddressColumn = checkColumnIPAddress(row["IP_Address"], "IP_Address", row["Ref"])
    if dfErrorsFoundIPAddressColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundIPAddressColumn, ignore_index=True)

    dfErrorsFoundDateOfBirthColumn = checkColumnDateOfBirth(row["Date_of_Birth"], "Date_of_Birth", row["Ref"])
    if dfErrorsFoundDateOfBirthColumn.empty == False:
      dfErrorsFound = dfErrorsFound.append(dfErrorsFoundDateOfBirthColumn, ignore_index=True)

  return dfErrorsFound

In [197]:
dfErrors = findErrorsData(df)

Are ref unique: True


In [198]:
print(dfErrors.head())

  Ref  ColumnName                              ColumnValue  \
0   6  First_Name                               christabel   
1   6   Last_Name                                    start   
2   9  IP_Address  1df6:b3b9:cdb7:d246:3fe9:7288:46ee:e528   
3   9  IP_Address  1df6:b3b9:cdb7:d246:3fe9:7288:46ee:e528   
4  27   Last_Name                                      NaN   

                ColumnIssue  Fixed FixedResult  
0         Case is incorrect  False         N/A  
1         Case is incorrect  False         N/A  
2  Length is over the limit  False         N/A  
3          Not a valid IPv4  False         N/A  
4              Value is NaN  False         N/A  


In [170]:
createHTMLOfDf(dfErrors, "DfErrors.html")

# Data Error Exploration

In [81]:
def inspectRowsEffectedByError(dfOriginal, dfErrors, errorMessage):
  noRowsOriginalDf = len(dfOriginal.index)
  errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == errorMessage]
  print("where: " + errorMessage)
  noRowsErrorDfSpecificErrorMsg = len(errorDfSpecificErrorMsg.index)
  percentage = (noRowsErrorDfSpecificErrorMsg/noRowsOriginalDf)*100
  percentage = str(round(percentage, 2))
  print("% where : " + errorMessage + "  " + percentage + "%") 
  print()
  return errorDfSpecificErrorMsg

In [None]:
# messageErrorNA = "Value is NaN"
# messageErrorOverLength = "Length is over the limit"
# messageErrorNotValidIPAddress = "Not a valid IPv4"
# messageErrorNot18OrOverBy1stDec2020 = "Not 18 or over by 1st Dec 2020"
# messageErrorNotString = "Not a string"

# messageErrorCaseIncorrect = "Case is incorrect" x
# messageErrorNotOnlyCharacter = "Not only characters contained" x
# messageErrorUnwantedSpace = "Space between names" x
# messageErrorNotMaleOrFemale = "Not Male or Female" x
# messageDateOfBirthIncorrectFormat = "Date of birth not in format dd/mm/yyyy"x

# Clean Functions

In [152]:
def removeSpace(value):
  valueWithoutSpace = value.replace(" ", "")
  return valueWithoutSpace

In [216]:
def fixSpace(df, ref, columnName):
  row = df[df['Ref'] == ref]
  rowValueColumnName = row[columnName].iat[0]
  rowValueColumnNameWithoutSpace = removeSpace(rowValueColumnName)
  df.loc[df['Ref'] == ref,[columnName]] = rowValueColumnNameWithoutSpace
  return df

In [227]:
def fixNumbersToLetters(df, ref, columnName):
  row = df[df['Ref'] == ref]
  rowValueColumnName = row[columnName].iat[0]
  rowValueColumnNameWithoutNumbers = convertNumbersToLetters(rowValueColumnName)
  df.loc[df['Ref'] == ref,[columnName]] = rowValueColumnNameWithoutNumbers
  return df

In [183]:
def convertNumbersToLetters(value):
  stringWithout0 = value.replace("0", "o" )
  stringWithout0And1 = stringWithout0.replace("1", "l" )
  return stringWithout0And1

In [234]:
def capitalismFirstLetter(df, ref, columnName):
  row = df[df['Ref'] == ref]
  rowValueColumnName = row[columnName].iat[0]
  rowValueColumnNameCapitalised = convertNumbersToLetters(rowValueColumnName)
  df.loc[df['Ref'] == ref,[columnName]] = rowValueColumnNameCapitalised.capitalize()
  return df

In [248]:
def replaceGenderTerms(value):
  value = str(value)
  if value == '1':
    value = 'Male'
  elif value == '2':
    value = 'Female'
  elif value == 'm':
    value = 'Male'
  elif value == 'f':
    value = 'Female'
  elif value == 'fem':
    value = 'Female'
  return value

In [240]:
def fixGenderTerms(df, ref, columnName):
  row = df[df['Ref'] == ref]
  rowValueColumnName = row[columnName].iat[0]
  rowValueColumnNameCapitalised = convertNumbersToLetters(rowValueColumnName)
  df.loc[df['Ref'] == ref,[columnName]] = replaceGenderTerms(rowValueColumnNameCapitalised)
  return df

# To fix

Fix unwanted space

In [249]:
dfClean = df.copy()

In [250]:
# remove unwanted spaces
errorDfSpaces = dfErrors[dfErrors['ColumnIssue'] == messageErrorUnwantedSpace]
for index, row in errorDfSpaces.iterrows():
  dfClean = fixSpace(dfClean, row["Ref"], row["ColumnName"])

In [251]:
print(dfClean[dfClean['Last_Name'] == "McCorley"])

     Ref First_Name Last_Name                     Email Gender     IP_Address  \
100  101      Britt  McCorley  bmccorley2s@illinois.edu   Male  184.157.96.18   

    Date_of_Birth  
100    18/03/1935  


In [252]:
# remove unwanted numbers
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorContainsUnwantedNumbers]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = fixNumbersToLetters(dfClean, row["Ref"], row["ColumnName"])

In [253]:
# fix case (first capital)
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorCaseIncorrect]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = capitalismFirstLetter(dfClean, row["Ref"], row["ColumnName"])

In [254]:
# fix gender
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorNotMaleOrFemale]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = fixGenderTerms(dfClean, row["Ref"], row["ColumnName"])

In [None]:
# fix date of birth format

In [255]:
dfErrorsCleaned = findErrorsData(dfClean)
createHTMLOfDf(dfErrorsCleaned, "DfErrorsFixErrorsCleaned.html")

Are ref unique: True


Gender

In [92]:
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageErrorNotMaleOrFemale]
print(errorDfSpecificErrorMsg)

genderIsOne = df[df['Gender'] == '1']
print(genderIsOne)

genderIsTwo = df[df['Gender'] == '2']
print(genderIsTwo)

genderIsM = df[df['Gender'] == 'm ']
print(genderIsM)

genderIsF = df[df['Gender'] == 'f ']
print(genderIsF)

genderIsFem = df[df['Gender'] == 'fem ']
print(genderIsFem)

    Ref ColumnName ColumnValue         ColumnIssue  Fixed FixedResult
17  124     Gender           1  Not Male or Female  False         N/A
22  158     Gender          m   Not Male or Female  False         N/A
42  347     Gender           1  Not Male or Female  False         N/A
44  354     Gender           1  Not Male or Female  False         N/A
47  402     Gender        fem   Not Male or Female  False         N/A
61  521     Gender           2  Not Male or Female  False         N/A
67  586     Gender          f   Not Male or Female  False         N/A
     Ref First_Name Last_Name                       Email Gender  \
123  124  Benedicta    Nurden    bnurden3f@friendfeed.com      1   
346  347     Eadith      Call   ecall9m@deliciousdays.com      1   
353  354       Maxy  Skilbeck  mskilbeck9t@friendfeed.com      1   

        IP_Address Date_of_Birth  
123  97.133.217.98    16/07/1940  
346    3.46.15.157    22/06/2003  
353  241.81.36.194    03/04/1960  
     Ref First_Name Last_Na

messageErrorNotOnlyCharacter

In [111]:
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageErrorNotOnlyCharacter]
print(errorDfSpecificErrorMsg)

genderIs1ain = df[df['First_Name'] == '1ain']
print(genderIs1ain)

     Ref  ColumnName      ColumnValue                    ColumnIssue  Fixed  \
8     79   Last_Name        O' Markey  Not only characters contained  False   
10   101   Last_Name        Mc Corley  Not only characters contained  False   
13   111   Last_Name       Le Pruvost  Not only characters contained  False   
18   128   Last_Name         Le Franc  Not only characters contained  False   
25   180   Last_Name       Van Geffen  Not only characters contained  False   
27   192   Last_Name          Hainey`  Not only characters contained  False   
28   241   Last_Name           Mc Gee  Not only characters contained  False   
34   288  First_Name        Miof mela  Not only characters contained  False   
36   298   Last_Name          Gambe11  Not only characters contained  False   
38   322   Last_Name           O'Dare  Not only characters contained  False   
47   376   Last_Name  Falconer-Taylor  Not only characters contained  False   
55   456   Last_Name         O'Cannan  Not only char

date of birth

In [115]:
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageDateOfBirthIncorrectFormat]
print(errorDfSpecificErrorMsg)

     Ref     ColumnName  ColumnValue                             ColumnIssue  \
5     36  Date_of_Birth   01/31/1995  Date of birth not in format dd/mm/yyyy   
12   102  Date_of_Birth   27/27/1991  Date of birth not in format dd/mm/yyyy   
114  878  Date_of_Birth  09-MAY-1956  Date of birth not in format dd/mm/yyyy   
130  965  Date_of_Birth  06/05/1943*  Date of birth not in format dd/mm/yyyy   

     Fixed FixedResult  
5    False         N/A  
12   False         N/A  
114  False         N/A  
130  False         N/A  


# To Delete

To be deleted

In [82]:
inspectRowsEffectedByError(df, dfErrors, messageErrorNA)

inspectRowsEffectedByError(df, dfErrors, messageErrorOverLength)

inspectRowsEffectedByError(df, dfErrors, messageErrorNotValidIPAddress)

inspectRowsEffectedByError(df, dfErrors, messageErrorNot18OrOverBy1stDec2020)

inspectRowsEffectedByError(df, dfErrors, messageErrorNotString)

where: Value is NaN
% where : Value is NaN  0.7%

where: Length is over the limit
% where : Length is over the limit  0.6%

where: Not a valid IPv4
% where : Not a valid IPv4  0.6%

where: Not 18 or over by 1st Dec 2020
% where : Not 18 or over by 1st Dec 2020  4.5%



Unnamed: 0,Ref,ColumnName,ColumnValue,ColumnIssue,Fixed,FixedResult
6,64,Date_of_Birth,28/06/2003,Not 18 or over by 1st Dec 2020,False,
7,77,Date_of_Birth,18/10/2004,Not 18 or over by 1st Dec 2020,False,
15,119,Date_of_Birth,21/12/2003,Not 18 or over by 1st Dec 2020,False,
16,120,Date_of_Birth,23/06/2004,Not 18 or over by 1st Dec 2020,False,
20,133,Date_of_Birth,25/03/2003,Not 18 or over by 1st Dec 2020,False,
21,145,Date_of_Birth,21/03/2005,Not 18 or over by 1st Dec 2020,False,
31,249,Date_of_Birth,08/07/2005,Not 18 or over by 1st Dec 2020,False,
35,308,Date_of_Birth,26/07/2004,Not 18 or over by 1st Dec 2020,False,
37,328,Date_of_Birth,15/09/2004,Not 18 or over by 1st Dec 2020,False,
38,329,Date_of_Birth,30/09/2005,Not 18 or over by 1st Dec 2020,False,
