<a href="https://colab.research.google.com/github/claredavies/InterviewTaskDataCleaning/blob/main/InterviewDataPrepTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [4]:
import numpy as np
from google.colab import files
import pandas as pd
from matplotlib import pyplot as plt
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype
import re
import socket
import datetime
from datetime import datetime

In [5]:
# Use fully-qualified option names: the bare 'max_rows' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_rows' and raises OptionError.
pd.set_option('display.max_colwidth', 400)
# None removes the row limit; this is the value that was in effect before,
# because it overrode the earlier 99999 setting.
pd.set_option('display.max_rows', None)

# Read in files

In [233]:
!git clone https://github.com/claredavies/InterviewTaskDataCleaning.git

Cloning into 'InterviewTaskDataCleaning'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 40 (delta 20), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (40/40), done.


In [234]:
%cd InterviewTaskDataCleaning

/content/InterviewTaskDataCleaning/InterviewTaskDataCleaning


In [235]:
# Load the raw input; the path is relative to the current working directory
# (changed by the %cd cell above).
df = pd.read_csv('InputData.csv')
# Show the first three rows as the cell output.
df.head(3)

Unnamed: 0,Ref,First_Name,Last_Name,Email,Gender,IP_Address,Date_of_Birth
0,1,Germayne,Simmons,gsimmons0@cdbaby.com,Male,24.48.223.251,18/11/1959
1,2,Cosmo,Pleass,cpleass1@netscape.com,Male,108.87.136.28,08/11/1935
2,3,Henrik,Boag,hboag2@bizjournals.com,Male,221.105.60.101,07/10/1989


# Functions

In [236]:
def boolCheckLengthOverLimitString(value, limit):
  """Return True when len(value) is strictly greater than `limit`, else False."""
  return len(value) > limit

In [237]:
def boolCheckLengthOverLimitInt(value, limit):
  """Return True when the text form of `value` (via str()) is longer than `limit` characters."""
  return boolCheckLengthOverLimitString(str(value), limit)

In [238]:
def checkFirstLetterCapitalised(string):
  """Return True when the first character of `string` is an upper-case letter.

  An empty string has no first character and is reported as not capitalised
  (False) instead of raising IndexError.
  """
  # Slicing never raises on an empty string, and ''.isupper() is False.
  return string[:1].isupper()

In [239]:
def containsNumber(string):
    """Return True as soon as any character of `string` satisfies str.isdigit()."""
    for character in string:
        if character.isdigit():
            return True
    return False

In [240]:
def checkIfSpacesBetweenWords(string):
  """Return True when `string` contains any whitespace character (regex \\s), at any position."""
  return re.search(r"\s", string) is not None

In [241]:
def checkMaleOrFemale(string):
  """Return True when `string`, compared case-insensitively, is exactly 'male' or 'female'."""
  return string.lower() in ('female', 'male')

In [242]:
def checkIfIPV4Valid(string):
  """Return True only when `string` is a dotted-quad IPv4 address (four octets, each 0-255).

  socket.inet_aton was previously used here, but it also accepts shorthand
  forms such as "127.1" or "1", which are not valid values for an IP address
  column. ipaddress.IPv4Address is strict about the four-octet form.
  Non-string input is reported as invalid rather than raising.
  """
  import ipaddress  # function-scope import: only this check needs the module

  if not isinstance(string, str):
    # IPv4Address would accept a bare integer; this check is about text values.
    return False
  try:
    ipaddress.IPv4Address(string)
  except ValueError:  # AddressValueError is a ValueError subclass
    return False
  return True

In [243]:
def checkFormatDate(string):
  """Return True when `string` parses with the strptime pattern '%d/%m/%Y', else False."""
  try:
    datetime.strptime(string, '%d/%m/%Y')
  except ValueError:
    return False
  return True

In [244]:
def check18OrOverBy1stDec2020(startDate):
  """Return True when a person born on `startDate` (dd/mm/YYYY text) is at least 18 on 01/12/2020."""
  cutoff = datetime.strptime('01/12/2020', '%d/%m/%Y')
  born = datetime.strptime(startDate, '%d/%m/%Y')
  # True (counts as 1) when the birthday falls after the cut-off day within the year.
  birthdayStillAhead = (cutoff.month, cutoff.day) < (born.month, born.day)
  return (cutoff.year - born.year - birthdayStillAhead) >= 18

In [245]:
def createHTMLOfDf(df, name):
  html = df.to_html()
  
  # write html to file
  text_file = open(name, "w")
  text_file.write(html)
  text_file.close()

In [246]:
# Issue labels stored in the 'ColumnIssue' column of the flaw frames. The fix
# functions compare against these exact strings to choose a repair.
messageErrorNA = "Value is NaN"
messageErrorOverLength = "Length is over the limit"
messageErrorCaseIncorrect = "Case is incorrect"
messageErrorContainsUnwantedNumbers = "Unwanted numbers contained"
messageErrorUnwantedSpace = "Unwanted space"
messageErrorNotMaleOrFemale = "Not Male or Female"
messageErrorNotString = "Not a string"
messageErrorNotValidIPAddress = "Not a valid IPv4"
messageErrorNot18OrOverBy1stDec2020 = "Not 18 or over by 1st Dec 2020"
messageDateOfBirthIncorrectFormat = "Date of birth not in format dd/mm/yyyy"

# Check Duplicates

In [247]:
# Copy of the data without the 'Ref' column, so rows can be compared on their content only.
dfWithoutRef = df.drop(columns=['Ref'])
dfWithoutRef.head(3)

Unnamed: 0,First_Name,Last_Name,Email,Gender,IP_Address,Date_of_Birth
0,Germayne,Simmons,gsimmons0@cdbaby.com,Male,24.48.223.251,18/11/1959
1,Cosmo,Pleass,cpleass1@netscape.com,Male,108.87.136.28,08/11/1935
2,Henrik,Boag,hboag2@bizjournals.com,Male,221.105.60.101,07/10/1989


In [248]:
# Rows whose non-Ref content repeats an earlier row (the first occurrence is not marked).
duplicate = df.loc[dfWithoutRef.duplicated()]
print("Duplicate Rows :", duplicate, sep="\n")

Duplicate Rows :
Empty DataFrame
Columns: [Ref, First_Name, Last_Name, Email, Gender, IP_Address, Date_of_Birth]
Index: []


# Check size - should be 1000

In [249]:
# Print (rows, columns); the section heading states the row count should be 1000.
print(df.shape)

(1000, 7)


# Check Ref Column

In [287]:
# TODO: add a per-value uniqueness check for Ref.
def checkRefColumn(input, columnName, ref):
  """Validate one Ref value and return the issues found as a DataFrame.

  Checks, in order: the text form of `input` is at most 8 characters;
  otherwise that `input` is not NaN. At most one issue is reported.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if boolCheckLengthOverLimitInt(input, 8):
    addFlaw(messageErrorOverLength)
  elif pd.isna(input):
    addFlaw(messageErrorNA)

  # DataFrame.append was removed in pandas 2.0, so the rows are collected in a
  # list and the frame is built once; dtype=object keeps the column dtypes of an
  # empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [315]:
def fixColumnRef(dataframe, ref, columnIssue, columnName):
  """Apply the repair for a Ref-column issue and return the dataframe.

  Only an unwanted-space issue has a repair. Any other issue leaves the frame
  unchanged; previously fixSpace ran for every issue, which fails on values
  that are not strings.
  """
  if columnIssue == messageErrorUnwantedSpace:
    dataframe = fixSpace(dataframe, ref, columnName)
  return dataframe

# Check First Name

In [289]:
def checkColumnFirstName(input, columnName, ref):
  """Validate one First_Name value and return the issues found as a DataFrame.

  A NaN value yields only the NaN issue and a non-string value only the
  not-a-string issue. A string is checked independently for: length over 20,
  first letter not upper-case, any whitespace, any digit.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif not isinstance(input, str):
    # The string checks below would raise on e.g. a number; report it instead.
    addFlaw(messageErrorNotString)
  else:
    if boolCheckLengthOverLimitInt(input, 20):
      addFlaw(messageErrorOverLength)

    if not checkFirstLetterCapitalised(input):
      addFlaw(messageErrorCaseIncorrect)

    if checkIfSpacesBetweenWords(input):
      addFlaw(messageErrorUnwantedSpace)

    if containsNumber(input):
      addFlaw(messageErrorContainsUnwantedNumbers)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [314]:
def fixColumnFirstName(dataframe, ref, columnIssue, columnName):
  """Pick the repair matching `columnIssue` for a First_Name cell; other issues leave the frame as is."""
  if columnIssue == messageErrorUnwantedSpace:
    return fixSpace(dataframe, ref, columnName)
  if columnIssue == messageErrorContainsUnwantedNumbers:
    return fixNumbersToLetters(dataframe, ref, columnName)
  if columnIssue == messageErrorCaseIncorrect:
    return capitalismFirstLetter(dataframe, ref, columnName)
  # No repair is defined for the remaining issue types.
  return dataframe

# Check Second Name

In [291]:
def checkColumnSecondName(input, columnName, ref):
  """Validate one Last_Name value and return the issues found as a DataFrame.

  A NaN value yields only the NaN issue and a non-string value only the
  not-a-string issue. A string is checked independently for: length over 40,
  first letter not upper-case, any digit, any whitespace.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif not isinstance(input, str):
    # The string checks below would raise on e.g. a number; report it instead.
    addFlaw(messageErrorNotString)
  else:
    if boolCheckLengthOverLimitInt(input, 40):
      addFlaw(messageErrorOverLength)

    if not checkFirstLetterCapitalised(input):
      addFlaw(messageErrorCaseIncorrect)

    if containsNumber(input):
      addFlaw(messageErrorContainsUnwantedNumbers)

    if checkIfSpacesBetweenWords(input):
      addFlaw(messageErrorUnwantedSpace)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [313]:
def fixColumnSecondName(dataframe, ref, columnIssue, columnName):
  """Pick the repair matching `columnIssue` for a Last_Name cell; other issues leave the frame as is."""
  if columnIssue == messageErrorUnwantedSpace:
    return fixSpace(dataframe, ref, columnName)
  if columnIssue == messageErrorContainsUnwantedNumbers:
    return fixNumbersToLetters(dataframe, ref, columnName)
  if columnIssue == messageErrorCaseIncorrect:
    return capitalismFirstLetter(dataframe, ref, columnName)
  # No repair is defined for the remaining issue types.
  return dataframe

# Check Email 

In [293]:
def checkColumnEmail(input, columnName, ref):
  """Validate one Email value and return the issues found as a DataFrame.

  A NaN value is reported as missing. Previously it passed silently because
  str(nan) is a three-character string. Otherwise the text form of `input`
  must be at most 254 characters.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif boolCheckLengthOverLimitInt(input, 254):
    addFlaw(messageErrorOverLength)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [312]:
def fixColumnEmail(dataframe, ref, columnIssue, columnName):
  """Placeholder fixer for the Email column: no repair is applied; `dataframe` is returned unchanged."""
  return dataframe

# Check Gender

In [295]:
def checkColumnGender(input, columnName, ref):
  """Validate one Gender value and return the issues found as a DataFrame.

  A NaN value yields only the NaN issue and a non-string value only the
  not-a-string issue. A string is checked independently for: length over 6,
  any whitespace, not 'male'/'female' (case-insensitive).

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif not isinstance(input, str):
    addFlaw(messageErrorNotString)
  else:
    if boolCheckLengthOverLimitInt(input, 6):
      addFlaw(messageErrorOverLength)

    if checkIfSpacesBetweenWords(input):
      addFlaw(messageErrorUnwantedSpace)

    if not checkMaleOrFemale(input):
      addFlaw(messageErrorNotMaleOrFemale)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [309]:
def fixColumnGender(dataframe, ref, columnIssue, columnName):
  """Pick the repair matching `columnIssue` for a Gender cell; other issues leave the frame as is."""
  if columnIssue == messageErrorUnwantedSpace:
    return fixSpace(dataframe, ref, columnName)
  if columnIssue == messageErrorNotMaleOrFemale:
    return fixGenderTerms(dataframe, ref, columnName)
  # No repair is defined for the remaining issue types.
  return dataframe

# Check IP Address

In [297]:
def checkColumnIPAddress(input, columnName, ref):
  """Validate one IP_Address value and return the issues found as a DataFrame.

  A NaN value yields only the NaN issue and a non-string value only the
  not-a-string issue. A string is checked independently for: length over 15,
  not a valid IPv4 address.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif not isinstance(input, str):
    addFlaw(messageErrorNotString)
  else:
    if boolCheckLengthOverLimitInt(input, 15):
      addFlaw(messageErrorOverLength)

    if not checkIfIPV4Valid(input):
      addFlaw(messageErrorNotValidIPAddress)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [310]:
def fixColumnIPAddress(dataframe, ref, columnIssue, columnName):
  """Placeholder fixer for the IP_Address column: no repair is applied; `dataframe` is returned unchanged."""
  return dataframe

# Check Date of Birth

In [299]:
def checkColumnDateOfBirth(input, columnName, ref):
  """Validate one Date_of_Birth value and return the issues found as a DataFrame.

  A NaN value yields only the NaN issue and a non-string value only the
  not-a-string issue. A string is checked for length over 10, then for the
  dd/mm/yyyy format; the 18-or-over check runs only when the format is valid.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult and one row per issue (empty when the value is fine).
  """
  flaws = []

  def addFlaw(issueMessage):
    flaws.append({'Ref': ref, 'ColumnName': columnName, 'ColumnValue': input,
                  'ColumnIssue': issueMessage, 'Fixed': False, 'FixedResult': 'N/A'})

  if pd.isna(input):
    addFlaw(messageErrorNA)
  elif not isinstance(input, str):
    addFlaw(messageErrorNotString)
  else:
    if boolCheckLengthOverLimitInt(input, 10):
      addFlaw(messageErrorOverLength)

    if not checkFormatDate(input):
      addFlaw(messageDateOfBirthIncorrectFormat)
    elif not check18OrOverBy1stDec2020(input):
      # The age can only be computed once the date is known to parse.
      addFlaw(messageErrorNot18OrOverBy1stDec2020)

  # DataFrame.append was removed in pandas 2.0; build the frame once from the
  # collected rows. dtype=object matches an empty pd.DataFrame(columns=...).
  return pd.DataFrame(flaws, columns=['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult'], dtype=object)

In [311]:
def fixColumnDateOfBirth(dataframe, ref, columnIssue, columnName):
  """Pick the repair matching `columnIssue` for a Date_of_Birth cell; other issues leave the frame as is."""
  if columnIssue == messageErrorUnwantedSpace:
    return fixSpace(dataframe, ref, columnName)
  if columnIssue == messageDateOfBirthIncorrectFormat:
    return fixDate(dataframe, ref, columnName)
  # No repair is defined for the remaining issue types.
  return dataframe

# Clean Functions

In [264]:
def removeSpace(value):
  """Return `value` with every space character (' ') removed; other whitespace is kept."""
  return "".join(value.split(" "))

In [265]:
def fixSpace(df, ref, columnName):
  """Strip spaces from the `columnName` value of the row(s) whose Ref equals `ref`; mutates and returns `df`."""
  matchesRef = df['Ref'] == ref
  # The first matching row supplies the value; it is written to every matching row.
  currentValue = df.loc[matchesRef, columnName].iat[0]
  df.loc[matchesRef, [columnName]] = removeSpace(currentValue)
  return df

In [266]:
def fixNumbersToLetters(df, ref, columnName):
  """Replace look-alike digits in the `columnName` value of the row(s) whose Ref equals `ref`; mutates and returns `df`."""
  matchesRef = df['Ref'] == ref
  # The first matching row supplies the value; it is written to every matching row.
  currentValue = df.loc[matchesRef, columnName].iat[0]
  df.loc[matchesRef, [columnName]] = convertNumbersToLetters(currentValue)
  return df

In [267]:
def convertNumbersToLetters(value):
  """Return `value` with every '0' replaced by 'o' and every '1' replaced by 'l'."""
  return value.translate(str.maketrans("01", "ol"))

In [268]:
def capitalismFirstLetter(df, ref, columnName):
  """Upper-case the first character of the `columnName` value for the row(s) whose Ref equals `ref`.

  Look-alike digits are converted to letters first (so "0wen" becomes "Owen").
  Only the first character is changed: str.capitalize() was used before, which
  also lower-cases the rest and turned e.g. "O'Dowgaine" into "O'dowgaine".
  Mutates `df` in place and returns it.
  """
  matchesRef = df['Ref'] == ref
  currentValue = df.loc[matchesRef, columnName].iat[0]
  withoutDigits = convertNumbersToLetters(currentValue)
  df.loc[matchesRef, [columnName]] = withoutDigits[:1].upper() + withoutDigits[1:]
  return df

In [269]:
def replaceGenderTerms(value):
  """Map a known gender abbreviation or code to 'Male' / 'Female'.

  The lookup ignores surrounding whitespace and letter case. The digit '1' is
  now recognised (the data holds '1', while the old comparison used the letter
  'l', which is still accepted). Unrecognised values are returned as
  str(value), unchanged.
  """
  value = str(value)
  knownTerms = {
    '1': 'Female',
    'l': 'Female',  # kept for backward compatibility with the old comparison
    '2': 'Male',
    'm': 'Male',
    'f': 'Female',
    'fem': 'Female',
  }
  return knownTerms.get(value.strip().lower(), value)

In [270]:
def fixGenderTerms(df, ref, columnName):
  """Normalise the `columnName` gender term of the row(s) whose Ref equals `ref`; mutates and returns `df`."""
  matchesRef = df['Ref'] == ref
  currentTerm = df.loc[matchesRef, columnName].iat[0]
  df.loc[matchesRef, [columnName]] = replaceGenderTerms(currentTerm)
  return df

In [271]:
def dateInFormMMDDYYYY(input):
  """Return True when `input` parses with the strptime pattern '%m/%d/%Y', else False."""
  try:
    datetime.strptime(input, '%m/%d/%Y')
  except ValueError:
    return False
  return True

In [272]:
def dateContainMonthName(input):
  """Return True when `input` parses with the strptime pattern '%d-%B-%Y', else False."""
  try:
    datetime.strptime(input, '%d-%B-%Y')
  except ValueError:
    return False
  return True

In [273]:
def convertDateMonthName(input):
  """Re-format a '%d-%B-%Y' date string as '%d/%m/%Y'; raises ValueError if it does not parse."""
  parsed = datetime.strptime(input, "%d-%B-%Y")
  return parsed.strftime("%d/%m/%Y")

In [274]:
def dateConvertMMDDYYYYToDDMMYYYY(input):
  """Re-format a '%m/%d/%Y' date string as '%d/%m/%Y'; raises ValueError if it does not parse."""
  parsed = datetime.strptime(input, "%m/%d/%Y")
  return parsed.strftime("%d/%m/%Y")

In [275]:
def checkIfContainsSpecialCharactersOtherThanSlash(input):
  """Return True when `input` contains at least one of '@', '#', '%' or '*'."""
  unwanted = ('@', '#', '%', '*')
  for character in input:
    if character in unwanted:
      return True
  return False

In [276]:
def removeSpecialCharactersOtherThanSlash(input):
  """Return `input` with every '@', '#', '%' and '*' character removed."""
  unwanted = ('@', '#', '%', '*')
  return "".join(character for character in input if character not in unwanted)

In [277]:
def fixDate(df, ref, columnName):
  """Try to rewrite the `columnName` date of the row(s) whose Ref equals `ref` as dd/mm/YYYY.

  Repairs are tried in order: month/day/year text, day-MonthName-year text,
  then removal of the characters '@', '#', '%', '*'. A value matching none of
  these is left untouched. Mutates `df` in place and returns it.
  """
  matchesRef = df['Ref'] == ref
  currentValue = df.loc[matchesRef, columnName].iat[0]

  if dateInFormMMDDYYYY(currentValue):
    repaired = dateConvertMMDDYYYYToDDMMYYYY(currentValue)
  elif dateContainMonthName(currentValue):
    repaired = convertDateMonthName(currentValue)
  elif checkIfContainsSpecialCharactersOtherThanSlash(currentValue):
    repaired = removeSpecialCharactersOtherThanSlash(currentValue)
  else:
    return df

  df.loc[matchesRef, [columnName]] = repaired
  return df

# Find List of Errors

In [278]:
def findErrorsData(dataframe):
  """Run every column check on every row and return all issues in one DataFrame.

  Prints whether the Ref column is unique, then calls the per-column check
  function for each row in the order Ref, First_Name, Last_Name, Email,
  Gender, IP_Address, Date_of_Birth.

  Returns a frame with columns Ref, ColumnName, ColumnValue, ColumnIssue,
  Fixed, FixedResult, one row per issue, indexed from 0.
  """
  # check all values unique
  print("Are ref unique: " + str(dataframe['Ref'].is_unique))

  errorColumns = ['Ref', 'ColumnName', 'ColumnValue', 'ColumnIssue', 'Fixed', 'FixedResult']
  columnChecks = [
    ("Ref", checkRefColumn),
    ("First_Name", checkColumnFirstName),
    ("Last_Name", checkColumnSecondName),
    ("Email", checkColumnEmail),
    ("Gender", checkColumnGender),
    ("IP_Address", checkColumnIPAddress),
    ("Date_of_Birth", checkColumnDateOfBirth),
  ]

  # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
  # every call; collect the per-cell frames and concatenate once instead.
  framesWithIssues = []
  for _, row in dataframe.iterrows():
    for columnName, checkColumn in columnChecks:
      flaws = checkColumn(row[columnName], columnName, row["Ref"])
      if not flaws.empty:
        framesWithIssues.append(flaws)

  if not framesWithIssues:
    # pd.concat raises on an empty list; return the same empty shape as before.
    return pd.DataFrame(columns=errorColumns)
  return pd.concat(framesWithIssues, ignore_index=True)

In [279]:
# Collect every issue found in the raw data (one row per issue).
dfErrors = findErrorsData(df)

Are ref unique: True


In [280]:
# Plain-text preview of the first five issues.
print(dfErrors.head())

  Ref  ColumnName                              ColumnValue  \
0   6  First_Name                               christabel   
1   6   Last_Name                                    start   
2   9  IP_Address  1df6:b3b9:cdb7:d246:3fe9:7288:46ee:e528   
3   9  IP_Address  1df6:b3b9:cdb7:d246:3fe9:7288:46ee:e528   
4  27   Last_Name                                      NaN   

                ColumnIssue  Fixed FixedResult  
0         Case is incorrect  False         N/A  
1         Case is incorrect  False         N/A  
2  Length is over the limit  False         N/A  
3          Not a valid IPv4  False         N/A  
4              Value is NaN  False         N/A  


In [281]:
# Save the full issue list as an HTML table in the current working directory.
createHTMLOfDf(dfErrors, "DfErrors.html")

# Data Error Exploration

In [208]:
def inspectRowsEffectedByError(dfOriginal, dfErrors, errorMessage):
  """Print the share of rows affected by one issue type and return the matching issues.

  The percentage is (issue rows with ColumnIssue == errorMessage) divided by
  (rows in dfOriginal), rounded to two decimals. An empty `dfOriginal` reports
  0.0% instead of raising ZeroDivisionError.

  Returns the subset of `dfErrors` whose ColumnIssue equals `errorMessage`.
  """
  totalRows = len(dfOriginal.index)
  matchingErrors = dfErrors[dfErrors['ColumnIssue'] == errorMessage]
  print("where: " + errorMessage)
  if totalRows:
    percentage = (len(matchingErrors.index) / totalRows) * 100
  else:
    percentage = 0.0
  print("% where : " + errorMessage + "  " + str(round(percentage, 2)) + "%")
  print()
  return matchingErrors

Gender

In [209]:
# Issues where Gender is neither 'male' nor 'female'.
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageErrorNotMaleOrFemale]
print(errorDfSpecificErrorMsg)

# Raw rows for each unexpected Gender value, matched exactly.
genderIsOne = df[df['Gender'] == '1']
print(genderIsOne)

genderIsTwo = df[df['Gender'] == '2']
print(genderIsTwo)

# The next three literals include a trailing space.
genderIsM = df[df['Gender'] == 'm ']
print(genderIsM)

genderIsF = df[df['Gender'] == 'f ']
print(genderIsF)

genderIsFem = df[df['Gender'] == 'fem ']
print(genderIsFem)

    Ref ColumnName ColumnValue         ColumnIssue  Fixed FixedResult
14  124     Gender           1  Not Male or Female  False         N/A
18  158     Gender          m   Not Male or Female  False         N/A
35  347     Gender           1  Not Male or Female  False         N/A
37  354     Gender           1  Not Male or Female  False         N/A
39  402     Gender        fem   Not Male or Female  False         N/A
52  521     Gender           2  Not Male or Female  False         N/A
58  586     Gender          f   Not Male or Female  False         N/A
     Ref First_Name Last_Name                       Email Gender  \
123  124  Benedicta    Nurden    bnurden3f@friendfeed.com      1   
346  347     Eadith      Call   ecall9m@deliciousdays.com      1   
353  354       Maxy  Skilbeck  mskilbeck9t@friendfeed.com      1   

        IP_Address Date_of_Birth  
123  97.133.217.98    16/07/1940  
346    3.46.15.157    22/06/2003  
353  241.81.36.194    03/04/1960  
     Ref First_Name Last_Na

Containing Numbers

In [210]:
# Issues where a name contains digits.
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageErrorContainsUnwantedNumbers]
print(errorDfSpecificErrorMsg)

# Despite its name, this selects rows by First_Name ('1ain'), not by Gender.
genderIs1ain = df[df['First_Name'] == '1ain']
print(genderIs1ain)

    Ref  ColumnName ColumnValue                 ColumnIssue  Fixed FixedResult
28  298   Last_Name     Gambe11  Unwanted numbers contained  False         N/A
47  482  First_Name        1ain  Unwanted numbers contained  False         N/A
55  554  First_Name        0wen  Unwanted numbers contained  False         N/A
83  815   Last_Name  0'Dowgaine  Unwanted numbers contained  False         N/A
     Ref First_Name Last_Name                    Email Gender     IP_Address  \
481  482       1ain    Freeth  ifreethdd@artisteer.com   Male  162.225.238.5   

    Date_of_Birth  
481    03/10/1935  


Date

In [211]:
# Issues where Date_of_Birth does not parse as dd/mm/yyyy.
errorDfSpecificErrorMsg = dfErrors[dfErrors['ColumnIssue'] == messageDateOfBirthIncorrectFormat]
print(errorDfSpecificErrorMsg)

     Ref     ColumnName  ColumnValue                             ColumnIssue  \
5     36  Date_of_Birth   01/31/1995  Date of birth not in format dd/mm/yyyy   
10   102  Date_of_Birth   27/27/1991  Date of birth not in format dd/mm/yyyy   
90   878  Date_of_Birth  09-MAY-1956  Date of birth not in format dd/mm/yyyy   
101  965  Date_of_Birth  06/05/1943*  Date of birth not in format dd/mm/yyyy   

     Fixed FixedResult  
5    False         N/A  
10   False         N/A  
90   False         N/A  
101  False         N/A  


In [212]:
# messageErrorNA = "Value is NaN"
# messageErrorOverLength = "Length is over the limit"
# messageErrorNotValidIPAddress = "Not a valid IPv4"
# messageErrorNot18OrOverBy1stDec2020 = "Not 18 or over by 1st Dec 2020"
# messageErrorNotString = "Not a string"

# messageErrorCaseIncorrect = "Case is incorrect" x
# messageErrorContainsUnwantedNumbers = "Unwanted numbers contained" x
# messageErrorUnwantedSpace = "Unwanted space" x
# messageErrorNotMaleOrFemale = "Not Male or Female" x
# messageDateOfBirthIncorrectFormat = "Date of birth not in format dd/mm/yyyy"x

# To fix

In [319]:
# Apply the per-column fixers to a copy of the data, one recorded issue at a time.
def clean(df, dfErrors):
  """Return a repaired copy of `df`, driven by the issue rows in `dfErrors`.

  For every issue row, the fixer registered for its ColumnName is called with
  the row's Ref, ColumnIssue and ColumnName. Issues for unregistered columns
  are skipped. After all repairs the copy is re-checked with findErrorsData and
  the remaining issues are written to "DfErrorsFixErrorsCleaned.html".

  `df` itself is not modified.
  """
  dfClean = df.copy()

  # Keys must match the ColumnName values recorded by findErrorsData. The date
  # column was previously looked up as "Date_Of_Birth" (capital O), which never
  # matched "Date_of_Birth", so date repairs were silently skipped.
  columnFixers = {
    "Ref": fixColumnRef,
    "First_Name": fixColumnFirstName,
    "Last_Name": fixColumnSecondName,
    "Email": fixColumnEmail,
    "Gender": fixColumnGender,
    "IP_Address": fixColumnIPAddress,
    "Date_of_Birth": fixColumnDateOfBirth,
  }

  for _, row in dfErrors.iterrows():
    fixColumn = columnFixers.get(row["ColumnName"])
    if fixColumn is not None:
      dfClean = fixColumn(dfClean, row["Ref"], row["ColumnIssue"], row["ColumnName"])

  dfErrorsCleaned = findErrorsData(dfClean)
  createHTMLOfDf(dfErrorsCleaned, "DfErrorsFixErrorsCleaned.html")

  return dfClean

In [320]:
# Bind the cleaned frame to a notebook-level name so it can be reused by other
# cells, then show it as the cell output (same display as the bare call).
dfClean = clean(df, dfErrors)
dfClean

Are ref unique: True


Unnamed: 0,Ref,First_Name,Last_Name,Email,Gender,IP_Address,Date_of_Birth
0,1,Germayne,Simmons,gsimmons0@cdbaby.com,Male,24.48.223.251,18/11/1959
1,2,Cosmo,Pleass,cpleass1@netscape.com,Male,108.87.136.28,08/11/1935
2,3,Henrik,Boag,hboag2@bizjournals.com,Male,221.105.60.101,07/10/1989
3,4,Kinsley,Millard,kmillard3@furl.net,Male,67.153.106.99,03/02/1954
4,5,Peri,Lippatt,plippatt4@liveinternet.ru,Female,179.46.90.224,21/12/1988
5,6,Christabel,Start,cstart5@wired.com,Female,204.249.255.202,25/04/1996
6,7,Oralee,Warr,owarr6@mediafire.com,Female,149.182.65.177,04/12/1956
7,8,Elmo,Kime,ekime7@paginegialle.it,Male,238.134.174.180,13/05/1934
8,9,Diego,Kolin,dkolin8@statcounter.com,Male,1df6:b3b9:cdb7:d246:3fe9:7288:46ee:e528,31/07/1938
9,10,Chris,Seiffert,cseiffert9@vimeo.com,Male,21.155.195.124,16/11/1990


# Info

In [169]:
# fix case (first capital)
# NOTE(review): requires a notebook-level `dfClean` to exist before this cell
# runs; confirm which cell defines it. The frame name `errorDfUnwantedNumbers`
# is reused here for the case issues.
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorCaseIncorrect]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = capitalismFirstLetter(dfClean, row["Ref"], row["ColumnName"])

In [170]:
# fix gender
# NOTE(review): requires a notebook-level `dfClean`; the frame name
# `errorDfUnwantedNumbers` is reused here for the gender issues.
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorNotMaleOrFemale]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = fixGenderTerms(dfClean, row["Ref"], row["ColumnName"])

In [171]:
# fix date of birth format
# NOTE(review): requires a notebook-level `dfClean`; the frame name
# `errorDfUnwantedNumbers` is reused here for the date-format issues.
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageDateOfBirthIncorrectFormat]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = fixDate(dfClean, row["Ref"], row["ColumnName"])

In [172]:
# Re-run all checks on the repaired frame and export the remaining issues as HTML.
dfErrorsCleaned = findErrorsData(dfClean)
createHTMLOfDf(dfErrorsCleaned, "DfErrorsFixErrorsCleaned.html")

Are ref unique: True


In [168]:
# remove unwanted numbers
# NOTE(review): requires a notebook-level `dfClean` to exist before this cell runs.
errorDfUnwantedNumbers = dfErrors[dfErrors['ColumnIssue'] == messageErrorContainsUnwantedNumbers]
for index, row in errorDfUnwantedNumbers.iterrows():
  dfClean = fixNumbersToLetters(dfClean, row["Ref"], row["ColumnName"])

# To Delete

To be deleted

In [82]:
# Print the share of rows affected by each issue type that has no automatic fix.
# Only the frame returned by the last call is shown as the cell output.
inspectRowsEffectedByError(df, dfErrors, messageErrorNA)

inspectRowsEffectedByError(df, dfErrors, messageErrorOverLength)

inspectRowsEffectedByError(df, dfErrors, messageErrorNotValidIPAddress)

inspectRowsEffectedByError(df, dfErrors, messageErrorNot18OrOverBy1stDec2020)

inspectRowsEffectedByError(df, dfErrors, messageErrorNotString)

where: Value is NaN
% where : Value is NaN  0.7%

where: Length is over the limit
% where : Length is over the limit  0.6%

where: Not a valid IPv4
% where : Not a valid IPv4  0.6%

where: Not 18 or over by 1st Dec 2020
% where : Not 18 or over by 1st Dec 2020  4.5%



Unnamed: 0,Ref,ColumnName,ColumnValue,ColumnIssue,Fixed,FixedResult
6,64,Date_of_Birth,28/06/2003,Not 18 or over by 1st Dec 2020,False,
7,77,Date_of_Birth,18/10/2004,Not 18 or over by 1st Dec 2020,False,
15,119,Date_of_Birth,21/12/2003,Not 18 or over by 1st Dec 2020,False,
16,120,Date_of_Birth,23/06/2004,Not 18 or over by 1st Dec 2020,False,
20,133,Date_of_Birth,25/03/2003,Not 18 or over by 1st Dec 2020,False,
21,145,Date_of_Birth,21/03/2005,Not 18 or over by 1st Dec 2020,False,
31,249,Date_of_Birth,08/07/2005,Not 18 or over by 1st Dec 2020,False,
35,308,Date_of_Birth,26/07/2004,Not 18 or over by 1st Dec 2020,False,
37,328,Date_of_Birth,15/09/2004,Not 18 or over by 1st Dec 2020,False,
38,329,Date_of_Birth,30/09/2005,Not 18 or over by 1st Dec 2020,False,
