# In [None]:

import pandas as pd
import numpy as np

# Variable initialization
base_name = 'input_base'
input_path = 'path\\to\\input\\'
output_path = 'path\\to\\output\\'
final_name = 'Cleaned_Contact_List'

# Import data. dtype=str keeps every field as text, so phone numbers are
# never parsed as numbers.
base_e = pd.read_csv(input_path + base_name + '.csv', dtype=str)

# Create enumerated column: one unique integer per input row. It is used
# further down as a stand-in value for rows that have no phone.
base_e.insert(0, 'Column', range(len(base_e)))

base_e.head()

# Create base: keep only the columns the pipeline works with, in a fixed
# order. A column that is absent from the CSV is created empty.
base_e = base_e.reindex(columns=['Column', 'name', 'email', 'phone', 'phone2', 'score'])

# Rename columns
base_e.columns = ['Column', 'Name', 'Email', 'Phone', 'Phone2', 'Score']

# read_csv turns blank cells into NaN, but every later step detects blanks
# with `!= ''`. Normalise missing text to '' once here so those comparisons
# behave as written (NaN != '' is True, which made blanks look filled).
text_columns = ['Name', 'Email', 'Phone', 'Phone2', 'Score']
base_e[text_columns] = base_e[text_columns].fillna('')
base_e.head()

#--------------------------------
# PHONE 2 (MOBILE) FIELD
#--------------------------------

# Convert type to string. Missing values become '' instead of the text 'nan'.
base_e['Phone2'] = base_e['Phone2'].fillna('').astype(str)

# Numbers only: strip separators and punctuation.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which silently turned this step into a no-op.
base_e['Phone2'] = base_e['Phone2'].str.replace(r'[-,;´:()\[\].*+&/ ]', '', regex=True)

# Inconsistencies: remove runs of repeated filler digits.
base_e['Phone2'] = base_e['Phone2'].str.replace(
    '00000000|000000000|99999999|444444444|111111111|222222222|333333333',
    '',
    regex=True,
)

# Adding country code: only values made of exactly 11 digits get the '55'
# prefix; anything else (wrong length or leftover letters) is blanked.
is_mobile = base_e['Phone2'].str.fullmatch(r'[0-9]{11}')
base_e['Phone2'] = ('55' + base_e['Phone2']).where(is_mobile, '')

# Update Phone field: fall back to the mobile number when Phone is blank.
# Missing (NaN) phones are treated as blank too; NaN != '' is True, so
# without this the fallback never applied to them.
main_phone = base_e['Phone'].fillna('')
base_e['Phone'] = np.where(main_phone != '', main_phone, base_e['Phone2'])

# Drop Phone2 column
base_e.drop(['Phone2'], axis=1, inplace=True)

# Remove duplicates (first occurrence wins)
base_e = base_e.drop_duplicates(subset=['Email'])
base_e = base_e.drop_duplicates(subset=['Name'])

# Keep only filled emails. Missing (NaN) emails count as blank; a plain
# `!= ''` test lets NaN through. copy() makes the filtered frame independent
# so the column assignments below do not act on a slice.
base_e = base_e.loc[base_e['Email'].fillna('') != ''].copy()

# Remove spaces from Phone field (missing phones become '')
phone = base_e['Phone'].fillna('').astype(str).str.replace(' ', '', regex=False)

# Replace blank phones with Column field value: the row number is unique, so
# rows without a phone are not seen as duplicates of each other below.
# Previously NaN phones skipped this step, became the text 'nan', and were
# all collapsed into a single row by drop_duplicates.
row_id = base_e['Column'].astype(str)
base_e['Phone'] = phone.where(phone != '', row_id)

# Remove duplicate phones
base_e = base_e.drop_duplicates(subset=['Phone'])

# Insert TYPE field
base_e['Type'] = final_name

# Drop COLUMN field
base_e = base_e.drop(columns=['Column'])

# Blank every phone that is not exactly 13 characters long (this also clears
# the row-number placeholders inserted above).
# NOTE(review): this runs before the PHONE FIELD clean-up further down, so
# numbers that still carry punctuation or lack the country code are blanked
# here - confirm that is intended.
base_e['Phone'] = base_e['Phone'].where(base_e['Phone'].str.len() == 13, '')

# Insert LOCALE field: 'en' for rows that still have a phone, '' otherwise
base_e['Locale'] = np.where(base_e['Phone'] != '', 'en', '')

# NAME FIELD
#--------------------------------

# Unify names in uppercase
base_e['Name'] = base_e['Name'].str.upper()

# Letters only: strip punctuation.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which made this step a no-op.
base_e['Name'] = base_e['Name'].str.replace(r'[-:_´()\[\].*+&/]', '', regex=True)

# Remove accents: decompose each character (NFD) and drop the combining
# marks. This covers every accented letter; the previous hand-written list
# missed several uppercase ones (e.g. 'Ó', 'Â', 'Ô'), which are exactly what
# the names contain after upper().
base_e['Name'] = (
    base_e['Name']
    .str.normalize('NFD')
    .str.replace(r'[\u0300-\u036f]', '', regex=True)
)

# Inconsistencies: drop rows whose name contains a junk marker.
# - Raw string, so '\|' is a regex escape and not an invalid Python escape.
# - 'NAO USAR' replaces 'NÃO USAR', which could never match once the accents
#   were removed above.
# - '.COMM' is deliberately left as "any character + COMM": literal dots are
#   already stripped from names, so an escaped dot would never match.
#   NOTE(review): it also hits words such as 'TELECOMM' - confirm.
name_markers = (
    r"GMIL|\||NAO USAR|NAOINFORMADO|EXAMPLE|EXCLUDE|EMAIL|NOEMAIL|-|NOEXISTE"
    r"|@@|WRONG|UPDATE|GMAIL|DEACTIVATED|DUPLICATE|.COMM|@XXXX|CONTACT |TEST"
    r"|ATUALISAR|CONFIRM|WRONGEMAIL|XX|ZZZZZZ|ARCHITECT|INFORM|OUTLOOK|HOTMAIL"
)
# na=True: rows with a missing name are dropped, as they were before.
is_junk_name = base_e['Name'].str.contains(name_markers, regex=True, na=True)
base_e = base_e[~is_junk_name].copy()

# EMAIL FIELD
#--------------------------------

# Unify emails in lowercase
base_e['Email'] = base_e['Email'].str.lower()

# Remove accents: decompose each character (NFD) and drop the combining
# marks, so every accented letter is handled rather than a fixed list.
base_e['Email'] = (
    base_e['Email']
    .str.normalize('NFD')
    .str.replace(r'[\u0300-\u036f]', '', regex=True)
)

# Remove spaces
base_e['Email'] = base_e['Email'].str.replace(' ', '', regex=False)

# Inconsistencies: drop rows whose email contains a junk marker.
# - Raw string, so '\|' is a regex escape and not an invalid Python escape.
# - 'naousar' replaces 'não usar', which could never match after accents and
#   spaces were removed above.
# - The dot in '\.comm' is escaped so it matches a literal '.comm' only; as a
#   wildcard it also dropped addresses such as 'ecommerce@...'.
# NOTE(review): 'contact ' (trailing space) cannot match once spaces are
# removed, and 'gmail', 'hotmail', 'outlook', 'email' and '-' reject many
# ordinary addresses - confirm this list is intended for emails and is not a
# copy of the name markers.
email_markers = (
    r"gmil|\||naousar|naoinformado|example|exclude|email|noemail|-|noexiste"
    r"|@@|wrong|update|gmail|deactivated|duplicate|\.comm|@xxxx|contact |test"
    r"|atualisar|confirm|wrongemail|xx|zzzzzz|architect|inform|outlook|hotmail"
)
# na=True: rows with a missing email are dropped, as they were before.
is_junk_email = base_e['Email'].str.contains(email_markers, regex=True, na=True)
base_e = base_e[~is_junk_email].copy()

# PHONE FIELD
#--------------------------------

# Convert type to string. Missing values become '' instead of the text 'nan'.
phone = base_e['Phone'].fillna('').astype(str)

# Numbers only: strip separators and punctuation.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which made this step a no-op.
phone = phone.str.replace(r'[-,´;:()\[\].*+&/ ]', '', regex=True)

# Inconsistencies: remove runs of repeated filler digits.
phone = phone.str.replace(
    '00000000|000000000|99999999|444444444|111111111|222222222|333333333',
    '',
    regex=True,
)

# Adding country code. Earlier in this script Phone is already reduced to
# either '' or a 13-character value, so accepting only 11-character values
# here blanked every remaining phone. Numbers that already carry the '55'
# prefix are now kept, 11-digit numbers get the prefix, and anything else is
# blanked.
has_country_code = phone.str.fullmatch(r'55[0-9]{11}')
needs_country_code = phone.str.fullmatch(r'[0-9]{11}')
base_e['Phone'] = phone.where(
    has_country_code,
    ('55' + phone).where(needs_country_code, ''),
)

# Keep Locale in step with the final Phone value ('en' only when a phone
# survived the clean-up above).
base_e['Locale'] = np.where(base_e['Phone'] != '', 'en', '')

# View base
base_e.head()

# Final column order, selected by name instead of by position so it does not
# depend on the order in which columns were added or dropped.
base_e = base_e[['Name', 'Phone', 'Type', 'Locale', 'Email', 'Score']]
