In [172]:
import openclean
import glob
import pandas as pd
import numpy as np


In [173]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

In [174]:
import pandas as pd
from openclean.pipeline import stream

In [175]:
# Sort the matches: glob returns files in a filesystem-dependent order, and a
# later cell selects a dataset by its position in this list.
file_list = sorted(glob.glob("*.tsv.gz"))

In [176]:
def readData(file):
    """Load one dataset as a pandas DataFrame and as an openclean stream.

    Parameters
    ----------
    file : str
        Name of a tab-separated data file, relative to the working directory.

    Returns
    -------
    tuple
        ``(datafile, df, ds)`` where ``datafile`` is ``file`` prefixed with
        ``./``, ``df`` is the file read with every column as ``object`` dtype,
        and ``ds`` is the result of ``stream(datafile)``.
    """
    datafile = './' + file

    # Read everything as object so no value is reinterpreted before cleaning.
    frame = pd.read_csv(datafile, sep='\t', dtype='object')

    return datafile, frame, stream(datafile)

In [177]:
def findColumns(ds, column_name_list):
    """Return the columns of ``ds`` whose name contains any of the given substrings.

    Parameters
    ----------
    ds : object
        Anything exposing a ``columns`` iterable of column names.
    column_name_list : list of str
        Case-sensitive substrings to look for in each column name.

    Returns
    -------
    list
        Matching columns in the order they appear in ``ds.columns``. Each
        column is listed once, even when several substrings match it, so
        callers never process or report the same column twice.
    """
    return [
        col
        for col in ds.columns
        if any(name in col for name in column_name_list)
    ]

In [178]:
def fix_ID_Number_Column(df, ds):
    """Normalise identifier / number columns of ``df`` in place.

    Columns are selected with ``findColumns`` using ID/number-like substrings.
    For each one, missing values become ``''``, values are cast to ``str`` and
    upper-cased, spelled-out digits (``ONE`` .. ``NINE``) become the digit, and
    the placeholders ``NAN`` and ``NO NUMBER`` become ``''``.

    Tokens are matched after stripping surrounding whitespace. The earlier
    ``str.strip('')`` call stripped nothing, so padded tokens such as
    ``' ONE'`` were never replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns`` to select the columns.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    data_cols = findColumns(ds, ['ID','id','#','Number', 'number',' No',' NO'])

    token_replacements = {
        'ONE': '1',
        'TWO': '2',
        'THREE': '3',
        'FOUR': '4',
        'FIVE': '5',
        'SIX': '6',
        'SEVEN': '7',
        'EIGHT': '8',
        'NINE': '9',
        # Placeholders for "no value". Values are upper-cased before matching,
        # so a separate lower-case 'nan' entry is unnecessary.
        'NAN': '',
        'NO NUMBER': '',
    }

    for col in data_cols:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[col], which may act on a temporary copy.
        df[col] = df[col].fillna('').astype('str').str.upper()

        # Computed once: no replacement value is itself a token, so the masks
        # cannot influence each other.
        stripped = df[col].str.strip()
        for token, replacement in token_replacements.items():
            df.loc[stripped == token, col] = replacement

    return data_cols

In [179]:
def fix_Binary_Column(df, ds):
    """Convert flag-style columns of ``df`` to booleans in place.

    Columns are selected with ``findColumns`` using the substrings
    ``Landmarked``, ``Owned`` and ``Filled``. A value becomes ``True`` when it
    is ``'X'`` or ``'Y'`` (or already equal to ``True``); everything else,
    including missing values, becomes ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns`` to select the columns.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    data_cols = findColumns(ds, ['Landmarked','Owned', 'Filled'])

    for col in data_cols:
        values = df[col]

        # One explicit assignment replaces the chained fillna(inplace=True)
        # and the step-by-step .loc rewrites. Missing values compare False
        # everywhere, so they need no separate fill. `== True` keeps a column
        # that is already boolean unchanged if this runs on it again.
        is_set = (values == 'X') | (values == 'Y') | (values == True)
        df[col] = is_set.astype('bool')

    return data_cols

In [180]:
def fix_Monetary_Column(df, ds):
    """Turn cost / fee columns of ``df`` into non-negative floats, in place.

    Columns are selected with ``findColumns`` using the substrings ``Cost``,
    ``cost``, ``fee`` and ``Fee``. For each one, the literal ``$`` character is
    removed, the column is cast to ``float``, and negative amounts are
    multiplied by -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns`` to select the columns.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    money_cols = findColumns(ds, ['Cost','cost', 'fee', 'Fee'])

    for name in money_cols:
        # Literal replacement: "$" must not be treated as a regex anchor.
        without_symbol = df[name].str.replace("$", '', regex=False)
        df[name] = without_symbol
        df[name] = df[name].astype('float')

        is_negative = df[name] < 0
        df.loc[is_negative, name] *= -1

    return money_cols

In [181]:
def fix_Numerical_Column(df, ds):
    """Scrub placeholder text from measurement-style columns of ``df`` in place.

    Columns are selected with ``findColumns`` using unit / height / length /
    footage / sqft substrings. In each one, every ``-`` is removed and every
    occurrence of ``NONE``, ``none``, ``NAN`` or ``nan`` is replaced by ``0``.
    These are literal substring replacements, and the values stay strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns`` to select the columns.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    numeric_cols = findColumns(ds, ['Units','units', 'Height', 'height', 'Length', 'length', 'Footage', 'footage', 'Sqft', 'sqft'])

    # Applied in this order to every selected column.
    substitutions = (
        ('-', ''),
        ('NONE', '0'),
        ('none', '0'),
        ('NAN', '0'),
        ('nan', '0'),
    )

    for name in numeric_cols:
        for old, new in substitutions:
            df[name] = df[name].str.replace(old, new, regex=False)

    return numeric_cols

In [182]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(df, ds, column_name, eps_setting = 0.05):
    """Return what ``DBSCANOutliers().find`` reports for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    ds : object
        Not used by this function.
    column_name : str
        Name of the column whose values are passed to the detector.
    eps_setting : float, optional
        NOTE(review): accepted but never forwarded, so the detector runs with
        its own defaults. Confirm whether it was meant to configure
        ``DBSCANOutliers``; wiring it in would change which values are flagged.

    Returns
    -------
    object
        Whatever ``DBSCANOutliers().find`` returns for the column; callers in
        this notebook iterate over it.
    """
    column_values = df[column_name]
    return DBSCANOutliers().find(column_values)

In [183]:
def fix_datetime_Column(df, ds):
    """Blank out outlier values in date columns of ``df`` in place.

    Columns are selected with ``findColumns`` using the substrings ``date``,
    ``Date`` and ``DATE``. Every value that ``findDateOutliers`` reports for a
    column is set to ``None``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns`` and ``findDateOutliers``.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    data_cols = findColumns(ds, ['date', 'Date', 'DATE'])

    for col in data_cols:
        light_outliers = findDateOutliers(df, ds, col)

        # Null all outliers with one mask rather than replace(item, None) per
        # item. Older pandas treats an explicit None there as "no value given"
        # and forward-fills, and the loop rescans the column for every outlier.
        is_outlier = df[col].isin(list(light_outliers))
        df.loc[is_outlier, col] = None

    return data_cols

In [184]:
from openclean.data.refdata import RefStore

# Load the US cities reference table; auto_download=True lets the store fetch
# it when it is not already available.
refdata = RefStore()
city_df = refdata.load(
    'encyclopaedia_britannica:us_cities',
    auto_download=True,
).df()


In [185]:
# Cluster strings with the kNN clusterer (using the default n-gram setting),
# with the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(df, ds, col, minsize = 2, preds = 0.5):
    """Cluster the distinct values of one column with ``knn_clusters``.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by this function.
    ds : object
        Dataset stream; the values come from ``ds.select(col).distinct()``.
    col : str
        Column whose distinct values are clustered.
    minsize : int, optional
        Passed to ``knn_clusters`` as ``minsize``.
    preds : float, optional
        Threshold given to ``GreaterThan`` for the Levenshtein similarity
        constraint.

    Returns
    -------
    object
        The result of ``knn_clusters``.
    """
    distinct_values = ds.select(col).distinct()
    constraint = SimilarityConstraint(
        func=LevenshteinDistance(),
        pred=GreaterThan(preds),
    )
    return knn_clusters(values=distinct_values, sim=constraint, minsize=minsize)

def print_cluster(cnumber, cluster):
    """Print one cluster: its values with counts and the suggested replacement.

    This was a stub that printed nothing, so requesting cluster output had no
    visible effect.

    Parameters
    ----------
    cnumber : int
        Position of the cluster, shown in the heading.
    cluster : mapping
        Maps each value to its count and provides ``suggestion()``.
    """
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        print('{} ({})'.format(val, count))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def updateUsingClusters(df, ds, col, clusters, isPrint = False):
    """Replace each clustered value in ``df[col]`` with its cluster's suggestion.

    All clusters are applied. The earlier version reset its value/suggestion
    lists on every loop iteration, so only the last (smallest) cluster ever
    reached the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; ``df[col]`` is reassigned in place.
    ds : object
        Not used by this function.
    col : str
        Column to update.
    clusters : iterable
        Clusters mapping value -> count and providing ``suggestion()``. The
        caller's sequence is not reordered.
    isPrint : bool, optional
        When true, the five largest clusters are passed to ``print_cluster``.
    """
    # Largest first: used for printing, and a value found in several clusters
    # takes the suggestion of the largest one.
    ordered = sorted(clusters, key=len, reverse=True)

    replacements = {}
    for i, cluster in enumerate(ordered):
        suggestion = cluster.suggestion()
        if isPrint and i < 5:
            print_cluster(i, cluster)

        for val, _count in cluster.items():
            replacements.setdefault(val, suggestion)

    if replacements:
        # Plain dictionary lookup per cell; values outside every cluster,
        # including missing values, pass through unchanged.
        df[col] = df[col].map(lambda value: replacements.get(value, value))

In [None]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex


def fix_city_and_name_Column(df, ds, file):
    """Clean name / city / borough columns of ``df`` in place.

    For every selected column this:

    1. applies the hand-curated ``outliers`` -> ``mappings`` replacement table
       (``None`` in the table means "drop the value");
    2. sets every value reported by ``findDateOutliers`` to ``None``;
    3. for name columns, unless ``file`` is ``'hg8x-zxpr.tsv.gz'``, clusters
       the values with ``getClusters`` and applies the suggestions through
       ``updateUsingClusters``;
    4. sets the placeholders ``'N/A'``, ``''``, ``'NA'`` and ``'NONE'`` to
       ``None``.

    Steps 2 and 4 use a single ``isin`` mask each. The earlier
    ``replace(value, None)`` form forward-fills instead of nulling on older
    pandas, which treats an explicit ``None`` as "no value given".

    NOTE(review): each ``outlierN`` list must have the same length as its
    ``mappingN`` list, because the two are paired by position. A count of
    ``outlier1`` / ``mapping1`` gives 101 entries against 100, so step 1 is
    expected to fail with pandas' list-length error until that pair is
    reconciled. ``outlier3`` / ``mapping3`` (94 each) and ``outlier4`` /
    ``mapping4`` (44 each) match; ``outlier2`` / ``mapping2`` is unchecked.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; modified in place.
    ds : object
        Dataset handle passed to ``findColumns``, ``findDateOutliers`` and
        ``getClusters``.
    file : str
        File name of the dataset, used only for the clustering exclusion.

    Returns
    -------
    list
        The column names that were selected for cleaning.
    """
    data_cols = findColumns(ds, ['NAME', 'name', 'Name','city', 'City', 'CITY', 'BOROUGH', 'Borough', 'borough'])
    name_cols = findColumns(ds, ['NAME', 'name', 'Name'])

    # mapping list to replace outliers
    outlier1 = ['', 'MR. ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM 11', 'JOSEP;H``', 'DAID/11/2007', 'CHUNG   LUN', '718 9215010', 'ANTHONY', 'HSIA0-NAN', 'JOSEPH', '``````````', 'ROBERT  `', 'RAJENDRA9956700', '2', 'G.B.M.', 'EUGENE......JR', '6312100', 'CLAUDE,JR.', 'THOMAS``', 'ALAN  L', 'Nab53', 'MR. Y. B', 'J.J', 'PH8ILIP', 'I. M', 'RICHARD', 'ALBERTA S 111 D', 'P ;', 'GENECG.C. ENG &', 'J.J.', '2126202794', 'SHAW  HWA', 'HARRY         H', 'MR DOU8GLAS', '`1D', 'PAUL', 'K. T.', 'JOHN', '...NORMAN', 'EVAN   D', '7184361278BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MAD/Y/ARNI', 'ES ON SCH B', 'EUGENE.......JR', 'NEAL', 'F._ERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0-TECH', 'RODNEY   __', 'DAVID', 'G. L.', 'JAMES', 'LESLI8E', '7186054055', 'GEORGE', 'G.B.M', 'DAVID    JON', 'CHUNG---YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', '1P', 'JUDE.....N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD/HON-AN', 'GLEN  A.L.', 'J.B. Jr.', 'LORENZO..A', 'J J', '..RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY 2', '...JOSEPH', 'RUSSELL 111', 'THOMAS', 'H./E./CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', '--young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']
    mapping1 = [None, 'ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM', 'JOSEPH', None, 'CHUNG LUN', None, 'ANTHONY', 'HSIA0 NAN', 'JOSEPH', None, 'ROBERT', 'RAJENDRA', None, 'G.B.M.', 'EUGENEJR', None, 'CLAUDE JR.', 'THOMAS', 'ALAN  L', 'Nab', 'MR. Y. B', 'J.J', 'PHILIP', 'I. M', 'RICHARD', 'ALBERTA', None, 'GENECG.C. ENG', 'J.J.', None, 'SHAW HWA', 'HARRYH', 'MR DOUGLAS', None, 'PAUL', 'K. T.', 'JOHN', 'NORMAN', 'EVAND', 'BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MADYARNI', 'ES ON SCH B', 'EUGENEJR', 'NEAL', 'FERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0 TECH', 'RODNEY', 'DAVID', 'G. L.', 'JAMES', 'LESLIE', None, 'GEORGE', 'G.B.M', 'DAVID JON', 'CHUNG YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', None, 'JUDE N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD HON-AN', 'GLEN A.L.', 'J.B. Jr.', 'LORENZOA', 'J J', 'RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY', 'JOSEPH', 'RUSSELL', 'THOMAS', 'H.E.CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', 'young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']

    outlier2 = ['SHARMA #0', "0'CONNOR", 'RUSHTON    UEL', 'UDDIN   Z', 'HINKLEY 1', 'O&#039;CONNOR, P.E.', '.OOK', 'SAMUELS111', 'O&#039;CONNOR', 'CALIENDO', 'SMITH   JR.', 'LO  BUE', '7AN', '+-+ETTIERI', 'SMITH, 111', 'KAMEN   1', '.EE', 'MASS, 1', '.EI', 'Zagaroli 3rd', 'RINI   II', 'KAMEN   R', 'RYAN 11', 'SPI8EZIA L S', 'MUFTIC..A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL, P.E.', 'HAMA07', 'HINLEY,1', '1212', "O  ' CONNELL", 'HURT,JR.,', 'WESOLOWSKI', 'CHEN', '`ING, R.A', 'MARTARELLA 111', 'Gandhi, Ph.D., P.E.', '90I', 'ENNIS 2', 'COSTELLO R A A I A', '3UI', 'N/A', 'HURT,  JR', 'LEHR,1', 'KOHLER, 111', 'GERAZOUNIS', 'Alexander,1', 'LUBOW, R.A. LEED AP', 'RINI,111', '08CZAK', '````````````````````', 'CHAO  R.A.', 'Geier 11', '08NGEL', '08SOLOWSKI', 'I11', 'HINKLEY, 1', 'RUDIKOFF, P.E.', "O'CONNOR", 'SHAH   EZ', 'MIELE, JR., P.E.', 'RITTENHOUSE 111', 'AMADI   ISIOFIA', 'HINKLEY,1', 'RENFORE````````', "O'HARA,JR.", '73020012', 'PHAGOO   I', 'BRAY.....,', 'LLL', 'BHATHIA,1', 'GANDHI, PH. D., P.E', 'KO K', 'VASSALOTTI 11', 'HURT, JR .', '0018LKLE', 'RINI -111', 'PARIHAR', 'EE', 'L00802', 'ELISE.111', 'KING , R.A', 'CHRYSLER  P E', 'LEHR 1', 'Walters   Jr.', 'LEE', 'RINI  III', 'D&#039;ANGELO', '0UDOLPH III', 'VIEHE-NAESS 111', ',MO', '08E', '47DIKOFF', 'Yu,', '420865380', 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', '901BEN', '4153LOO', 'SYED-NAQVI', 'RYAN , JR.', 'K O K O R I S', 'ELISEO111', 'O&#039;CONNELL', 'ZEID61', '---Lewis', '00CHELI', 'MOHAMMAD       +++++', 'METZLER  P E', 'BAILEY', 'GANDHI, PH. D., P.E.', 'TIEMANN.111', 'SMITH.111', 'DI GER0NIMO', 'GANDHI, PH,D., P.E', 'III', 'J C', 'MAGAMI-QAIM-MAGAMI', '+M', 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', 'Y10007OR', 'SMITH,111', 'KING R A FAIA', 'RYAN III, AIA', '08AN', 'STARK 1', 'MASS', 'VICTORI0, R.A', 'RIZVI   A', '21029677', "3'CONNOR", 'Wong /  Lai', 'KAPLAN 3', 'GRAICHEN.JR./DAWN/DI', 'GROSSMAN ,PE,F.A.C.I']
    mapping2 = ['SHARMA ', "CONNOR", 'RUSHTON UEL', 'UDDIN Z', 'HINKLEY ', 'CONNOR P.E.', None, 'SAMUELS', 'CONNOR', 'CALIENDO', 'SMITH JR.', 'LO BUE', None, 'ETTIERI', 'SMITH', 'KAMEN', '.EE', 'MASS', '.EI', 'Zagaroli', 'RINI', 'KAMEN R', 'RYAN', 'SPIEZIA L S', 'MUFTIC.A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL P.E.', 'HAMA', 'HINLEY', None, "CONNELL", 'HURT JR.', 'WESOLOWSKI', 'CHEN', 'ING R.A', 'MARTARELLA', 'Gandhi', None, 'ENNIS ', 'COSTELLO R A A I A', None, None, 'HUR  JR', 'LEHR', 'KOHLER 111', 'GERAZOUNIS', 'Alexander', 'LUBOW R.A. LEED AP', 'RINI',None, None, 'CHAO R.A.', 'Geier', None, 'SOLOWSKI', None, 'HINKLEY', 'RUDIKOFF, P.E.', "CONNOR", 'SHAH EZ', 'MIELE JR. P.E.', 'RITTENHOUSE', 'AMADI   ISIOFIA', 'HINKLEY', 'RENFORE', "O'HARA,JR.", None, 'PHAGOO I', 'BRAY,', 'LLL', 'BHATHIA', 'GANDHI', 'KO K', 'VASSALOTTI', 'HURT JR.',None, 'RINI', 'PARIHAR', 'EE', None, 'ELISE', 'KING R.A', 'CHRYSLER  P E', 'LEHR', 'Walters Jr.', 'LEE', 'RINI  III', 'ANGELO', '0UDOLPH III', 'VIEHE-NAESS', 'MO', '08E', None, 'Yu,', None, 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', None, None, None, 'RYAN JR.', 'KOKORIS', 'ELISE', 'CONNELL', None, 'Lewis', 'CHELI', 'MOHAMMAD', 'METZLER  P E', 'BAILEY', 'GANDHI', 'TIEMANN', 'SMITH', 'DI GER0NIMO', 'GANDHI', 'III', 'J C', 'MAGAMI QAIM MAGAMI', None, 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', None, 'SMITH', 'KING R A FAIA', 'RYAN III AIA', None, 'STARK', 'MASS', 'VICTORI0 R.A', 'RIZVIA', None, "CONNOR", 'Wong Lai', 'KAPLAN', 'GRAICHEN.JR. DAWN DI', 'GROSSMAN']

    outlier3 = ['', '....DEMO', '050069', 'DEM. CONTR.,', 'XXXXX', 'G/C 10114H9', 'CGWC10114H99', '00', 'X S000155', '082-36-1245', 'G.G', 'LESSEE', '......GC', "'", '..OWNER', 'GC 2293', '--', 'XXXXXX', 'LS 31,721', '...GC', 'gen.cont.', 'G.C TK#4592', 'PE', 'RLA - 818', '.....OWNER', 'RLA 16077', 'G C', 'X 4129892', 'G. C.', 'R.L.A', 'GC 1028350', 'WC10114H99', 'LEESEE', 'GEN.CONT.', 'SIGN..HANGER', 'DEMO 20451', 'D8615', '.X', 'P.L.L.C', '..DEMO', 'G .C', 'L A', 'G.C NY11101', '32820', '....OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC99792', 'X 1341946', 'TRACK# 1390', 'EXPED.R4466', 'PLLC 9599691', 'G.C 1110101', '029649', '(CHECK)', 'DEM. CONTR,', 'EXPEDIT(H66172)', '.........GC', 'CITY OF N Y', 'GC 1170386', 'G. C', 'CO0OWNER', '(CHECKED)', 'C.C', '23392 1159774', 'DEMO {', 'RA', 'T. 31132', '....GC', 'RLA-787', 'TRACK #1390', 'D C', 'G.CONTR.', 'DEMO  CONT', '1GC', 'CC', 'demo G.C.', 'TRACK. #1390', 'M.F.S.P.C.', '...DEMO', 'DEMO G C', '13328', 'GEN  CONT', 'GC 1221073', "GC;'", 'DEMO 1341946', '11234', 'G.C.,', '.....GC', 'LIC.133668259 1', '?', '0WNER', 'C10892', 'GEN..CONT']
    mapping3 = [None, 'DEMO', None, 'DEM. CONTR', None, 'G/C', 'CGWC', None, 'X S', None, 'G.G', 'LESSEE', 'GC', None, 'OWNER', 'GC', None, None, 'LS ', 'GC', 'gen.cont.', 'G.C TK', 'PE', 'RLA ', 'OWNER', 'RLA ', 'G C', 'X', 'G. C.', 'R.L.A', 'GC', 'WC', 'LEESEE', 'GEN.CONT.', 'SIGN.HANGER', 'DEMO', None,None, 'P.L.L.C', 'DEMO', 'G.C', 'L A', 'G.C ', None, 'OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC', None, 'TRACK', 'EXPED.R', 'PLLC ', 'G.C', None, None, 'DEM. CONTR,', 'EXPEDIT', 'GC', None, 'GC', 'G.C', 'CO0OWNER', None, 'C.C', None, 'DEMO', 'RA', None, 'GC', 'RLA', None, 'D C', 'G.CONTR.', 'DEMO  CONT', 'GC', 'CC', 'demo G.C.', None, 'M.F.S.P.C.', 'DEMO', 'DEMO G C', None, 'GEN  CONT', 'GC ', "GC ", 'DEMO ', None, 'G.C.', 'GC', 'LIC', None, '0WNER',None, 'GEN.CONT']

    outlier4 = ['', '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '99998', '000N/A', '65569+', '01827O', 'R9526', 'LP0256', 'N/A', '1964', 'ISLAND', '1609', '000PW1', '00DEMO', '0688.6', '00000', '.20929', 'LP0258', '000TOR', '0D8615', '0SWITA', '818', 'O02200', 'DEMO', '196', '1075', '0000NT', '215', '0', '00000`', "D'ALTO", '0455', '22377', 'DD8615', '050579', '226', 'SWITA', 'DD6815', 'X02689']
    mapping4 = [None, '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '099998', '000000', '065569', '01827O', '0R9526', 'LP0256',None, '001964',None, '001609', '000PW1', '00DEMO', '006886', '000000', '020929', 'LP0258', '000TOR', '0D8615', '0SWITA', '000818', 'O02200', None, '000196', '001075', '0000NT', '000215', '000000', '000000', None, '000455', '022377', 'DD8615', '050579', '000226', None, 'DD6815', 'X02689']

    outliers = outlier1+ outlier2+ outlier3+ outlier4
    mappings = mapping1+ mapping2+ mapping3+ mapping4

    # Placeholder spellings that mean "no value".
    missing_tokens = ['N/A', '', 'NA', 'NONE']

    for col in data_cols:

        df[col] = df[col].replace(outliers, mappings)

        # Null every reported outlier in one pass; see the docstring for why
        # replace(item, None) is avoided.
        light_outliers = findDateOutliers(df, ds, col)
        df.loc[df[col].isin(list(light_outliers)), col] = None

        if file != 'hg8x-zxpr.tsv.gz' and col in name_cols:
            col_clusters = getClusters(df, ds, col)
            updateUsingClusters(df, ds, col, col_clusters, True)

        df.loc[df[col].isin(missing_tokens), col] = None

    return data_cols

In [187]:
def saveDf(df, datafile):
    """Write the cleaned frame to ``<dataset id>_cleaned_data_improved.csv``.

    The dataset id is the file name of ``datafile`` up to its first dot, so
    ``'./rbx6-tga4.tsv.gz'`` gives ``rbx6-tga4``. The earlier fixed slice
    ``datafile[2:11]`` only worked for a ``./`` prefix followed by an id of
    exactly nine characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    datafile : str
        Path of the source data file; only its file name is used.

    The CSV is comma-separated, with a header and without the index, and is
    written to the current working directory.
    """
    dataset_id = os.path.basename(datafile).split('.', 1)[0]
    outputpath = dataset_id + '_cleaned_data_improved.csv'
    df.to_csv(outputpath, sep=',', index=False, header=True)

In [188]:
def dataCleanOnDataset(file):
    """Run every cleaning step on one dataset and save the result.

    Parameters
    ----------
    file : str
        File name of the dataset, passed to ``readData``.

    Returns
    -------
    tuple
        ``(cleaned_cols, datafile, df)``: the column names returned by the
        steps, concatenated in step order; the path returned by ``readData``;
        and the cleaned DataFrame.
    """
    print('working on file: ', file)
    datafile, df, ds = readData(file)

    # (progress message, step) pairs, executed top to bottom. Every step edits
    # df in place, so later steps see the results of earlier ones.
    steps = (
        ('fixing ID_Number_Column......', lambda: fix_ID_Number_Column(df, ds)),
        ('fixing Binary_Column......', lambda: fix_Binary_Column(df, ds)),
        ('fixing Monetary_Column......', lambda: fix_Monetary_Column(df, ds)),
        ('fixing Numerical_Column......', lambda: fix_Numerical_Column(df, ds)),
        ('fixing datetime_Column......', lambda: fix_datetime_Column(df, ds)),
        ('fixing city_and_name_Column......', lambda: fix_city_and_name_Column(df, ds, file)),
    )

    cleaned_cols = []
    for message, run_step in steps:
        print(message)
        cleaned_cols += run_step()

    saveDf(df, datafile)

    return cleaned_cols, datafile, df

In [None]:
# Clean the dataset at position 3 of file_list, then show which columns the
# cleaning steps selected.
(cleaned_cols, datafile, df) = dataCleanOnDataset(file=file_list[3])
print(cleaned_cols)

working on file:  rbx6-tga4.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......


In [None]:
def precision(tp, fp):
    """Return precision, ``tp / (tp + fp)``.

    Parameters
    ----------
    tp : int or float
        Number of true positives.
    fp : int or float
        Number of false positives.

    Returns
    -------
    float
        The precision, or ``0.0`` when ``tp + fp`` is zero (nothing was
        predicted positive), instead of raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def recall(tp, fn):
    """Return recall, ``tp / (tp + fn)``.

    Parameters
    ----------
    tp : int or float
        Number of true positives.
    fn : int or float
        Number of false negatives.

    Returns
    -------
    float
        The recall, or ``0.0`` when ``tp + fn`` is zero (there were no actual
        positives), instead of raising ``ZeroDivisionError``.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

In [None]:
def precision_recall(cleaned_columns, datafile, df):
    """Print original and cleaned values side by side for manual review.

    The original file is re-read, exact duplicate rows are dropped, and the
    first rows (at most 50) are compared with the same rows of ``df`` for
    every cleaned column. Unchanged values are counted per column; changed
    values are marked with ``*``.

    Differences from the earlier version:

    * rows are matched by index label, because after ``drop_duplicates`` a
      positional lookup compares different records;
    * values are printed original first, matching the printed header;
    * the sample shrinks to the number of available rows instead of failing;
    * two missing values count as "same";
    * the random sample that was computed but never used is removed.

    Parameters
    ----------
    cleaned_columns : list
        Column names to compare.
    datafile : str
        Path of the original tab-separated file.
    df : pandas.DataFrame
        Cleaned frame; assumed to keep the row index it had when read from
        ``datafile``.
    """
    original = pd.read_csv(datafile, dtype='object', sep='\t')
    original = original.drop_duplicates()

    sample_size = min(50, len(original))
    sample_rows = original.head(sample_size)

    print('sample size: ',sample_size)

    print('total size: ',sample_size * len(cleaned_columns))
    print('======================\n\n')

    for col in cleaned_columns:
        print("column: ", col)
        print("Original,\t Cleaned\n")

        same = 0
        for label in sample_rows.index:
            original_value = sample_rows.at[label, col]
            cleaned_value = df.at[label, col]

            # NaN never equals NaN, so handle "missing in both" explicitly.
            both_missing = pd.isna(original_value) and pd.isna(cleaned_value)
            if both_missing or original_value == cleaned_value:
                print(original_value, '\t', cleaned_value, '\t')
                same += 1
            else:
                print(original_value, '\t', cleaned_value, '\t*')

        print('*   ', same, ' same records   *\n')

        print('======================\n\n')