# General version of data cleaning

This code will automatically perform data cleaning on each of the dataset in the directoty, to run this notebook you need to download all the datasets and place them in the current directoty with the notebook.

Because we automatically find columns and fix them, we can not afford clustering to find outliers for each datasets so remove it in this notebook, the precision and recall rate might be influenced.

Run the code in oreder to perform data clean, cleaned data will be save in the current directoty, and to calculate the precision and recall rate, you will need to manually inspect the result.

In [1]:
import openclean
import glob
import pandas as pd
import numpy as np
import re

In [2]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

In [3]:
import pandas as pd
from openclean.pipeline import stream

In [4]:
# dataset_list = ['hg8x-zxpr','w9ak-ipjd','dm9a-ab7w','bx6-tga4','ipu4-2q9a','xubg-57si', 'bty7-2jhb', 'hcir-3275, 'pitm-atqc','iz2q-9x8d']
# dataset_names = ['Housing New York Units by Building',
#                  'DOB NOW: Build – Job Application Filings',
#                  'DOB NOW: Electrical Permit Applications', 
#                  'DOB NOW: Build – Approved Permits',
#                  'DOB Permit Issuance', 
#                  'DOB NOW: Safety – Facades Compliance Filings',
#                  'Historical DOB Permit Issuance', 
#                  'Buildings Selected for the Alternative Enforcement Program (AEP)', 
#                  'Open Restaurant Applications', 
#                  'DOB Cellular Antenna Filings']

# dataset = Socrata().dataset('pitm-atqc')
# datafile = './pitm-atqc.tsv.gz'

# if not os.path.isfile(datafile):
#     with gzip.open(datafile, 'wb') as f:
#         print('Downloading ...\n')
#         dataset.write(f)


# fsize = humanfriendly.format_size(os.stat(datafile).st_size)
# print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

In [5]:
# DOB Cellular Antenna Filings: iz2q-9x8d
# Open Restaurant Applications: pitm-atqc
# DOB NOW: Safety – Facades Compliance Filings: xubg-57si
# DOB Permit Issuance: ipu4-2q9a



In [6]:
file_list = glob.glob("*.tsv.gz")

In [7]:
file_list

['ipu4-2q9a.tsv.gz',
 'iz2q-9x8d.tsv.gz',
 'pitm-atqc.tsv.gz',
 'xubg-57si.tsv.gz']

In [8]:
def readData(file):
    
    datafile = './'+file
    
    df  = pd.read_csv(datafile, dtype='object', sep='\t')
    #ds = stream(datafile)
    
    return datafile, df#, ds

In [9]:
def fixColumnNames(df):
    rename_list = list(df.columns)
    rename_dict = dict()

    for i in rename_list:
        col_name = str(i)

        col_name = col_name.strip().replace("_", " ").replace("’", "'").replace(".", "")


        # https://stackoverflow.com/questions/2277352/split-a-string-at-uppercase-letters
        # Split on upper case to seperate cocnatenated words:
        if (not col_name.islower()) and (not col_name.isupper()) and (col_name.find(" ") == -1):
            col_name = " ".join(re.sub("([A-Z])", r" \1", col_name).split())


        if col_name.islower(): 
            col_name = col_name.title()

        if (col_name.isupper()) and (col_name.find(" ") != -1):
            col_name = col_name.title()

        col_name = col_name.replace("No", "Number")
        col_name = col_name.replace("#", "Number")

        rename_dict[i] = col_name
    return rename_dict    

In [10]:
def findColumns(df, column_name_list):
    data_cols = []

    for col in df.columns:
        for name in column_name_list:
            if name.lower() in col.lower():
                data_cols.append(col)
                
    return  data_cols        

In [36]:
def fix_House_Number_Column(df, col):
    df[col] = df[col].str.replace(pat='(?P<one>\\d)(?P<two>[A-Z]+)', repl='\g<one> \g<two>', regex=True)
    df[col] = df[col].str.replace(pat='(?P<one>GAR$)', repl='GARAGE', regex=True)
    df[col] = df[col].str.replace(pat='NORTH([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='EAST([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='SOUTH([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='WEST([A-Z]+)?', repl='', regex=True)
    df.loc[(~df[col].str.contains('\\d', regex=True)), col] = ''


In [37]:
def fix_Phone_Number_Column(df, col):
    
    df[col] = df[col].str.replace(pat="[^\\d]", repl="", regex=True)    
    df[col] = df[col].str.lstrip('1')
    df.loc[df[col].str.len()!=10, col] = ''

    #return data_cols

In [38]:
def fix_ID_Number_Column(df):
    data_cols = findColumns(df, ['number', 'Number',' No',' NO'])
    
    for col in data_cols:
        
        df[col].fillna('', inplace=True)
        df[col] = df[col].astype('str')
        df[col] = df[col].str.upper()

        df.loc[df[col].str.strip('')=='ONE', col] = '1'
        df.loc[df[col].str.strip('')=='TWO', col] = '2'
        df.loc[df[col].str.strip('')=='THREE', col] = '3'
        df.loc[df[col].str.strip('')=='FOUR', col] = '4'
        df.loc[df[col].str.strip('')=='FIVE', col] = '5'
        df.loc[df[col].str.strip('')=='SIX', col] = '6'
        df.loc[df[col].str.strip('')=='SEVEN', col] = '7'
        df.loc[df[col].str.strip('')=='EIGHT', col] = '8'
        df.loc[df[col].str.strip('')=='NINE', col] = '9'

        df.loc[df[col].str.strip('')=='NONE', col] = ''
        df.loc[df[col].str.strip('')=='none', col] = ''
        df.loc[df[col].str.strip('')=='None', col] = ''

        df.loc[df[col].str.strip('')=='NAN', col] = ''
        df.loc[df[col].str.strip('')=='nan', col] = ''
        df.loc[df[col].str.strip('')=='NaN', col] = ''
        
        df.loc[df[col].str.strip('')=='NO NUMBER', col] = ''
    
        if ('house' in col.lower()) or ('building' in col.lower()):
            fix_House_Number_Column(df, col)
        if ("phone" in col.lower()):
            fix_Phone_Number_Column(df, col)
    
    return data_cols

In [39]:
def fix_Binary_Column(df):
    #data_cols = findColumns(ds, ['Landmarked','Owned', 'Filled'])
    #
    data_cols = []

    for col in df.columns:
        # If column has small # of values, and potentially boolean values like Yes/ No, Y/N, X, True, etc.
        if (df[col].nunique() < 5) and (df[df[col].str.lower()==('y|yes|x|true|n|no|false')][col].count() != 0):
            data_cols.append(col)
        
        
    for col in data_cols:
        
        df.loc[df[col].str.lower().isin(['y|yes|x|true']), col] = True
        df.loc[df[col].str.lower().isin(['n|no|nan|false']), col] = False
        df[col].fillna(False, inplace=True)
        
        if df[col].nunique()==2:
            df[col] = df[col].astype('bool')
        
    return data_cols

In [40]:
def fix_Monetary_Column(df):
    data_cols = findColumns(df, ['Cost','cost', 'fee', 'Fee'])
    
    for col in data_cols:
        
        df[col] = df[col].astype('str')
        df[col] = df[col].str.replace("$", '', regex=False)
        df[col] = df[col].str.replace("-", '', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return data_cols

In [41]:
def fix_Numerical_Column(df):
    data_cols = findColumns(df, ['Units','units', 'Height', 'height', 'Length', 'length', 'Footage', 'footage', 'Sqft', 'sqft'])
    
    for col in data_cols:
        df[col] = df[col].astype('str')

        df[col] = df[col].str.replace('-', '', regex=False)
        df[col] = df[col].str.replace('NONE', '0', regex=False)
        df[col] = df[col].str.replace('none', '0', regex=False)
        df[col] = df[col].str.replace('NAN', '0', regex=False)
        df[col] = df[col].str.replace('NaN', '0', regex=False)
        df[col] = df[col].str.replace('nan', '0', regex=False)
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
    return data_cols

In [17]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(df, column_name, eps_setting = 0.05):
    datetime_data = df[column_name]

    light_outliers = DBSCANOutliers().find(datetime_data)
    
    return light_outliers

In [18]:
def fix_datetime_Column(df):
    data_cols = findColumns(df, ['date', 'Date', 'DATE'])
    
    for col in data_cols:
    
        
        light_outliers = findDateOutliers(df, col)
        
        for item in light_outliers:
            
            df[col] = df[col].replace(item, None)
            df[col] = pd.to_datetime(df[col], errors='coerce')
         
    return data_cols

In [19]:
from openclean.data.refdata import RefStore

refdata = RefStore()
city_df = refdata\
    .load('encyclopaedia_britannica:us_cities', auto_download=True)\
    .df()


In [20]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(df, col, minsize = 2, preds = 0.5): #ds
    dba = df.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(preds)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    item_count = 0

def updateUsingClusters(df, col, clusters, isPrint = False): # ds,
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i <5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
    
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [21]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex


def fix_city_and_name_Column(df, file): #ds,
    data_cols = findColumns(df, ['NAME', 'name', 'Name','city', 'City', 'CITY', 'BOROUGH', 'Borough', 'borough'])
    name_cols = findColumns(df, ['NAME', 'name', 'Name'])
    
    # mapping list to replace outliers
    outlier1 = ['', 'MR. ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM 11', 'JOSEP;H``', 'DAID/11/2007', 'CHUNG   LUN', '718 9215010', 'ANTHONY', 'HSIA0-NAN', 'JOSEPH', '``````````', 'ROBERT  `', 'RAJENDRA9956700', '2', 'G.B.M.', 'EUGENE......JR', '6312100', 'CLAUDE,JR.', 'THOMAS``', 'ALAN  L', 'Nab53', 'MR. Y. B', 'J.J', 'PH8ILIP', 'I. M', 'RICHARD', 'ALBERTA S 111 D', 'P ;', 'GENECG.C. ENG &', 'J.J.', '2126202794', 'SHAW  HWA', 'HARRY         H', 'MR DOU8GLAS', '`1D', 'PAUL', 'K. T.', 'JOHN', '...NORMAN', 'EVAN   D', '7184361278BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MAD/Y/ARNI', 'ES ON SCH B', 'EUGENE.......JR', 'NEAL', 'F._ERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0-TECH', 'RODNEY   __', 'DAVID', 'G. L.', 'JAMES', 'LESLI8E', '7186054055', 'GEORGE', 'G.B.M', 'DAVID    JON', 'CHUNG---YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', '1P', 'JUDE.....N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD/HON-AN', 'GLEN  A.L.', 'J.B. Jr.', 'LORENZO..A', 'J J', '..RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY 2', '...JOSEPH', 'RUSSELL 111', 'THOMAS', 'H./E./CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', '--young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']
    mapping1 = [None, 'ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM', 'JOSEPH', None, 'CHUNG LUN', None, 'ANTHONY', 'HSIA0 NAN', 'JOSEPH', None, 'ROBERT', 'RAJENDRA', None, 'G.B.M.', 'EUGENEJR', None, 'CLAUDE JR.', 'THOMAS', 'ALAN  L', 'Nab', 'MR. Y. B', 'J.J', 'PHILIP', 'I. M', 'RICHARD', 'ALBERTA', None, 'GENECG.C. ENG', 'J.J.', None, 'SHAW HWA', 'HARRYH', 'MR DOUGLAS', None, 'PAUL', 'K. T.', 'JOHN', 'NORMAN', 'EVAND', 'BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MADYARNI', 'ES ON SCH B', 'EUGENEJR', 'NEAL', 'FERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0 TECH', 'RODNEY', 'DAVID', 'G. L.', 'JAMES', 'LESLIE', None, 'GEORGE', 'G.B.M', 'DAVID JON', 'CHUNG YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', None, 'JUDE N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD HON-AN', 'GLEN A.L.', 'J.B. Jr.', 'LORENZOA', 'J J', 'RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY', 'JOSEPH', 'RUSSELL', 'THOMAS', 'H.E.CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', 'young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']

    outlier2 = ['SHARMA #0', "0'CONNOR", 'RUSHTON    UEL', 'UDDIN   Z', 'HINKLEY 1', 'O&#039;CONNOR, P.E.', '.OOK', 'SAMUELS111', 'O&#039;CONNOR', 'CALIENDO', 'SMITH   JR.', 'LO  BUE', '7AN', '+-+ETTIERI', 'SMITH, 111', 'KAMEN   1', '.EE', 'MASS, 1', '.EI', 'Zagaroli 3rd', 'RINI   II', 'KAMEN   R', 'RYAN 11', 'SPI8EZIA L S', 'MUFTIC..A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL, P.E.', 'HAMA07', 'HINLEY,1', '1212', "O  ' CONNELL", 'HURT,JR.,', 'WESOLOWSKI', 'CHEN', '`ING, R.A', 'MARTARELLA 111', 'Gandhi, Ph.D., P.E.', '90I', 'ENNIS 2', 'COSTELLO R A A I A', '3UI', 'N/A', 'HURT,  JR', 'LEHR,1', 'KOHLER, 111', 'GERAZOUNIS', 'Alexander,1', 'LUBOW, R.A. LEED AP', 'RINI,111', '08CZAK', '````````````````````', 'CHAO  R.A.', 'Geier 11', '08NGEL', '08SOLOWSKI', 'I11', 'HINKLEY, 1', 'RUDIKOFF, P.E.', "O'CONNOR", 'SHAH   EZ', 'MIELE, JR., P.E.', 'RITTENHOUSE 111', 'AMADI   ISIOFIA', 'HINKLEY,1', 'RENFORE````````', "O'HARA,JR.", '73020012', 'PHAGOO   I', 'BRAY.....,', 'LLL', 'BHATHIA,1', 'GANDHI, PH. D., P.E', 'KO K', 'VASSALOTTI 11', 'HURT, JR .', '0018LKLE', 'RINI -111', 'PARIHAR', 'EE', 'L00802', 'ELISE.111', 'KING , R.A', 'CHRYSLER  P E', 'LEHR 1', 'Walters   Jr.', 'LEE', 'RINI  III', 'D&#039;ANGELO', '0UDOLPH III', 'VIEHE-NAESS 111', ',MO', '08E', '47DIKOFF', 'Yu,', '420865380', 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', '901BEN', '4153LOO', 'SYED-NAQVI', 'RYAN , JR.', 'K O K O R I S', 'ELISEO111', 'O&#039;CONNELL', 'ZEID61', '---Lewis', '00CHELI', 'MOHAMMAD       +++++', 'METZLER  P E', 'BAILEY', 'GANDHI, PH. D., P.E.', 'TIEMANN.111', 'SMITH.111', 'DI GER0NIMO', 'GANDHI, PH,D., P.E', 'III', 'J C', 'MAGAMI-QAIM-MAGAMI', '+M', 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', 'Y10007OR', 'SMITH,111', 'KING R A FAIA', 'RYAN III, AIA', '08AN', 'STARK 1', 'MASS', 'VICTORI0, R.A', 'RIZVI   A', '21029677', "3'CONNOR", 'Wong /  Lai', 'KAPLAN 3', 'GRAICHEN.JR./DAWN/DI', 'GROSSMAN ,PE,F.A.C.I']
    mapping2 = ['SHARMA ', "CONNOR", 'RUSHTON UEL', 'UDDIN Z', 'HINKLEY ', 'CONNOR P.E.', None, 'SAMUELS', 'CONNOR', 'CALIENDO', 'SMITH JR.', 'LO BUE', None, 'ETTIERI', 'SMITH', 'KAMEN', '.EE', 'MASS', '.EI', 'Zagaroli', 'RINI', 'KAMEN R', 'RYAN', 'SPIEZIA L S', 'MUFTIC.A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL P.E.', 'HAMA', 'HINLEY', None, "CONNELL", 'HURT JR.', 'WESOLOWSKI', 'CHEN', 'ING R.A', 'MARTARELLA', 'Gandhi', None, 'ENNIS ', 'COSTELLO R A A I A', None, None, 'HUR  JR', 'LEHR', 'KOHLER 111', 'GERAZOUNIS', 'Alexander', 'LUBOW R.A. LEED AP', 'RINI',None, None, 'CHAO R.A.', 'Geier', None, 'SOLOWSKI', None, 'HINKLEY', 'RUDIKOFF, P.E.', "CONNOR", 'SHAH EZ', 'MIELE JR. P.E.', 'RITTENHOUSE', 'AMADI   ISIOFIA', 'HINKLEY', 'RENFORE', "O'HARA,JR.", None, 'PHAGOO I', 'BRAY,', 'LLL', 'BHATHIA', 'GANDHI', 'KO K', 'VASSALOTTI', 'HURT JR.',None, 'RINI', 'PARIHAR', 'EE', None, 'ELISE', 'KING R.A', 'CHRYSLER  P E', 'LEHR', 'Walters Jr.', 'LEE', 'RINI  III', 'ANGELO', '0UDOLPH III', 'VIEHE-NAESS', 'MO', '08E', None, 'Yu,', None, 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', None, None, None, 'RYAN JR.', 'KOKORIS', 'ELISE', 'CONNELL', None, 'Lewis', 'CHELI', 'MOHAMMAD', 'METZLER  P E', 'BAILEY', 'GANDHI', 'TIEMANN', 'SMITH', 'DI GER0NIMO', 'GANDHI', 'III', 'J C', 'MAGAMI QAIM MAGAMI', None, 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', None, 'SMITH', 'KING R A FAIA', 'RYAN III AIA', None, 'STARK', 'MASS', 'VICTORI0 R.A', 'RIZVIA', None, "CONNOR", 'Wong Lai', 'KAPLAN', 'GRAICHEN.JR. DAWN DI', 'GROSSMAN']

    outlier3 = ['', '....DEMO', '050069', 'DEM. CONTR.,', 'XXXXX', 'G/C 10114H9', 'CGWC10114H99', '00', 'X S000155', '082-36-1245', 'G.G', 'LESSEE', '......GC', "'", '..OWNER', 'GC 2293', '--', 'XXXXXX', 'LS 31,721', '...GC', 'gen.cont.', 'G.C TK#4592', 'PE', 'RLA - 818', '.....OWNER', 'RLA 16077', 'G C', 'X 4129892', 'G. C.', 'R.L.A', 'GC 1028350', 'WC10114H99', 'LEESEE', 'GEN.CONT.', 'SIGN..HANGER', 'DEMO 20451', 'D8615', '.X', 'P.L.L.C', '..DEMO', 'G .C', 'L A', 'G.C NY11101', '32820', '....OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC99792', 'X 1341946', 'TRACK# 1390', 'EXPED.R4466', 'PLLC 9599691', 'G.C 1110101', '029649', '(CHECK)', 'DEM. CONTR,', 'EXPEDIT(H66172)', '.........GC', 'CITY OF N Y', 'GC 1170386', 'G. C', 'CO0OWNER', '(CHECKED)', 'C.C', '23392 1159774', 'DEMO {', 'RA', 'T. 31132', '....GC', 'RLA-787', 'TRACK #1390', 'D C', 'G.CONTR.', 'DEMO  CONT', '1GC', 'CC', 'demo G.C.', 'TRACK. #1390', 'M.F.S.P.C.', '...DEMO', 'DEMO G C', '13328', 'GEN  CONT', 'GC 1221073', "GC;'", 'DEMO 1341946', '11234', 'G.C.,', '.....GC', 'LIC.133668259 1', '?', '0WNER', 'C10892', 'GEN..CONT']
    mapping3 = [None, 'DEMO', None, 'DEM. CONTR', None, 'G/C', 'CGWC', None, 'X S', None, 'G.G', 'LESSEE', 'GC', None, 'OWNER', 'GC', None, None, 'LS ', 'GC', 'gen.cont.', 'G.C TK', 'PE', 'RLA ', 'OWNER', 'RLA ', 'G C', 'X', 'G. C.', 'R.L.A', 'GC', 'WC', 'LEESEE', 'GEN.CONT.', 'SIGN.HANGER', 'DEMO', None,None, 'P.L.L.C', 'DEMO', 'G.C', 'L A', 'G.C ', None, 'OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC', None, 'TRACK', 'EXPED.R', 'PLLC ', 'G.C', None, None, 'DEM. CONTR,', 'EXPEDIT', 'GC', None, 'GC', 'G.C', 'CO0OWNER', None, 'C.C', None, 'DEMO', 'RA', None, 'GC', 'RLA', None, 'D C', 'G.CONTR.', 'DEMO  CONT', 'GC', 'CC', 'demo G.C.', None, 'M.F.S.P.C.', 'DEMO', 'DEMO G C', None, 'GEN  CONT', 'GC ', "GC ", 'DEMO ', None, 'G.C.', 'GC', 'LIC', None, '0WNER',None, 'GEN.CONT']

    outlier4 = ['', '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '99998', '000N/A', '65569+', '01827O', 'R9526', 'LP0256', 'N/A', '1964', 'ISLAND', '1609', '000PW1', '00DEMO', '0688.6', '00000', '.20929', 'LP0258', '000TOR', '0D8615', '0SWITA', '818', 'O02200', 'DEMO', '196', '1075', '0000NT', '215', '0', '00000`', "D'ALTO", '0455', '22377', 'DD8615', '050579', '226', 'SWITA', 'DD6815', 'X02689']
    mapping4 = [None, '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '099998', '000000', '065569', '01827O', '0R9526', 'LP0256',None, '001964',None, '001609', '000PW1', '00DEMO', '006886', '000000', '020929', 'LP0258', '000TOR', '0D8615', '0SWITA', '000818', 'O02200', None, '000196', '001075', '0000NT', '000215', '000000', '000000', None, '000455', '022377', 'DD8615', '050579', '000226', None, 'DD6815', 'X02689']

    outliers = outlier1+ outlier2+ outlier3+ outlier4
    mappings = mapping1+ mapping2+ mapping3+ mapping4
       
    for col in data_cols:
        
        df[col] = df[col].replace(outliers, mappings)
        
        '''
        if file != 'rbx6-tga4.tsv.gz':
            light_outliers = findDateOutliers(df, ds, col)

            for item in light_outliers:

                df[col] = df[col].replace(item, None)
        
        
        if file != 'hg8x-zxpr.tsv.gz' and col in name_cols:
            col_clusters = getClusters(df, ds, col)
            updateUsingClusters(df, ds, col, col_clusters, True)
         '''
            
        df[col] = df[col].replace(['N/A', '', 'NA','NONE'], [None,None,None,None])
        
    return data_cols    

In [22]:
def saveDf(df, datafile):
    outputpath = datafile[2:11]+'_cleaned_data_improved.csv'
    df.to_csv(outputpath,sep=',',index=False,header=True) 

In [23]:
def dataCleanOnDataset(file):
    
    print('working on file: ', file)
    datafile, df = readData(file)
    
    cleaned_cols = []
    
    print("fixing Column Names.......")
    col_rename_dict = fixColumnNames(df)
    #print("Column renaming dictionary:")
    #print(col_rename_dict)
    
    df = df.rename(columns=col_rename_dict)

    print('fixing ID Number Columns......')
    cleaned_cols += fix_ID_Number_Column(df)
    
    print('fixing Binary Columns......')
    cleaned_cols += fix_Binary_Column(df)
    
    print('fixing Monetary Columns......')
    cleaned_cols += fix_Monetary_Column(df)
    
    print('fixing Numerical Columns......')
    cleaned_cols += fix_Numerical_Column(df)
    
    print('fixing Datetime Columns......')
    cleaned_cols += fix_datetime_Column(df)
    
    print('fixing City And Name Column......')
    cleaned_cols += fix_city_and_name_Column(df, file)
    
    saveDf(df, datafile)
    
    return cleaned_cols, datafile, df, col_rename_dict

In [24]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

In [25]:
def precision_recall(cleaned_columns, datafile, df, sample_size, col_rename_dict):
    df2  = pd.read_csv(datafile, dtype='object', sep='\t')
    df2 = df2.drop_duplicates()
    df2 = df2.rename(columns=col_rename_dict)
    

    df_sample_data =  df2.sample(sample_size).copy()
    df_new_sample_data = df.loc[df_sample_data.index].copy()
    print('sample size: ',sample_size)
    
    print('total size: ',sample_size * len(cleaned_columns))
    print('======================\n\n')

    same = 0

    for col in cleaned_columns:
        print("column: ", col)
        print("Original,\t Cleaned\n")
        for i in range(sample_size):
            df_sample_data.iloc[i].index
            if df_new_sample_data[col].iloc[i] == df_sample_data[col].iloc[i]:
                print(df_sample_data[col].iloc[i], '\t',df_new_sample_data[col].iloc[i], '\t')
                same += 1
            else:
                print(df_sample_data[col].iloc[i], '\t', df_new_sample_data[col].iloc[i], '\t*')

        print('*   ', same, ' same records   *\n')
        same = 0

        print('======================\n\n')

# DOB Permit Issuance

In [26]:
cleaned_cols, datafile, df, col_rename_dict = dataCleanOnDataset(file_list[0])
cleaned_cols = list(set(cleaned_cols))

print(cleaned_cols)

working on file:  ipu4-2q9a.tsv.gz
fixing Column Names.......
fixing ID Number Columns......
fixing Binary Columns......
fixing Monetary Columns......
fixing Numerical Columns......
fixing Datetime Columns......


  mask |= arr == x


fixing City And Name Column......
['Job doc Number', "Owner's Phone Number", "Owner's Business Name", 'Street Name', "Site Safety Mgr's First Name", 'Nta Name', 'BOROUGH', 'Permit Si Number', 'Job Start Date', 'Superintendent Business Name', 'Issuance Date', "Permittee's Last Name", 'Permit Sequence Number', "Owner's Last Name", "Owner's House Street Name", "Permittee's First Name", 'Superintendent First & Last Name', "Permittee's Phone Number", 'Job Number', 'Filing Date', 'Numbern- Profit', "Owner's House City", 'Expiration Date', 'Site Safety Mgr Business Name', 'D O B Run Date', "Owner's House Number", "Site Safety Mgr's Last Name", "Permittee's License Number", 'House Number', 'Bin Number', "Permittee's Business Name", "Owner's First Name"]


In [27]:
precision_recall(cleaned_cols, datafile, df, 50, col_rename_dict)

sample size:  50
total size:  1600


column:  Job doc Number
Original,	 Cleaned

01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
02 	 02 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
02 	 02 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
01 	 01 	
*    50  same records   *



column:  Owner's Phone Number
Original,	 Cleaned

2123628495 	 2123628495 	
9176421442 	 9176421442 	
7187379500 	 7187379500 	
2127042000 	 2127042000 	
6466901823 	 6466901823 	
7184349400 	 7184349400 	
2126397850 	 2126397850 	
2125866400 	 2125866400 	
nan 	  	*
9174402399 	 9174402399 	
3479947347 	 3479947347 	
7183462929 	 7183462929 	
2124072400 	 2124072400 	
9177337097 	 91773

2017-09-29 	 2017-09-29 00:00:00 	*
2006-12-07 	 2006-12-07 00:00:00 	*
2006-01-04 	 2006-01-04 00:00:00 	*
01/31/2019 	 2019-01-31 00:00:00 	*
2009-07-01 	 2009-07-01 00:00:00 	*
2015-11-30 	 2015-11-30 00:00:00 	*
2011-04-20 	 2011-04-20 00:00:00 	*
2009-05-11 	 2009-05-11 00:00:00 	*
2007-07-17 	 2007-07-17 00:00:00 	*
2010-10-28 	 2010-10-28 00:00:00 	*
2008-06-16 	 2008-06-16 00:00:00 	*
2007-02-08 	 2007-02-08 00:00:00 	*
2004-05-10 	 2004-05-10 00:00:00 	*
*    0  same records   *



column:  Superintendent Business Name
Original,	 Cleaned

nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
GENERAL PLUMBING CORP 	 GENERAL PLUMBING CORP 	
nan 	 nan 	*
CROOM PLUMBING & HEATING CORP 	 CROOM PLUMBING & HEATING CORP 	
nan 	 nan 	*
AKRON CONTRACTORS INC. 	 AKRON CONTRACTORS INC. 	
SO-HO CUSTOM INTERIORS 	 SO-HO CUSTOM INTERIORS 	
nan 	 nan 	*
nan 	 nan 	*
M.R.L.S. CONSTR. 	 M.R.L.S. CONSTR. 	
BOVIS LEND LEASE LMB 	 BOVIS LEND LEASE LMB 	
nan 	 nan 	*
INTERIOR CONSTRUCTION CORP. 	 INTERIOR CONSTRU

*    50  same records   *



column:  Superintendent First & Last Name
Original,	 Cleaned

nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
FRED BRENNER 	 FRED BRENNER 	
nan 	 nan 	*
LATHAN CROOM 	 LATHAN CROOM 	
nan 	 nan 	*
NICHOLAS KIOUZELLIS 	 NICHOLAS KIOUZELLIS 	
SANDY FRIEDMAN 	 SANDY FRIEDMAN 	
nan 	 nan 	*
nan 	 nan 	*
MORRIS REME 	 MORRIS REME 	
JIM MIRABILE 	 JIM MIRABILE 	
nan 	 nan 	*
ANNA BRUZZESE 	 ANNA BRUZZESE 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
ROY WILDENBERGER 	 ROY WILDENBERGER 	
JOHN WIDING 	 JOHN WIDING 	
ROBERT MCMANUS 	 ROBERT MCMANUS 	
CARLOS SILBERMAN 	 CARLOS SILBERMAN 	
nan 	 nan 	*
MATHEW WEISS 	 MATHEW WEISS 	
nan 	 nan 	*
nan 	 nan 	*
ALVIN WITTLIN 	 ALVIN WITTLIN 	
nan 	 nan 	*
JOSE BATISTA 	 JOSE BATISTA 	
PETER WARD 	 PETER WARD 	
WILLIAM HARVEY 	 WILLIAM HARVEY 	
nan 	 nan 	*
nan 	 nan 	*
DONALD ADLER 	 DONALD ADLER 	
NIKKI MOSKOVER 	 NIKKI MOSKOVER 	
nan 	 nan 	*
JOHN JUHN 	 JOHN JUHN 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 

2017-11-03  	 2019-09-13 00:00:00 	*
2017-11-03  	 2020-01-23 00:00:00 	*
2017-11-03  	 2019-08-30 00:00:00 	*
2017-11-03  	 2019-10-02 00:00:00 	*
2017-11-03  	 2019-09-05 00:00:00 	*
2017-11-03  	 2019-08-29 00:00:00 	*
12/22/2020 00:00:00 	 2020-12-22 00:00:00 	*
2017-11-03  	 2020-01-23 00:00:00 	*
2017-11-03  	 2019-10-04 00:00:00 	*
2017-11-03  	 2019-10-03 00:00:00 	*
2017-11-03  	 2019-09-11 00:00:00 	*
2017-11-03  	 2019-09-06 00:00:00 	*
2017-11-03  	 2020-01-29 00:00:00 	*
2017-11-03  	 2019-08-27 00:00:00 	*
2017-11-03  	 2019-10-12 00:00:00 	*
2017-11-03  	 2019-09-17 00:00:00 	*
2017-11-03  	 2019-08-29 00:00:00 	*
2017-11-03  	 2019-09-29 00:00:00 	*
2017-11-03  	 2019-10-08 00:00:00 	*
03/25/2021 00:00:00 	 2021-03-25 00:00:00 	*
2017-11-03  	 2019-09-26 00:00:00 	*
2017-11-03  	 2019-09-11 00:00:00 	*
2017-11-03  	 2019-10-04 00:00:00 	*
07/30/2020 00:00:00 	 2020-07-30 00:00:00 	*
2017-11-03  	 2019-05-17 00:00:00 	*
2017-11-03  	 2020-01-31 00:00:00 	*
2017-11-03  	 

In [None]:
tp = 100
fp = 7

fn = 0
tn = 500 - tp - fp - fn

In [None]:
precision(tp, fp)

In [None]:
recall(tp,fn)

# DOB Cellular Antenna Filings

In [29]:
cleaned_cols, datafile, df, col_rename_dict = dataCleanOnDataset(file_list[1])
cleaned_cols = list(set(cleaned_cols))

print(cleaned_cols)

working on file:  iz2q-9x8d.tsv.gz
fixing Column Names.......
fixing ID Number Columns......
fixing Binary Columns......
fixing Monetary Columns......
fixing Numerical Columns......
fixing Datetime Columns......


  mask |= arr == x


fixing City And Name Column......
['Pre- Filing Date', "Owner's Business Name", 'Initial Cost', 'Street Name', "Owner's  House Number", 'Applicant License Number', 'Doc Number', 'Total Est Fee', 'First Permit  Date', 'Latest Action Date', "Owner's Last Name", "Applicant's Last Name", 'City', "Owner's  Phone Number", 'Job Number', "Applicant's First Name", 'Numbern- Profit', 'Fee Status', 'D O B Run Date', 'Borough', 'House Number', 'Bin Number', "Owner's First Name"]


In [31]:
precision_recall(cleaned_cols, datafile, df, 50, col_rename_dict)

sample size:  50
total size:  1150


column:  Pre- Filing Date
Original,	 Cleaned

12/29/2016 12:00:00 AM 	 2016-12-29 00:00:00 	*
08/06/2015 12:00:00 AM 	 2015-08-06 00:00:00 	*
03/02/2016 12:00:00 AM 	 2016-03-02 00:00:00 	*
12/08/2014 12:00:00 AM 	 2014-12-08 00:00:00 	*
07/07/2014 12:00:00 AM 	 2014-07-07 00:00:00 	*
08/04/2015 12:00:00 AM 	 2015-08-04 00:00:00 	*
10/09/2014 12:00:00 AM 	 2014-10-09 00:00:00 	*
03/10/2014 12:00:00 AM 	 2014-03-10 00:00:00 	*
08/04/2016 12:00:00 AM 	 2016-08-04 00:00:00 	*
10/16/2015 12:00:00 AM 	 2015-10-16 00:00:00 	*
02/24/2016 12:00:00 AM 	 2016-02-24 00:00:00 	*
12/11/2014 12:00:00 AM 	 2014-12-11 00:00:00 	*
08/17/2015 12:00:00 AM 	 2015-08-17 00:00:00 	*
09/22/2015 12:00:00 AM 	 2015-09-22 00:00:00 	*
08/06/2015 12:00:00 AM 	 2015-08-06 00:00:00 	*
09/05/2013 12:00:00 AM 	 2013-09-05 00:00:00 	*
07/10/2015 12:00:00 AM 	 2015-07-10 00:00:00 	*
nan 	 2016-08-01 00:00:00 	*
04/23/2015 12:00:00 AM 	 2015-04-23 00:00:00 	*
07/28/2014 12:00:00 AM 	

nan 	 2016-09-13 00:00:00 	*
07/09/2015 12:00:00 AM 	 2015-07-09 00:00:00 	*
08/05/2013 12:00:00 AM 	 2013-08-05 00:00:00 	*
03/24/2017 12:00:00 AM 	 2017-03-24 00:00:00 	*
09/19/2016 12:00:00 AM 	 2016-09-19 00:00:00 	*
02/04/2016 12:00:00 AM 	 2016-02-04 00:00:00 	*
11/19/2015 12:00:00 AM 	 2015-11-19 00:00:00 	*
07/24/2013 12:00:00 AM 	 2013-07-24 00:00:00 	*
06/13/2014 12:00:00 AM 	 2014-06-13 00:00:00 	*
10/11/2017 12:00:00 AM 	 2017-10-11 00:00:00 	*
08/04/2016 12:00:00 AM 	 2016-08-04 00:00:00 	*
08/24/2016 12:00:00 AM 	 2016-08-24 00:00:00 	*
07/29/2013 12:00:00 AM 	 2013-07-29 00:00:00 	*
*    0  same records   *



column:  Latest Action Date
Original,	 Cleaned

04/11/2017 00:00:00 	 2017-04-11 00:00:00 	*
04/07/2017 00:00:00 	 2017-04-07 00:00:00 	*
03/18/2016 00:00:00 	 2016-03-18 00:00:00 	*
08/25/2017 00:00:00 	 2017-08-25 00:00:00 	*
08/27/2014 00:00:00 	 2014-08-27 00:00:00 	*
03/24/2017 00:00:00 	 2017-03-24 00:00:00 	*
03/02/2015 00:00:00 	 2015-03-02 00:00:00 	*
10/2

STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
nan 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
nan 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
STANDARD 	 nan 	*
*    0  same records   *



column:  D O B Run Date
Original,	 Cleaned

04/12/2017 00:00:00 	 2017-04-12 00:00:00 	*
04/08/2017 00:00:00 	 2017-04-08 00:00:00 	*
03/19/2016 00:00:00 	 2016-03-19 00:00:00 	*
08/26/2017 00:00:00 	 2017-08-26 00:00:00 	*
08/28/2014 00:00:00 	 2014-08-28 00:00:00 	*
03/25/2017 00:00:00 	 2017-03-25 00:00:00 	*
03/03/2015 00:00:00 	 2015-03-03 00:00:00 	*
10/21/2015 00:00:00 	 2015-10-21 00:00:00 	*
06/22/2017 00:00:00 

In [32]:
tp = 100
fp = 0

fn = 13
tn = 1350 - tp - fp - fn

In [33]:
precision(tp, fp)

1.0

In [34]:
recall(tp,fn)

0.8849557522123894

# Open Restaurant Applications

In [42]:
cleaned_cols, datafile, df, col_rename_dict = dataCleanOnDataset(file_list[2])
cleaned_cols = list(set(cleaned_cols))

print(cleaned_cols)

working on file:  pitm-atqc.tsv.gz
fixing Column Names.......
fixing ID Number Columns......
fixing Binary Columns......
fixing Monetary Columns......
fixing Numerical Columns......
fixing Datetime Columns......
fixing City And Name Column......
['Sidewalk Dimensions (Length)', 'Roadway Dimensions (Length)', 'SLA Serial Number', 'Restaurant Name', 'Legal Business Name', 'Borough', 'Food Service Establishment Permit Number', 'Building Number']


In [43]:
precision_recall(cleaned_cols, datafile, df, 50, col_rename_dict)

sample size:  50
total size:  400


column:  Sidewalk Dimensions (Length)
Original,	 Cleaned

nan 	 0 	*
20 	 20 	*
15 	 15 	*
90 	 90 	*
14 	 14 	*
24 	 24 	*
20 	 20 	*
20 	 20 	*
245 	 245 	*
60 	 60 	*
15 	 15 	*
nan 	 0 	*
10 	 10 	*
11 	 11 	*
15 	 15 	*
nan 	 0 	*
14 	 14 	*
19 	 19 	*
34 	 34 	*
26 	 26 	*
28 	 28 	*
10 	 10 	*
18 	 18 	*
20 	 20 	*
14 	 14 	*
20 	 20 	*
24 	 24 	*
80 	 80 	*
nan 	 0 	*
nan 	 0 	*
15 	 15 	*
nan 	 0 	*
36 	 36 	*
25 	 25 	*
15 	 15 	*
34 	 34 	*
22 	 22 	*
37 	 37 	*
32 	 32 	*
nan 	 0 	*
25 	 25 	*
64 	 64 	*
20 	 20 	*
23 	 23 	*
54 	 54 	*
45 	 45 	*
8 	 8 	*
22 	 22 	*
12 	 12 	*
17 	 17 	*
*    0  same records   *



column:  Roadway Dimensions (Length)
Original,	 Cleaned

42 	 42 	*
20 	 20 	*
19 	 19 	*
18 	 18 	*
8 	 8 	*
nan 	 0 	*
nan 	 0 	*
nan 	 0 	*
nan 	 0 	*
40 	 40 	*
nan 	 0 	*
14 	 14 	*
20 	 20 	*
11 	 11 	*
nan 	 0 	*
19 	 19 	*
nan 	 0 	*
18 	 18 	*
34 	 34 	*
nan 	 0 	*
nan 	 0 	*
25 	 25 	*
18 	 18 	*
29 	 29 	*
14 	 14 	

In [44]:
tp = 442
fp = 128

fn = 39
tn = 1900 - tp - fp - fn

In [45]:
precision(tp, fp)

0.775438596491228

In [46]:
recall(tp,fn)

0.918918918918919

# DOB NOW: Safety – Facades Compliance Filings

In [47]:
cleaned_cols, datafile, df, col_rename_dict = dataCleanOnDataset(file_list[3])
cleaned_cols = list(set(cleaned_cols))

print(cleaned_cols)

working on file:  xubg-57si.tsv.gz
fixing Column Names.......
fixing ID Number Columns......
fixing Binary Columns......
fixing Monetary Columns......
fixing Numerical Columns......
fixing Datetime Columns......
fixing City And Name Column......
['Qewi City', 'Field Inspection Completed Date', 'Street Name', 'Sequence Number', 'BOROUGH', 'Owner Bus Name', 'Qewi Name', 'Prior Cycle Filing Date', 'Qewi Bus Name', 'Owner Name', 'Filing Date', 'Qewi Signed Date', 'Owner City', 'Qewi Nys Lic Number', 'Control Number', 'Owner Bus Street Name', 'Qewi Bus Street Name', 'Tr6 Number', 'House Number']


In [48]:
precision_recall(cleaned_cols, datafile, df, 50, col_rename_dict)

sample size:  50
total size:  950


column:  Qewi City
Original,	 Cleaned

NEW YORK 	 NEW YORK 	
nan 	 nan 	*
NANUET 	 NANUET 	
nan 	 nan 	*
ENGLEWOOD CLIFFS 	 ENGLEWOOD CLIFFS 	
nan 	 nan 	*
FAIRFAX 	 FAIRFAX 	
YONKERS 	 YONKERS 	
NEW YORK 	 NEW YORK 	
nan 	 nan 	*
BROOKLYN 	 BROOKLYN 	
nan 	 nan 	*
NEW YORK 	 NEW YORK 	
NEW YORK 	 NEW YORK 	
ENGLEWOOD CLIFF 	 ENGLEWOOD CLIFF 	
nan 	 nan 	*
nan 	 nan 	*
NEW YORK 	 NEW YORK 	
NEW YORK 	 NEW YORK 	
nan 	 nan 	*
NEW ROCHELLE 	 NEW ROCHELLE 	
NEW YORK 	 NEW YORK 	
nan 	 nan 	*
NEW YORK 	 NEW YORK 	
nan 	 nan 	*
LIC 	 LIC 	
NY 	 NY 	
NEW YORK 	 NEW YORK 	
nan 	 nan 	*
NEW YORK 	 NEW YORK 	
NEW YORK 	 NEW YORK 	
BEDFORD 	 BEDFORD 	
NEW YORK 	 NEW YORK 	
NEW YORK 	 NEW YORK 	
BAYSIDE 	 BAYSIDE 	
DOBBS FERRY 	 DOBBS FERRY 	
NEW YORK 	 NEW YORK 	
WESTFIELD 	 WESTFIELD 	
OYSTER BAY 	 OYSTER BAY 	
nan 	 nan 	*
L.I.C 	 L.I.C 	
DOBBS FERRY 	 DOBBS FERRY 	
NEW YORK 	 NEW YORK 	
BAYSIDE 	 BAYSIDE 	
nan 	 nan 	*
NEW YORK 	 NEW YORK 	
DOBBS FERRY 	 DO

LESLIE  SEVERINO 	 LESLIE  SEVERINO 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
JAY  YABLONSKY 	 JAY  YABLONSKY 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
ROBERT  APFEL 	 ROBERT  APFEL 	
nan 	 nan 	*
FRANK  LANG 	 FRANK  LANG 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
TANYA  FRIEDMAN 	 TANYA  FRIEDMAN 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
RICHARD  MORRISON 	 RICHARD  MORRISON 	
*    11  same records   *



column:  Filing Date
Original,	 Cleaned

04/21/2021 12:00:00 AM 	 2021-04-21 00:00:00 	*
nan 	 2015-03-11 00:00:00 	*
09/12/2007 12:00:00 AM 	 2007-09-12 00:00:00 	*
nan 	 2012-08-10 00:00:00 	*
09/17/2021 12:00:00 AM 	 2021-09-17 00:00:00 	*
nan 	 2015-11-23 00:00:00 	*
03/20/2013 12:00:00 AM 	 2013-03-20 00:00:00 	*
08/10/2012 12:00:00 AM 	 2012-08-10 00:00:00 	*
02/21/2007 12:00:00 AM 	 2007-02-21 00:00:00 	*
nan 	 2015-07-22 00:00:00 	*
07/06/2009 12:00:00 AM 	 2009-07-06 00:00:00 

Original,	 Cleaned

TR6-905258-9A-I1 	 TR6-905258-9A-I1 	
TR6-805384-8B-N1 	 TR6-805384-8B-N1 	
TR6-606926-NA-I1 	 TR6-606926-NA-I1 	
TR6-705817-7B-N1 	 TR6-705817-7B-N1 	
TR6-905097-9A-I1 	 TR6-905097-9A-I1 	
TR6-807192-8C-N1 	 TR6-807192-8C-N1 	
TR6-705338-7A-S1 	 TR6-705338-7A-S1 	
TR6-703098-7B-I1 	 TR6-703098-7B-I1 	
TR6-601574-NA-I1 	 TR6-601574-NA-I1 	
TR6-812663-8B-N1 	 TR6-812663-8B-N1 	
TR6-603208-NA-S1 	 TR6-603208-NA-S1 	
TR6-814380-8A-N1 	 TR6-814380-8A-N1 	
TR6-707341-7A-I1 	 TR6-707341-7A-I1 	
TR6-800258-8C-S1 	 TR6-800258-8C-S1 	
TR6-712938-7B-I1 	 TR6-712938-7B-I1 	
TR6-607179-NA-N1 	 TR6-607179-NA-N1 	
TR6-800985-8B-N1 	 TR6-800985-8B-N1 	
TR6-805446-8A-I4 	 TR6-805446-8A-I4 	
TR6-701813-7B-I1 	 TR6-701813-7B-I1 	
TR6-813819-8C-N1 	 TR6-813819-8C-N1 	
TR6-607548-NA-I1 	 TR6-607548-NA-I1 	
TR6-705863-7A-I1 	 TR6-705863-7A-I1 	
TR6-814523-8A-I1 	 TR6-814523-8A-I1 	
TR6-812728-8C-I1 	 TR6-812728-8C-I1 	
TR6-810621-8B-N1 	 TR6-810621-8B-N1 	
TR6-710917-7A-S1 	 TR6-710917-

In [49]:
tp = 273
fp = 50

fn = 86
tn = 1150 - tp - fp - fn

In [50]:
precision(tp, fp)

0.8452012383900929

In [51]:
recall(tp,fn)

0.7604456824512534