# General version of data cleaning

This code will automatically perform data cleaning on each of the dataset in the directoty, to run this notebook you need to download all the datasets and place them in the current directoty with the notebook.

Because we automatically find columns and fix them, we can not afford clustering to find outliers for each datasets so remove it in this notebook, the precision and recall rate might be influenced.

Run the code in oreder to perform data clean, cleaned data will be save in the current directoty, and to calculate the precision and recall rate, you will need to manually inspect the result.

In [48]:
import openclean
import glob
import pandas as pd
import numpy as np


In [49]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

In [50]:
import pandas as pd
from openclean.pipeline import stream

In [92]:
names = ['w9ak-ipjd', "dm9a-ab7w", 'bty7-2jhb']

for name in names:
    dataset = Socrata().dataset(name)
    datafile = './' + name + '.tsv.gz'

    if not os.path.isfile(datafile):
        with gzip.open(datafile, 'wb') as f:
            print('Downloading ...\n')
            dataset.write(f)


    fsize = humanfriendly.format_size(os.stat(datafile).st_size)
    print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Downloading ...

Using 'DOB NOW: Build – Job Application Filings' in file ./w9ak-ipjd.tsv.gz of size 27.01 MB
Downloading ...

Using 'DOB NOW: Electrical Permit Applications' in file ./dm9a-ab7w.tsv.gz of size 41.03 MB
Downloading ...

Using 'Historical DOB Permit Issuance' in file ./bty7-2jhb.tsv.gz of size 321.34 MB


In [95]:
file_list = glob.glob("*.tsv.gz")

In [96]:
print(file_list)

['bty7-2jhb.tsv.gz', 'dm9a-ab7w.tsv.gz', 'hcir-3275.tsv.gz', 'hg8x-zxpr.tsv.gz', 'rbx6-tga4.tsv.gz', 'w9ak-ipjd.tsv.gz']


In [52]:
def readData(file):
    
    datafile = './'+file
    
    df  = pd.read_csv(datafile, dtype='object', sep='\t')
    ds = stream(datafile)
    
    return datafile, df, ds

In [53]:
def findColumns(ds, column_name_list):
    data_cols = []

    for col in ds.columns:
        for name in column_name_list:
            if name in col:
                data_cols.append(col)
                
    return  data_cols        

In [54]:
def fix_ID_Number_Column(df, ds):
    data_cols = findColumns(ds, ['ID','id','#','Number', 'number',' No',' NO'])
    
    for col in data_cols:
        
        df[col].fillna('', inplace=True)
        df[col] = df[col].astype('str')
        df[col] = df[col].str.upper()

        df.loc[df[col].str.strip('')=='ONE', col] = '1'
        df.loc[df[col].str.strip('')=='TWO', col] = '2'
        df.loc[df[col].str.strip('')=='THREE', col] = '3'
        df.loc[df[col].str.strip('')=='FOUR', col] = '4'
        df.loc[df[col].str.strip('')=='FIVE', col] = '5'
        df.loc[df[col].str.strip('')=='SIX', col] = '6'
        df.loc[df[col].str.strip('')=='SEVEN', col] = '7'
        df.loc[df[col].str.strip('')=='EIGHT', col] = '8'
        df.loc[df[col].str.strip('')=='NINE', col] = '9'

        df.loc[df[col].str.strip('')=='NAN', col] = ''
        df.loc[df[col].str.strip('')=='nan', col] = ''
        
        df.loc[df[col].str.strip('')=='NO NUMBER', col] = ''
        
    return data_cols

In [55]:
def fix_Binary_Column(df, ds):
    data_cols = findColumns(ds, ['Landmarked','Owned', 'Filled'])
    
    for col in data_cols:
        
        df[col].fillna('N', inplace=True)
        df.loc[df[col]=='X', col] = 'Y'
        
        df.loc[df[col]=='Y', col] = True
        df.loc[df[col]!=True, col] = False
        
        df[col] = df[col].astype('bool')
        
    return data_cols

In [56]:
def fix_Monetary_Column(df, ds):
    data_cols = findColumns(ds, ['Cost','cost', 'fee', 'Fee'])
    
    for col in data_cols:
        
        df[col] = df[col].str.replace("$", '', regex=False)
        df[col] = df[col].str.replace("-", '', regex=False)
        
    return data_cols

In [57]:
def fix_Numerical_Column(df, ds):
    data_cols = findColumns(ds, ['Units','units', 'Height', 'height', 'Length', 'length', 'Footage', 'footage', 'Sqft', 'sqft'])
    
    for col in data_cols:
        
        df[col] = df[col].str.replace('-', '', regex=False)
        df[col] = df[col].str.replace('NONE', '0', regex=False)
        df[col] = df[col].str.replace('none', '0', regex=False)
        df[col] = df[col].str.replace('NAN', '0', regex=False)
        df[col] = df[col].str.replace('nan', '0', regex=False)
        
        
    return data_cols

In [58]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(df, ds, column_name, eps_setting = 0.05):
    datetime_data = df[column_name]

    light_outliers = DBSCANOutliers().find(datetime_data)
    
    return light_outliers

In [59]:
def fix_datetime_Column(df, ds):
    data_cols = findColumns(ds, ['date', 'Date', 'DATE'])
    
    for col in data_cols:
    
        
        light_outliers = findDateOutliers(df, ds, col)
        
        for item in light_outliers:
            
            df[col] = df[col].replace(item, None)
        
    return data_cols

In [60]:
from openclean.data.refdata import RefStore

refdata = RefStore()
city_df = refdata\
    .load('encyclopaedia_britannica:us_cities', auto_download=True)\
    .df()


In [61]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(df, ds, col, minsize = 2, preds = 0.5):
    dba = ds.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(preds)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    item_count = 0

def updateUsingClusters(df, ds, col, clusters, isPrint = False):
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i <5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
    
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [126]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex


def fix_city_and_name_Column(df, ds, file):
    data_cols = findColumns(ds, ['NAME', 'name', 'Name','city', 'City', 'CITY', 'BOROUGH', 'Borough', 'borough'])
    name_cols = findColumns(ds, ['NAME', 'name', 'Name'])
    
    # mapping list to replace outliers
    outlier1 = ['', 'MR. ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM 11', 'JOSEP;H``', 'DAID/11/2007', 'CHUNG   LUN', '718 9215010', 'ANTHONY', 'HSIA0-NAN', 'JOSEPH', '``````````', 'ROBERT  `', 'RAJENDRA9956700', '2', 'G.B.M.', 'EUGENE......JR', '6312100', 'CLAUDE,JR.', 'THOMAS``', 'ALAN  L', 'Nab53', 'MR. Y. B', 'J.J', 'PH8ILIP', 'I. M', 'RICHARD', 'ALBERTA S 111 D', 'P ;', 'GENECG.C. ENG &', 'J.J.', '2126202794', 'SHAW  HWA', 'HARRY         H', 'MR DOU8GLAS', '`1D', 'PAUL', 'K. T.', 'JOHN', '...NORMAN', 'EVAN   D', '7184361278BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MAD/Y/ARNI', 'ES ON SCH B', 'EUGENE.......JR', 'NEAL', 'F._ERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0-TECH', 'RODNEY   __', 'DAVID', 'G. L.', 'JAMES', 'LESLI8E', '7186054055', 'GEORGE', 'G.B.M', 'DAVID    JON', 'CHUNG---YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', '1P', 'JUDE.....N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD/HON-AN', 'GLEN  A.L.', 'J.B. Jr.', 'LORENZO..A', 'J J', '..RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY 2', '...JOSEPH', 'RUSSELL 111', 'THOMAS', 'H./E./CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', '--young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']
    mapping1 = [None, 'ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM', 'JOSEPH', None, 'CHUNG LUN', None, 'ANTHONY', 'HSIA0 NAN', 'JOSEPH', None, 'ROBERT', 'RAJENDRA', None, 'G.B.M.', 'EUGENEJR', None, 'CLAUDE JR.', 'THOMAS', 'ALAN  L', 'Nab', 'MR. Y. B', 'J.J', 'PHILIP', 'I. M', 'RICHARD', 'ALBERTA', None, 'GENECG.C. ENG', 'J.J.', None, 'SHAW HWA', 'HARRYH', 'MR DOUGLAS', None, 'PAUL', 'K. T.', 'JOHN', 'NORMAN', 'EVAND', 'BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MADYARNI', 'ES ON SCH B', 'EUGENEJR', 'NEAL', 'FERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0 TECH', 'RODNEY', 'DAVID', 'G. L.', 'JAMES', 'LESLIE', None, 'GEORGE', 'G.B.M', 'DAVID JON', 'CHUNG YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', None, 'JUDE N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD HON-AN', 'GLEN A.L.', 'J.B. Jr.', 'LORENZOA', 'J J', 'RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY', 'JOSEPH', 'RUSSELL', 'THOMAS', 'H.E.CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', 'young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']

    outlier2 = ['SHARMA #0', "0'CONNOR", 'RUSHTON    UEL', 'UDDIN   Z', 'HINKLEY 1', 'O&#039;CONNOR, P.E.', '.OOK', 'SAMUELS111', 'O&#039;CONNOR', 'CALIENDO', 'SMITH   JR.', 'LO  BUE', '7AN', '+-+ETTIERI', 'SMITH, 111', 'KAMEN   1', '.EE', 'MASS, 1', '.EI', 'Zagaroli 3rd', 'RINI   II', 'KAMEN   R', 'RYAN 11', 'SPI8EZIA L S', 'MUFTIC..A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL, P.E.', 'HAMA07', 'HINLEY,1', '1212', "O  ' CONNELL", 'HURT,JR.,', 'WESOLOWSKI', 'CHEN', '`ING, R.A', 'MARTARELLA 111', 'Gandhi, Ph.D., P.E.', '90I', 'ENNIS 2', 'COSTELLO R A A I A', '3UI', 'N/A', 'HURT,  JR', 'LEHR,1', 'KOHLER, 111', 'GERAZOUNIS', 'Alexander,1', 'LUBOW, R.A. LEED AP', 'RINI,111', '08CZAK', '````````````````````', 'CHAO  R.A.', 'Geier 11', '08NGEL', '08SOLOWSKI', 'I11', 'HINKLEY, 1', 'RUDIKOFF, P.E.', "O'CONNOR", 'SHAH   EZ', 'MIELE, JR., P.E.', 'RITTENHOUSE 111', 'AMADI   ISIOFIA', 'HINKLEY,1', 'RENFORE````````', "O'HARA,JR.", '73020012', 'PHAGOO   I', 'BRAY.....,', 'LLL', 'BHATHIA,1', 'GANDHI, PH. D., P.E', 'KO K', 'VASSALOTTI 11', 'HURT, JR .', '0018LKLE', 'RINI -111', 'PARIHAR', 'EE', 'L00802', 'ELISE.111', 'KING , R.A', 'CHRYSLER  P E', 'LEHR 1', 'Walters   Jr.', 'LEE', 'RINI  III', 'D&#039;ANGELO', '0UDOLPH III', 'VIEHE-NAESS 111', ',MO', '08E', '47DIKOFF', 'Yu,', '420865380', 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', '901BEN', '4153LOO', 'SYED-NAQVI', 'RYAN , JR.', 'K O K O R I S', 'ELISEO111', 'O&#039;CONNELL', 'ZEID61', '---Lewis', '00CHELI', 'MOHAMMAD       +++++', 'METZLER  P E', 'BAILEY', 'GANDHI, PH. D., P.E.', 'TIEMANN.111', 'SMITH.111', 'DI GER0NIMO', 'GANDHI, PH,D., P.E', 'III', 'J C', 'MAGAMI-QAIM-MAGAMI', '+M', 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', 'Y10007OR', 'SMITH,111', 'KING R A FAIA', 'RYAN III, AIA', '08AN', 'STARK 1', 'MASS', 'VICTORI0, R.A', 'RIZVI   A', '21029677', "3'CONNOR", 'Wong /  Lai', 'KAPLAN 3', 'GRAICHEN.JR./DAWN/DI', 'GROSSMAN ,PE,F.A.C.I']
    mapping2 = ['SHARMA ', "CONNOR", 'RUSHTON UEL', 'UDDIN Z', 'HINKLEY ', 'CONNOR P.E.', None, 'SAMUELS', 'CONNOR', 'CALIENDO', 'SMITH JR.', 'LO BUE', None, 'ETTIERI', 'SMITH', 'KAMEN', '.EE', 'MASS', '.EI', 'Zagaroli', 'RINI', 'KAMEN R', 'RYAN', 'SPIEZIA L S', 'MUFTIC.A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL P.E.', 'HAMA', 'HINLEY', None, "CONNELL", 'HURT JR.', 'WESOLOWSKI', 'CHEN', 'ING R.A', 'MARTARELLA', 'Gandhi', None, 'ENNIS ', 'COSTELLO R A A I A', None, None, 'HUR  JR', 'LEHR', 'KOHLER 111', 'GERAZOUNIS', 'Alexander', 'LUBOW R.A. LEED AP', 'RINI',None, None, 'CHAO R.A.', 'Geier', None, 'SOLOWSKI', None, 'HINKLEY', 'RUDIKOFF, P.E.', "CONNOR", 'SHAH EZ', 'MIELE JR. P.E.', 'RITTENHOUSE', 'AMADI   ISIOFIA', 'HINKLEY', 'RENFORE', "O'HARA,JR.", None, 'PHAGOO I', 'BRAY,', 'LLL', 'BHATHIA', 'GANDHI', 'KO K', 'VASSALOTTI', 'HURT JR.',None, 'RINI', 'PARIHAR', 'EE', None, 'ELISE', 'KING R.A', 'CHRYSLER  P E', 'LEHR', 'Walters Jr.', 'LEE', 'RINI  III', 'ANGELO', '0UDOLPH III', 'VIEHE-NAESS', 'MO', '08E', None, 'Yu,', None, 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', None, None, None, 'RYAN JR.', 'KOKORIS', 'ELISE', 'CONNELL', None, 'Lewis', 'CHELI', 'MOHAMMAD', 'METZLER  P E', 'BAILEY', 'GANDHI', 'TIEMANN', 'SMITH', 'DI GER0NIMO', 'GANDHI', 'III', 'J C', 'MAGAMI QAIM MAGAMI', None, 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', None, 'SMITH', 'KING R A FAIA', 'RYAN III AIA', None, 'STARK', 'MASS', 'VICTORI0 R.A', 'RIZVIA', None, "CONNOR", 'Wong Lai', 'KAPLAN', 'GRAICHEN.JR. DAWN DI', 'GROSSMAN']

    outlier3 = ['', '....DEMO', '050069', 'DEM. CONTR.,', 'XXXXX', 'G/C 10114H9', 'CGWC10114H99', '00', 'X S000155', '082-36-1245', 'G.G', 'LESSEE', '......GC', "'", '..OWNER', 'GC 2293', '--', 'XXXXXX', 'LS 31,721', '...GC', 'gen.cont.', 'G.C TK#4592', 'PE', 'RLA - 818', '.....OWNER', 'RLA 16077', 'G C', 'X 4129892', 'G. C.', 'R.L.A', 'GC 1028350', 'WC10114H99', 'LEESEE', 'GEN.CONT.', 'SIGN..HANGER', 'DEMO 20451', 'D8615', '.X', 'P.L.L.C', '..DEMO', 'G .C', 'L A', 'G.C NY11101', '32820', '....OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC99792', 'X 1341946', 'TRACK# 1390', 'EXPED.R4466', 'PLLC 9599691', 'G.C 1110101', '029649', '(CHECK)', 'DEM. CONTR,', 'EXPEDIT(H66172)', '.........GC', 'CITY OF N Y', 'GC 1170386', 'G. C', 'CO0OWNER', '(CHECKED)', 'C.C', '23392 1159774', 'DEMO {', 'RA', 'T. 31132', '....GC', 'RLA-787', 'TRACK #1390', 'D C', 'G.CONTR.', 'DEMO  CONT', '1GC', 'CC', 'demo G.C.', 'TRACK. #1390', 'M.F.S.P.C.', '...DEMO', 'DEMO G C', '13328', 'GEN  CONT', 'GC 1221073', "GC;'", 'DEMO 1341946', '11234', 'G.C.,', '.....GC', 'LIC.133668259 1', '?', '0WNER', 'C10892', 'GEN..CONT']
    mapping3 = [None, 'DEMO', None, 'DEM. CONTR', None, 'G/C', 'CGWC', None, 'X S', None, 'G.G', 'LESSEE', 'GC', None, 'OWNER', 'GC', None, None, 'LS ', 'GC', 'gen.cont.', 'G.C TK', 'PE', 'RLA ', 'OWNER', 'RLA ', 'G C', 'X', 'G. C.', 'R.L.A', 'GC', 'WC', 'LEESEE', 'GEN.CONT.', 'SIGN.HANGER', 'DEMO', None,None, 'P.L.L.C', 'DEMO', 'G.C', 'L A', 'G.C ', None, 'OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC', None, 'TRACK', 'EXPED.R', 'PLLC ', 'G.C', None, None, 'DEM. CONTR,', 'EXPEDIT', 'GC', None, 'GC', 'G.C', 'CO0OWNER', None, 'C.C', None, 'DEMO', 'RA', None, 'GC', 'RLA', None, 'D C', 'G.CONTR.', 'DEMO  CONT', 'GC', 'CC', 'demo G.C.', None, 'M.F.S.P.C.', 'DEMO', 'DEMO G C', None, 'GEN  CONT', 'GC ', "GC ", 'DEMO ', None, 'G.C.', 'GC', 'LIC', None, '0WNER',None, 'GEN.CONT']

    outlier4 = ['', '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '99998', '000N/A', '65569+', '01827O', 'R9526', 'LP0256', 'N/A', '1964', 'ISLAND', '1609', '000PW1', '00DEMO', '0688.6', '00000', '.20929', 'LP0258', '000TOR', '0D8615', '0SWITA', '818', 'O02200', 'DEMO', '196', '1075', '0000NT', '215', '0', '00000`', "D'ALTO", '0455', '22377', 'DD8615', '050579', '226', 'SWITA', 'DD6815', 'X02689']
    mapping4 = [None, '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '099998', '000000', '065569', '01827O', '0R9526', 'LP0256',None, '001964',None, '001609', '000PW1', '00DEMO', '006886', '000000', '020929', 'LP0258', '000TOR', '0D8615', '0SWITA', '000818', 'O02200', None, '000196', '001075', '0000NT', '000215', '000000', '000000', None, '000455', '022377', 'DD8615', '050579', '000226', None, 'DD6815', 'X02689']

    outliers = outlier1+ outlier2+ outlier3+ outlier4
    mappings = mapping1+ mapping2+ mapping3+ mapping4
    
    for col in data_cols:
        
        if col == "Owner鈥檚 House City":           
            continue
        
        df[col] = df[col].replace(outliers, mappings)
        '''

        if file != 'rbx6-tga4.tsv.gz':
            light_outliers = findDateOutliers(df, ds, col)

            for item in light_outliers:

                df[col] = df[col].replace(item, None)


        if file != 'hg8x-zxpr.tsv.gz' and col in name_cols:
            col_clusters = getClusters(df, ds, col)
            updateUsingClusters(df, ds, col, col_clusters, True)
         '''

        df[col] = df[col].replace(['N/A', '', 'NA','NONE'], [None,None,None,None])
        
    return data_cols    

In [127]:
def saveDf(df, datafile):
    outputpath = datafile[2:11]+'_cleaned_data_improved.csv'
    df.to_csv(outputpath,sep=',',index=False,header=True) 

In [128]:
def dataCleanOnDataset(file):
    
    print('working on file: ', file)
    datafile, df, ds = readData(file)
    
    cleaned_cols = []
    
    print('fixing ID_Number_Column......')
    cleaned_cols += fix_ID_Number_Column(df, ds)
    
    print('fixing Binary_Column......')
    cleaned_cols += fix_Binary_Column(df, ds)
    
    print('fixing Monetary_Column......')
    cleaned_cols += fix_Monetary_Column(df, ds)
    
    print('fixing Numerical_Column......')
    cleaned_cols += fix_Numerical_Column(df, ds)
    
    if file != 'w9ak-ipjd.tsv.gz':
        print('fixing datetime_Column......')
        cleaned_cols += fix_datetime_Column(df, ds)
    
    print('fixing city_and_name_Column......')
    cleaned_cols += fix_city_and_name_Column(df, ds, file)
    
    saveDf(df, datafile)
    
    return cleaned_cols, datafile, df

In [118]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

In [119]:
def precision_recall(cleaned_columns, datafile, df, sample_size = 50 ):
    df2  = pd.read_csv(datafile, dtype='object', sep='\t')
    df2 = df2.drop_duplicates()

    

    df_sample_data =  df2.sample(sample_size).copy()
    
    print('sample size: ',sample_size)
    
    print('total size: ',sample_size * len(cleaned_columns))
    print('======================\n\n')

    same = 0

    for col in cleaned_columns:
        print("column: ", col)
        print("Original,\t Cleaned\n")
        for i in range(sample_size):
            if df[col].iloc[i]== df2[col].iloc[i]:
                print(df2[col].iloc[i], '\t', df[col].iloc[i], '\t')
                same += 1
            else:
                print(df2[col].iloc[i], '\t', df[col].iloc[i], '\t*')

        print('*   ', same, ' same records   *\n')
        same = 0

        print('======================\n\n')

# Buildings-Selected-for-the-Alternative-Enforcement

In [72]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[2])
print(cleaned_cols)

working on file:  hcir-3275.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......
['BUILDING_ID', '# OF B/C VIOLATIONS AT START', 'AEP_START_DATE', 'DISCHARGE_DATE', 'BOROUGH']


In [74]:
precision_recall(cleaned_cols, datafile, df, 100)

sample size:  100
total size:  500


column:  BUILDING_ID
Original,	 Cleaned

118921 	 118921 	
338100 	 338100 	
354272 	 354272 	
148503 	 148503 	
309707 	 309707 	
214801 	 214801 	
527226 	 527226 	
104592 	 104592 	
78261 	 78261 	
387868 	 387868 	
326302 	 326302 	
327115 	 327115 	
129406 	 129406 	
315762 	 315762 	
373055 	 373055 	
328321 	 328321 	
52173 	 52173 	
297806 	 297806 	
91605 	 91605 	
334920 	 334920 	
62596 	 62596 	
360479 	 360479 	
379420 	 379420 	
314816 	 314816 	
432731 	 432731 	
311297 	 311297 	
355491 	 355491 	
129403 	 129403 	
327191 	 327191 	
22678 	 22678 	
309399 	 309399 	
337955 	 337955 	
284464 	 284464 	
68350 	 68350 	
325837 	 325837 	
359235 	 359235 	
298165 	 298165 	
768885 	 768885 	
166224 	 166224 	
145007 	 145007 	
297483 	 297483 	
342514 	 342514 	
371699 	 371699 	
288090 	 288090 	
320847 	 320847 	
394455 	 394455 	
222650 	 222650 	
535348 	 535348 	
332612 	 332612 	
380440 	 380440 	
949009 	 949009 	
373589 	 373589 

In [75]:
tp = 100
fp = 7

fn = 0
tn = 500 - tp - fp - fn

In [76]:
precision(tp, fp)

0.9345794392523364

In [77]:
recall(tp,fn)

1.0

# Housing-New-York-Units-by-Building

In [78]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[3])
print(cleaned_cols)

working on file:  hg8x-zxpr.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......
['Project ID', 'Building ID', 'Number', 'Middle Income Units', 'Extremely Low Income Units', 'Very Low Income Units', 'Low Income Units', 'Moderate Income Units', 'Middle Income Units', 'Other Income Units', 'Studio Units', '1-BR Units', '2-BR Units', '3-BR Units', '4-BR Units', '5-BR Units', '6-BR+ Units', 'Unknown-BR Units', 'Counted Rental Units', 'Counted Homeownership Units', 'All Counted Units', 'Total Units', 'Project Start Date', 'Project Completion Date', 'Building Completion Date', 'Project Name', 'Borough']


In [79]:
precision_recall(cleaned_cols, datafile, df)

sample size:  50
total size:  1350


column:  Project ID
Original,	 Cleaned

44223 	 44223 	
44223 	 44223 	
44223 	 44223 	
44223 	 44223 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
64543 	 64543 	
64543 	 64543 	
64543 	 64543 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
66909 	 66909 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
69280 	 69280 	
69428 	 69428 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69765 	 69765 	
69765 	 69765 	
*    50  same records   *



column:  Building ID
Original,	 Cleaned

927737 	 927737 	
969695 	 969695 	
975702 	 975702 	

21 	 21 	
19 	 19 	
6 	 6 	
13 	 13 	
13 	 13 	
6 	 6 	
6 	 6 	
5 	 5 	
8 	 8 	
6 	 6 	
10 	 10 	
12 	 12 	
3 	 3 	
*    50  same records   *



column:  4-BR Units
Original,	 Cleaned

0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
6 	 6 	
0 	 0 	
0 	 0 	
0 	 0 	
1 	 1 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
*    50  same records   *



column:  5-BR Units
Original,	 Cleaned

0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 	 0 	
0 

In [80]:
tp = 100
fp = 0

fn = 13
tn = 1350 - tp - fp - fn

In [81]:
precision(tp, fp)

1.0

In [82]:
recall(tp,fn)

0.8849557522123894

# DOB-NOW-Build-Approved-Permits

In [83]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[4])
print(cleaned_cols)

working on file:  rbx6-tga4.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......
['Job Filing Number', 'House No', 'C B NO', 'Apt/Condo No(s)', 'Applicant License #', 'Applicant Middle Name', 'Filing Representative Middle Initial', 'Estimated Job Costs', 'Approved Date', 'Issued Date', 'Expired Date', 'Street Name', 'Borough', 'Applicant First Name', 'Applicant Middle Name', 'Applicant Last Name', 'Applicant Business Name', 'Filing Representative First Name', 'Filing Representative Last Name', 'Filing Representative Business Name', 'Owner Business Name', 'Owner Name', 'Owner City']


In [84]:
precision_recall(cleaned_cols, datafile, df)

sample size:  50
total size:  1150


column:  Job Filing Number
Original,	 Cleaned

M00531234-I1 	 M00531234-I1 	
X00496654-I1 	 X00496654-I1 	
B00579209-I1 	 B00579209-I1 	
M00514656-I1 	 M00514656-I1 	
M00431823-I1 	 M00431823-I1 	
B00583762-I1 	 B00583762-I1 	
B00516232-I1 	 B00516232-I1 	
B00472417-I1 	 B00472417-I1 	
B29303378-I1 	 B29303378-I1 	
M00379533-I1 	 M00379533-I1 	
B00369280-I1 	 B00369280-I1 	
B00378717-I1 	 B00378717-I1 	
Q00284352-I1 	 Q00284352-I1 	
B00513806-I1 	 B00513806-I1 	
S00426773-I1 	 S00426773-I1 	
S00253862-I1 	 S00253862-I1 	
X00434717-I1 	 X00434717-I1 	
B00055891-I1 	 B00055891-I1 	
B00491585-I1 	 B00491585-I1 	
M00514118-I1 	 M00514118-I1 	
B00283296-I1 	 B00283296-I1 	
B00516235-I1 	 B00516235-I1 	
B00500645-I1 	 B00500645-I1 	
M00308545-I1 	 M00308545-I1 	
M00431432-I1 	 M00431432-I1 	
M00492347-I1 	 M00492347-I1 	
X00468553-I1 	 X00468553-I1 	
M00489394-I1 	 M00489394-I1 	
M00406969-I1 	 M00406969-I1 	
M00510532-I1 	 M00510532-I1 	
Q00513093-I1 	 Q

PEDERSON 	 PEDERSON 	
DIMAGGIO 	 DIMAGGIO 	
KELLY 	 KELLY 	
KOMAL 	 KOMAL 	
HE 	 HE 	
BRAUN 	 BRAUN 	
MITCHELL 	 MITCHELL 	
DOWNES 	 DOWNES 	
ASSOULINE 	 ASSOULINE 	
SINGH 	 SINGH 	
CONWAY 	 CONWAY 	
LUPINO 	 LUPINO 	
KELLY 	 KELLY 	
LEVINE 	 LEVINE 	
CHIN 	 CHIN 	
SCULLY 	 SCULLY 	
LUO 	 LUO 	
RIZWAN 	 RIZWAN 	
LINDGREN 	 LINDGREN 	
ZHANG 	 ZHANG 	
HE 	 HE 	
TSAMPAS 	 TSAMPAS 	
ROCKHILL 	 ROCKHILL 	
BOEGEMANN 	 BOEGEMANN 	
RUSI 	 RUSI 	
BRAGOLI 	 BRAGOLI 	
DOWNES 	 DOWNES 	
JACCARINO 	 JACCARINO 	
JUNG 	 JUNG 	
LINDGREN 	 LINDGREN 	
LOMBARDO 	 LOMBARDO 	
KLEIN 	 KLEIN 	
DIMAGGIO 	 DIMAGGIO 	
CHEN 	 CHEN 	
LOMBARDO 	 LOMBARDO 	
REDDY 	 REDDY 	
*    50  same records   *



column:  Applicant Business Name
Original,	 Cleaned

MANHATTAN D ENTERPRISES 	 MANHATTAN D ENTERPRISES 	
ULTIMATE SIGNS&DESIGNS CO 	 ULTIMATE SIGNS&DESIGNS CO 	
MANHATTAN D ENTERPRISES 	 MANHATTAN D ENTERPRISES 	
CS BRIDGE CORP 	 CS BRIDGE CORP 	
ALL CITY MECHANICAL INC 	 ALL CITY MECHANICAL INC 	
L.S. SIGN CO., INC 	

In [88]:
tp = 273
fp = 50

fn = 86
tn = 1150 - tp - fp - fn

In [89]:
precision(tp, fp)

0.8452012383900929

In [90]:
recall(tp,fn)

0.7604456824512534

# Historical DOB Permit Issuance

In [134]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[0])
print(cleaned_cols)

working on file:  bty7-2jhb.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......
['Number', 'Job #', 'Job doc. #', 'Residential', 'Permit Sequence #', "Permittee's Phone #", "Permittee's License #", "Owner's House #", "Owner's Phone #", 'Filing Date', 'Issuance Date', 'Expiration Date', 'Job Start Date', 'DOBRunDate', 'BOROUGH', "Permittee's First Name", "Permittee's Last Name", "Permittee's Business Name", "Site Safety Mgr's First Name", "Site Safety Mgr's Last Name", 'Site Safety Mgr Business Name', 'Superintendent First & Last Name', 'Superintendent Business Name', "Owner's Business Name", "Owner's First Name", "Owner's Last Name", "Owner's House Street Name", 'Owner鈥檚 House City']


In [140]:
cleaned_cols.remove('Owner鈥檚 House City')
precision_recall(cleaned_cols, datafile, df)

sample size:  50
total size:  1350


column:  Number
Original,	 Cleaned

2960 	 2960 	
100 	 100 	
1898 	 1898 	
1998 	 1998 	
565 	 565 	
606 	 606 	
730 	 730 	
345 	 345 	
4487 	 4487 	
575 	 575 	
69 	 69 	
1400 	 1400 	
444 	 444 	
140 	 140 	
1 	 1 	
921 	 921 	
1214 	 1214 	
840 	 840 	
2165 	 2165 	
968 	 968 	
465 	 465 	
1300 	 1300 	
1926 	 1926 	
730 	 730 	
3245 	 3245 	
2 	 2 	
2350 	 2350 	
1318 	 1318 	
1100 	 1100 	
1624 	 1624 	
3269 	 3269 	
730 	 730 	
140 	 140 	
1845 	 1845 	
2225 	 2225 	
1368 	 1368 	
755 	 755 	
6305 	 6305 	
100 	 100 	
4453 	 4453 	
2614 	 2614 	
92 	 92 	
1125 	 1125 	
1750 	 1750 	
3469 	 3469 	
1980 	 1980 	
120 	 120 	
1385 	 1385 	
217 	 217 	
2870 	 2870 	
*    50  same records   *



column:  Job #
Original,	 Cleaned

201088492 	 201088492 	
200716298 	 200716298 	
200974650 	 200974650 	
200278118 	 200278118 	
201119173 	 201119173 	
200089251 	 200089251 	
200896762 	 200896762 	
201015613 	 201015613 	
200348524 	 200348524 	
20092

nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
*    0  same records   *



column:  Site Safety Mgr's Last Name
Original,	 Cleaned

nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
*    0  same records   *



column:  Site Safety Mgr Business Name
Original,	 Cleaned

nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan

In [162]:
tp = 253
fp = 6

fn = 47
tn = 1350 - tp - fp - fn

In [163]:
precision(tp, fp)

0.9768339768339769

In [164]:
recall(tp,fn)

0.8433333333333334

# DOB NOW: Electrical Permit Applications

In [98]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[1])
print(cleaned_cols)

working on file:  dm9a-ab7w.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing datetime_Column......
fixing city_and_name_Column......
['AMOUNT_PAID', 'FILING_DATE', 'GENERAL_LIABILITY_EXPIRATION_DATE', 'WORKER_COMP_EXPIRATION_DATE', 'DISABILITY_EXPIRATION_DATE', 'PERMIT_ISSUED_DATE', 'JOB_START_DATE', 'COMPLETION_DATE', 'STREET_NAME', 'BOROUGH', 'APPLICANT_FIRST_NAME', 'APPLICANT_LAST_NAME', 'FIRM_NAME', 'CITY', 'WORKER_COMP_COMPANY_NAME', 'DISABILITY_COMPANY_NAME', 'OWNER_FIRST_NAME', 'OWNER_LAST_NAME', 'BUSINESS_NAME', 'OWNER_CITY', 'AUTH_REP_FIRST_NAME', 'AUTH_REP_LAST_NAME', 'GIS_NTA_NAME']


In [99]:
precision_recall(cleaned_cols, datafile, df)

sample size:  50
total size:  1150


column:  AMOUNT_PAID
Original,	 Cleaned

40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
15 	 15 	
40 	 40 	
40 	 40 	
40 	 40 	
15 	 15 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
440 	 440 	
40 	 40 	
40 	 40 	
40 	 40 	
40 	 40 	
15 	 15 	
40 	 40 	
40 	 40 	
15 	 15 	
40 	 40 	
40 	 40 	
*    50  same records   *



column:  FILING_DATE
Original,	 Cleaned

06/02/2021 12:00:00 AM 	 06/02/2021 12:00:00 AM 	
04/27/2021 12:00:00 AM 	 04/27/2021 12:00:00 AM 	
06/02/2021 12:00:00 AM 	 06/02/2021 12:00:00 AM 	
06/02/2021 12:00:00 AM 	 06/02/2021 12:00:00 AM 	
06/02/2021 12:00:00 AM 	 06/02/2021 12:00:00 AM 	
07/27/2021 12:00:00 AM 	 07/27/2021 12:00:00 AM 	
06/02/2021 12:00:00 AM 	 06/02/2021 12:00:00 AM 	


NEW YORK STATE INS FUND 	 NEW YORK STATE INS FUND 	
PROPERTY &amp; CASUALTY INS C 	 PROPERTY &amp; CASUALTY INS C 	
AMTRUST INS COMPANY KANSA 	 AMTRUST INS COMPANY KANSA 	
NEW YORK STATE INS FUND 	 NEW YORK STATE INS FUND 	
GUARD INSURANCE COMPANY 	 GUARD INSURANCE COMPANY 	
NORGUARD INS COMPANY 	 NORGUARD INS COMPANY 	
NY STATE INS FUND 	 NY STATE INS FUND 	
NYS INSURANCE FUND 	 NYS INSURANCE FUND 	
UTICA NATIONAL INS COMPAN 	 UTICA NATIONAL INS COMPAN 	
UTICA NATIONAL INSURANCE 	 UTICA NATIONAL INSURANCE 	
NORGUARD INSURANCE COMPAN 	 NORGUARD INSURANCE COMPAN 	
TRAVELERS INDEMNITY CO 	 TRAVELERS INDEMNITY CO 	
NORGUARD INS COMPANY 	 NORGUARD INS COMPANY 	
AMTRUST INS COMPANY 	 AMTRUST INS COMPANY 	
NEW YORK STATE COMP 	 NEW YORK STATE COMP 	
WESCO INSURANCE CO 	 WESCO INSURANCE CO 	
TRAVELERS INS COMPANY 	 TRAVELERS INS COMPANY 	
NORGUARD INSURANCE COMPAN 	 NORGUARD INSURANCE COMPAN 	
NEW YORK STATE FUND 	 NEW YORK STATE FUND 	
*    50  same records   *



column:  DISABILITY_COMPANY

In [106]:
tp = 178
fp = 25

fn = 19
tn = 1150 - tp - fp - fn

In [107]:
precision(tp, fp)

0.8768472906403941

In [108]:
recall(tp,fn)

0.9035532994923858

# DOB NOW: Build – Job Application Filings

In [129]:
cleaned_cols, datafile, df = dataCleanOnDataset(file_list[5])
print(cleaned_cols)

working on file:  w9ak-ipjd.tsv.gz
fixing ID_Number_Column......
fixing Binary_Column......
fixing Monetary_Column......
fixing Numerical_Column......
fixing city_and_name_Column......
['Job Filing Number', 'House No', 'Apt./Condo No(s)', 'Applicant License #', 'Applicants Middle Initial', 'Filing Representative Middle Initial', 'Proposed No of Stories', 'Special Inspection Agency Number', 'Sidewalk Shed (Work Type)', 'Initial Cost', 'Existing Height', 'Existing Dwelling Units', 'Proposed Height', 'Proposed Dwelling Units', 'Street Name', 'Borough', 'Applicant First Name', 'Applicant Last Name', "Owner's Business Name", "Owner's Street Name", 'City', 'Filing Representative First Name', 'Filing Representative Last Name', 'Filing Representative Business Name', 'Filing Representative Street Name', 'Filing Representative City']


In [130]:
precision_recall(cleaned_cols, datafile, df)

sample size:  50
total size:  1300


column:  Job Filing Number
Original,	 Cleaned

B00000471-I1 	 B00000471-I1 	
B00004656-I1 	 B00004656-I1 	
B00008377-I1 	 B00008377-I1 	
B00015822-P1 	 B00015822-P1 	
B00016519-P1 	 B00016519-P1 	
B00024365-I1 	 B00024365-I1 	
B00032357-P1 	 B00032357-P1 	
B00027541-I1 	 B00027541-I1 	
B00031890-I1 	 B00031890-I1 	
B00033883-I1 	 B00033883-I1 	
B00033891-I1 	 B00033891-I1 	
B00032362-P2 	 B00032362-P2 	
B00045931-I1 	 B00045931-I1 	
B00046640-I1 	 B00046640-I1 	
B00050787-I1 	 B00050787-I1 	
B00050354-I1 	 B00050354-I1 	
B00054966-I1 	 B00054966-I1 	
B00056750-P2 	 B00056750-P2 	
B00051815-P1 	 B00051815-P1 	
B00068075-I1 	 B00068075-I1 	
B00060201-P1 	 B00060201-P1 	
B00076107-P1 	 B00076107-P1 	
S00422528-P1 	 S00422528-P1 	
B00108318-P1 	 B00108318-P1 	
B00108318-P2 	 B00108318-P2 	
B00088459-P1 	 B00088459-P1 	
B00091099-I1 	 B00091099-I1 	
B00120682-P1 	 B00120682-P1 	
B00097254-I1 	 B00097254-I1 	
B00114896-P1 	 B00114896-P1 	
B00135241-P1 	 B

1 	 1 	
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
nan 	 nan 	*
522 	 522 	
0 	 0 	
0 	 0 	
nan 	 nan 	*
21 	 21 	
1 	 1 	
1 	 1 	
1 	 1 	
1 	 1 	
25 	 25 	
2 	 2 	
2 	 2 	
249 	 249 	
249 	 249 	
15 	 15 	
15 	 15 	
*    16  same records   *



column:  Street Name
Original,	 Cleaned

PORTER AVENUE 	 PORTER AVENUE 	
CROWN STREET 	 CROWN STREET 	
CORTELYOU ROAD 	 CORTELYOU ROAD 	
GLENMORE AVENUE 	 GLENMORE AVENUE 	
GLENMORE AVENUE 	 GLENMORE AVENUE 	
SEELEY STREET 	 SEELEY STREET 	
BALTIC STREET 	 BALTIC STREET 	
VERNON AVENUE 	 VERNON AVENUE 	
BOX STREET 	 BOX STREET 	
CONEY ISLAND AVENUE 	 CONEY ISLAND AVENUE 	
CONEY ISLAND AVENUE 	 CONEY ISLAND AVENUE 	
BALTIC STREET 	 BALTIC STREET 	
48 STREET 	 48 STREET 	
4 AVENUE 	 4 AVENUE 	
5 AVENUE 	 5 AVENUE 	
NEW YORK AVENUE 	 NEW YORK AVENUE 	
6 STREET 	 6 STREET 	
1 AVENUE 	 1 AVENUE 	
STERLING PLACE 	 STERLING PLACE 	
FULTON STREET 	 FULTON STREET 	
PA

In [131]:
tp = 452
fp = 27

fn = 18
tn = 1300 - tp - fp - fn

In [132]:
precision(tp, fp)

0.9436325678496869

In [133]:
recall(tp,fn)

0.9617021276595744