In [1]:
import pandas as pd
import csv
import io

In [63]:
class CSVCleaner:
    ''' Load a csv file and strip duplicate and malformed rows from it.

    The file is read once on construction. Every physical line after the
    header becomes one tuple of string fields in ``raw_data``; the
    cleaning methods work on ``cleaned`` and return ``self`` so they can
    be chained in any order.

    Attributes:
        filename: path handed to the constructor.
        header: list of column names taken from the first line.
        raw_data: list of field tuples exactly as read (never modified).
        cleaned: list of field tuples after the cleaning steps run so far.
        num_duplicates: rows dropped by the latest remove_duplicates() call.
        num_corrupt: rows dropped by the latest remove_corrupt() call.
    '''
    def __init__(self, csvfile):
        self.filename = csvfile
        self.header = []
        self.raw_data = []
        self.cleaned = None
        # Counters start at zero so reporting code can read them even when
        # the matching cleaning step was never run.
        self.num_duplicates = 0
        self.num_corrupt = 0

        # Read raw csv data
        self.read_raw_csv()

    @staticmethod
    def _parse_line(line):
        ''' Split one physical line into a tuple of string fields.

        The csv module is used so that commas inside quoted fields do not
        count as separators. Each line is parsed on its own, so a stray
        quote can only affect the line it appears on.
        '''
        try:
            return tuple(next(csv.reader([line]), ()))
        except csv.Error:
            # Lines the csv parser rejects outright are split naively; the
            # column count check decides later whether they are usable.
            return tuple(line.split(','))

    def read_raw_csv(self):
        ''' Read the csv file into ``header``, ``raw_data`` and ``cleaned``.

        The first line is taken as the header; an empty file yields an
        empty header and no rows. Nothing is returned - results are stored
        on the instance, and ``cleaned`` is reset to a copy of ``raw_data``.
        '''
        # 'utf-8-sig' drops a leading byte order mark if there is one, so it
        # cannot end up inside the first column name. Iterating the file
        # splits on real line endings only, unlike str.splitlines().
        with io.open(self.filename, newline=None, encoding='utf-8-sig') as f:
            lines = [line.rstrip('\n') for line in f]

        if lines:
            self.header = list(self._parse_line(lines[0]))
            self.raw_data = [self._parse_line(line) for line in lines[1:]]
        else:
            self.header = []
            self.raw_data = []

        # Copy, so that ``cleaned`` never shares a list object with ``raw_data``.
        self.cleaned = list(self.raw_data)

    def remove_duplicates(self):
        ''' Remove all duplicate lines in the data.

        The first occurrence of every row is kept and row order is
        preserved. ``num_duplicates`` is set to the number of rows dropped
        by this call. Returns ``self`` for chaining.
        '''
        # dict keys are unique and keep insertion order.
        unique_rows = list(dict.fromkeys(self.cleaned))
        self.num_duplicates = len(self.cleaned) - len(unique_rows)
        self.cleaned = unique_rows

        return self

    def remove_corrupt(self):
        ''' Remove all corrupt lines in the data.

        Corrupt lines are defined for us as those that will
        not fit in a dataframe - that is, those that have a different
        column count as our header. ``num_corrupt`` is set to the number
        of rows dropped by this call. Returns ``self`` for chaining.
        '''
        col_count = len(self.header)

        valid_rows = [row for row in self.cleaned if len(row) == col_count]
        self.num_corrupt = len(self.cleaned) - len(valid_rows)
        self.cleaned = valid_rows

        return self

    def dataframe(self):
        ''' Return a pandas dataframe of the cleaned data.

        Columns are named after the header and every value is a string.

        Raises:
            ValueError: if any remaining row has a different column count
                than the header (run remove_corrupt() first).
        '''
        rows = self.cleaned if self.cleaned is not None else []
        col_count = len(self.header)

        num_misfits = sum(1 for row in rows if len(row) != col_count)
        if num_misfits:
            raise ValueError(
                '%d row(s) do not match the %d header column(s); '
                'call remove_corrupt() before dataframe()'
                % (num_misfits, col_count)
            )

        return pd.DataFrame(rows, columns=self.header)

    def __repr__(self):
        return "CSVCleaner(file='%s')" % (self.filename)

In [72]:
# Input file shared by both cleaning runs below.
csvfile = 'dirty_sample_small.csv'

# Apply the two cleaning steps in both possible orders so the per-step
# counts can be compared afterwards.
dup_corr_cleaner = (
    CSVCleaner(csvfile)
    .remove_duplicates()
    .remove_corrupt()
)
corr_dup_cleaner = (
    CSVCleaner(csvfile)
    .remove_corrupt()
    .remove_duplicates()
)

In [85]:
def get_stats(cleaner, title):
    ''' Print a short summary of what a cleaner did to its data.

    Prints ``title`` first, then one line each for the number of raw
    rows, the number of rows left after cleaning, and the duplicate and
    corrupt counts stored on ``cleaner``. Nothing is returned.
    '''
    print(title)

    # Each value is looked up only when its line is printed, so lines are
    # emitted strictly in order.
    report_lines = (
        ('Total entries: ', lambda: len(cleaner.raw_data)),
        ('Cleaned entries: ', lambda: len(cleaner.cleaned)),
        ('Number of duplicates removed:', lambda: cleaner.num_duplicates),
        ('Number of corrupt entries removed: ', lambda: cleaner.num_corrupt),
    )
    for label, fetch_value in report_lines:
        print(label, fetch_value())

In [86]:
# Summary for the run that dropped duplicates before corrupt rows.
get_stats(cleaner=dup_corr_cleaner, title='Duplicates removed first, then corrupt:\n')

Duplicates removed first, then corrupt:

Total entries:  661486
Cleaned entries:  49981
Number of duplicates removed: 604264
Number of corrupt entries removed:  7241


In [87]:
# Summary for the run that dropped corrupt rows before duplicates.
get_stats(cleaner=corr_dup_cleaner, title='Corrupt removed first, then duplicates:\n')

Corrupt removed first, then duplicates:

Total entries:  661486
Cleaned entries:  49981
Number of duplicates removed: 595797
Number of corrupt entries removed:  15708
