In [11]:
import pandas as pd
 
# Load three snapshots of the dataset. Every file is tab-separated, and the
# relative paths are resolved against the notebook's working directory.
# Snapshot from the '1_openrefine' stage directory (earliest stage loaded here).
stage_1 = pd.read_csv('../../cleaning_stages/1_openrefine/cddb-tsv.tsv', sep='\t')
# Snapshot from the '5_genre_repair' stage directory.
stage_5 = pd.read_csv('../../cleaning_stages/5_genre_repair/cddb_4.tsv', sep='\t')
# Snapshot from the '6_openrefine_drop_bad' stage directory (final dataset).
final_drop = pd.read_csv('../../cleaning_stages/6_openrefine_drop_bad/cddb-4-drop.tsv', sep='\t')

## IC 1: Null Year

In [12]:
def null_years(df):
    """Return the rows of ``df`` whose ``year`` value is missing.

    The result keeps the original index and all columns; it is empty when
    every row has a year.
    """
    year_is_missing = df['year'].isna()
    return df[year_is_missing]

def null_years_percent(df):
    """Return the fraction of rows in ``df`` whose ``year`` is missing.

    Despite the name, the result is a fraction between 0.0 and 1.0, not a
    value scaled to 100.

    Args:
        df: DataFrame with a ``year`` column.

    Returns:
        float: number of rows with a missing year divided by the total number
        of rows; 0.0 for an empty frame.
    """
    total_rows = len(df)
    if total_rows == 0:
        # An empty frame has no rows to violate the constraint; avoid the
        # ZeroDivisionError the plain ratio would raise.
        return 0.0
    # Count the missing values directly instead of materializing a filtered frame.
    return int(df['year'].isna().sum()) / total_rows

In [13]:
# Fraction of rows with a missing year in the stage 1 and stage 5 snapshots.
null_years_stage_1, null_years_stage_5 = [
    null_years_percent(snapshot) for snapshot in (stage_1, stage_5)
]

# Values are fractions (0-1) rounded to two decimals, one report line each.
print(
    f'stage 1 percent of null years: {round(null_years_stage_1, 2)}',
    f'stage 5 percent of null years: {round(null_years_stage_5, 2)}',
    f'Difference: {round(null_years_stage_1 - null_years_stage_5, 2)}',
    sep='\n',
)

stage 1 percent of null years: 0.46
stage 5 percent of null years: 0.21
Difference: 0.25


## IC 2: Null Genre

In [14]:
def null_genre(df):
    """Return the rows of ``df`` whose ``genre`` value is missing.

    The result keeps the original index and all columns; it is empty when
    every row has a genre.
    """
    genre_is_missing = df['genre'].isna()
    return df[genre_is_missing]

def null_genre_percent(df):
    """Return the fraction of rows in ``df`` whose ``genre`` is missing.

    Despite the name, the result is a fraction between 0.0 and 1.0, not a
    value scaled to 100.

    Args:
        df: DataFrame with a ``genre`` column.

    Returns:
        float: number of rows with a missing genre divided by the total number
        of rows; 0.0 for an empty frame.
    """
    total_rows = len(df)
    if total_rows == 0:
        # An empty frame has no rows to violate the constraint; avoid the
        # ZeroDivisionError the plain ratio would raise.
        return 0.0
    # Count the missing values directly instead of materializing a filtered frame.
    return int(df['genre'].isna().sum()) / total_rows

In [15]:
# Fraction of rows with a missing genre in the stage 1 and stage 5 snapshots.
null_genre_stage_1, null_genre_stage_5 = [
    null_genre_percent(snapshot) for snapshot in (stage_1, stage_5)
]

# Values are fractions (0-1) rounded to two decimals, one report line each.
print(
    f'stage 1 percent of null genre: {round(null_genre_stage_1, 2)}',
    f'stage 5 percent of null genre: {round(null_genre_stage_5, 2)}',
    f'Difference: {round(null_genre_stage_1 - null_genre_stage_5, 2)}',
    sep='\n',
)

stage 1 percent of null genre: 0.35
stage 5 percent of null genre: 0.0
Difference: 0.35


## IC 3: Years with 9999 

In [16]:
def exists_9999_years(df):
    """Return the rows of ``df`` whose ``year`` equals 9999.

    The comparison is numeric. Comparing against the string ``'9999'`` only
    matches when the column holds strings; if pandas parsed ``year`` as an
    int or float column, the string comparison is False for every row and
    the check would pass vacuously.

    Args:
        df: DataFrame with a ``year`` column (numeric or string values).

    Returns:
        DataFrame: the matching rows, with the original index and columns;
        empty when no row has year 9999.
    """
    # Non-numeric entries become NaN, which never compares equal to 9999.
    year_as_number = pd.to_numeric(df['year'], errors='coerce')
    return df[year_as_number == 9999]

In [17]:
# Show any rows of the final dataset flagged by exists_9999_years (expected: none).
print(exists_9999_years(final_drop))

Empty DataFrame
Columns: [artist, category, genre, title, tracks, year, id, merged_values]
Index: []


## Dropping Rows

In our final OpenRefine pass, we dropped the rows that could not be repaired.

We ended up having to drop about 22% of the data, which is significantly better than the 50% we would have had to drop before cleaning.

In [18]:
# Row counts before (stage 5) and after (final) the drop step.
print(f'Stage 5 Count: {len(stage_5)}')
print(f'Final Count: {len(final_drop)}')
print(f'Number of Nulls Dropped: {len(stage_5) - len(final_drop)}')
# Format with an explicit precision: multiplying the rounded fraction by 100
# can otherwise expose float artifacts (e.g. 0.07 * 100 -> 7.000000000000001).
print(f'Loss Percent: {round(1.0 - (len(final_drop) / len(stage_5)), 2) * 100:.1f}%')

Stage 5 Count: 9763
Final Count: 7663
Number of Nulls Dropped: 2100
Loss Percent: 22.0%


In [19]:
# Checks applied by run_tests: each takes a DataFrame and returns the rows that
# violate the constraint, so an empty result means the constraint holds.
integrity_constraints = [null_years, null_genre, exists_9999_years]

def run_tests(df, constraints=None):
    """Run integrity-constraint checks against ``df`` and print one line per check.

    Each check is a callable that takes a DataFrame and returns the DataFrame
    of violating rows. A check passes when that result is empty. Output lines
    have the form ``<function name> .... PASS`` or ``<function name> .... FAIL``.

    Args:
        df: DataFrame to validate.
        constraints: optional iterable of check callables. When omitted, the
            module-level ``integrity_constraints`` list is used.

    Returns:
        None. Results are only printed.
    """
    # Resolve the default at call time so later edits to the list are honoured.
    checks = integrity_constraints if constraints is None else constraints
    for ic in checks:
        result = "PASS" if ic(df).empty else "FAIL"
        print(f'{ic.__name__} .... {result}')

In [20]:
# Run every integrity constraint against the final dataset.
run_tests(final_drop)

null_years .... PASS
null_genre .... PASS
exists_9999_years .... PASS
