## load in cddb table

In [129]:
import numpy as np
import pandas as pd
 

data = pd.read_csv("../../cleaning_stages/6_openrefine_drop_bad/cddb-4-drop.tsv", sep='\t')
tracks = pd.read_csv("../../cleaning_stages/6_openrefine_drop_bad/cddb-tracks.tsv", sep='\t')


# printing data
print(data)

                              artist                category   
0                    Backstreet Boys                   Blues  \
1                          No Return                    Data   
2          Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å                     Pop   
3                            Emanuel  Classic Portuguese Pop   
4                   Luv Lite Massive              Electronic   
...                              ...                     ...   
7658  Various Artists - Notting Hill                   Blues   
7659                          Lenine                   Blues   
7660                      Ben Harper                   Blues   
7661                       Alvin Lee                   Blues   
7662                 Manuel Barrueco                   Blues   

                       genre                                         title   
0                        Pop                                    Millennium  \
1                       Data                               Self Mutilation 

In [130]:
# Removing merged_values column becasue it is all null
data.drop('merged_values', axis=1, inplace=True)

# outpus test
print(data)

                              artist                category   
0                    Backstreet Boys                   Blues  \
1                          No Return                    Data   
2          Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å                     Pop   
3                            Emanuel  Classic Portuguese Pop   
4                   Luv Lite Massive              Electronic   
...                              ...                     ...   
7658  Various Artists - Notting Hill                   Blues   
7659                          Lenine                   Blues   
7660                      Ben Harper                   Blues   
7661                       Alvin Lee                   Blues   
7662                 Manuel Barrueco                   Blues   

                       genre                                         title   
0                        Pop                                    Millennium  \
1                       Data                               Self Mutilation 

## Methods

In [131]:
# this method is a custom value checking
# returns true if input is not unit code
# returns false if input should be deleted

def check_mangled_code(string):
    
    return_var = True
    check_string = string.lower().strip()

    # accptable characters
    # mainly want to catch mangled unit code
    # tried to allow some foreign language
    ok = dict.fromkeys('''0123456789qwertyuioplkjhgfdsazxcvbnm?¦%!±‡ªš½“Ãã /'°~.,-()‰&:;±®ã­ã¨[*]_ãº´+¡ÿ©¶ã¤¸\\#`³²¹â§º¥¼¾''')
    
    # check if ?????
    # two ?? in a row is mangled code
    if "??" in check_string:
        return_var = False
    
    # check if mangled
    elif all(c in ok for c in check_string) == False:
        return_var = False

    return return_var

In [132]:
good_test = "BackstreetBoys,Blues,Pop,Millennium,1000,1999,10000"
bad_test = "Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å,Pop,Pop,Ã¦æ’â³ã£?â€žã¥â€¡âºã£?â®ã£?â€¹ã£?â€˜ã£â€šâ€°,1003,1989,100003"
another_test = "LesjaskogTrekkspillklubb,Folk,Gammeldans,TrekkspilltreffISã¸ristua,1273,1998,100282"
print(check_mangled_code(good_test))
print(check_mangled_code(bad_test))
print(check_mangled_code(another_test))

True
False
True


In [133]:
def unit_code_repair(df):
    rows_to_drop = []
    
    for i, row in df.iterrows():
        # 1. get row values as string
        row_str = df.iloc[i,:].to_string(header=False, index=False)
        row_str = row_str.replace(" ", "").replace('\n', ',').replace('\r', ',').replace('\t', '').strip()
        
        # 2. condition checking
        if check_mangled_code(row_str) == False:
            # 3. remove from df - add to dictionary
            rows_to_drop.append(i)
            print(row_str)

    return df, rows_to_drop
    

## running method

In [134]:
# test_data = data.head(3)
# clean_data_test, rr1_t = unit_code_repair(test_data)
# # output below shows what was cleaned
# print("Test Data removed: " + str(len(rr1_t)) + " rows")

clean_data, rr1 = unit_code_repair(data)
print("------------------------------------------------")
clean_tracks, rr2 = unit_code_repair(tracks)

# output below shows what was cleaned
print("Data removed: " + str(len(rr1)) + " rows")
print("Tracks removed: " + str(len(rr2)) + " rows")

Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å,Pop,Pop,Ã¦æ’â³ã£?â€žã¥â€¡âºã£?â®ã£?â€¹ã£?â€˜ã£â€šâ€°,1003,1989,100003
V.A,Ccm,Ccm,Ãƒã âºâ¹(cd09),1024,2001,100027
Ã¥â±â±ã¥?â£ã§â„¢â¾ã¦?âµ,Jpop,Jpop,Disc13Ã¨å â±ã£?â€“ã£?â€¹ã£â€šå,1027,2003,100030
Ã¥?â€šã¦å“â¬ã§å“å¸ã§â¶â¾,Pop,Pop,Lucy,1030,2001,100034
V.A.,Ã£â€šâµã£â€šâ¦ã£æ’â³ã£æ’â€°ã£æ’ë†ã£æ’â©ã£æ’æ’ã£...,Ã£â€šâµã£â€šâ¦ã£æ’â³ã£æ’â€°ã£æ’ë†ã£æ’â©ã£æ’æ’ã£...,BeatingSurviver-another-,1039,2002,100043
Ã¥â±â±ã¥â´å½ã£?â¾ã£?â€¢ã£â€šë†ã£?â€”,Jpop,Jpop,Home,1047,1997,100051
Ã¤ã®ã«ã¥ã°ã¤Ã¹ã¬Ã¤ã¢ã¸ã¥ã¡,Data,Data,Ã¤ã£ã¢Ã°ã§ã¹,1049,2019,100054
Ã©ë†â´ã¦å“â¨ã£?â€šã£?â¿,Jpop,Jpop,InfinityEighteenVol.1,1052,2000,100057
Ã§å¸â¢ã¦â²â¢ã¦â°â¸ã¥?â€°,Rock,Rock,SuperLiveÃ¦â€”â¥ã¦å“â¬ã¦â­â¦ã©?â€œã©â¤â¨Disc1,1054,1990,100059
Â¿ã€ããÿâ±ã¢ãˆâ¹,ContemporaryChristianMusic,ContemporaryChristianMusic,Â»ãµâºâ®â±ã¢âµâµâ¸â¦Ã€â§ã‡ã‘Â°ã¦ã€â½â¾ã‡6,1065,2002,100071
Ã±ã¡ã®ã°ã­ã¨ãª,Pop,Pop,Absolutãã›ã‰Ã•ãˆã’2,1068,2003,100074
Ã£?â€¢ã£?â ã£?â¾ã£?â€¢ã£?â€”,Folk,Folk,Ã¤â¸â‚¬ã¤âºâºã§â„¢â¾ã¦â­å’Ã¥â·â»ã¤âºå

In [135]:
# before
print(len(clean_data))

## remove indexes from dataframes according to rr1 and rr2
clean_data.drop(clean_data.index[rr1], inplace=True)
clean_tracks.drop(clean_tracks.index[rr2], inplace=True)

#after
print(len(clean_data))

7663
7332


In [136]:
# renaming tracks column to album_id for uniformity
clean_data.rename(columns={"tracks": "album id"})

Unnamed: 0,artist,category,genre,title,album id,year,id
0,Backstreet Boys,Blues,Pop,Millennium,1000,1999,10000
1,No Return,Data,Data,Self Mutilation,1002,2019,100002
3,Emanuel,Classic Portuguese Pop,Classic Portuguese Pop,Felicidade,1004,1998,100004
4,Luv Lite Massive,Electronic,Electronic,One Love,1006,2014,100006
5,Various Artists,Electronic,Electronic,Nop Pop 2,1007,2002,100007
...,...,...,...,...,...,...,...
7658,Various Artists - Notting Hill,Blues,Soundtrack,Notting Hill,10758,1999,9838
7659,Lenine,Blues,Samba,Na Pressao,10759,1999,9863
7660,Ben Harper,Blues,Blues,The Will To Live,10760,1997,9868
7661,Alvin Lee,Blues,Blues,I Hear You Rockin',10761,1993,9876


## Export to TSV

In [137]:
clean_data.to_csv('../../cleaning_stages/7_unit_code_repair/cddb_5.tsv', sep="\t", index=False)
tracks.to_csv('../../cleaning_stages/7_unit_code_repair/cddb_tracks_2.tsv', sep="\t", index=False)