## Load the CDDB tables

In [66]:
import numpy as np
import pandas as pd
import re
 

# Inputs are the tab-separated files written by the previous cleaning stage.
data_path = "../../cleaning_stages/6_openrefine_drop_bad/cddb-4-drop.tsv"
tracks_path = "../../cleaning_stages/6_openrefine_drop_bad/cddb-tracks.tsv"

data = pd.read_csv(data_path, sep="\t")
tracks = pd.read_csv(tracks_path, sep="\t")

# Show the album table that was just loaded.
print(data)

                              artist                category   
0                    Backstreet Boys                   Blues  \
1                          No Return                    Data   
2          Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å                     Pop   
3                            Emanuel  Classic Portuguese Pop   
4                   Luv Lite Massive              Electronic   
...                              ...                     ...   
7658  Various Artists - Notting Hill                   Blues   
7659                          Lenine                   Blues   
7660                      Ben Harper                   Blues   
7661                       Alvin Lee                   Blues   
7662                 Manuel Barrueco                   Blues   

                       genre                                         title   
0                        Pop                                    Millennium  \
1                       Data                               Self Mutilation 

In [67]:
# The merged_values column is entirely null, so remove it from the album table.
# The drop is in place: re-running this cell on the same kernel raises KeyError
# because the column is already gone.
data.drop(columns=["merged_values"], inplace=True)

# Print the frame again to confirm the column was removed.
print(data)

                              artist                category   
0                    Backstreet Boys                   Blues  \
1                          No Return                    Data   
2          Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å                     Pop   
3                            Emanuel  Classic Portuguese Pop   
4                   Luv Lite Massive              Electronic   
...                              ...                     ...   
7658  Various Artists - Notting Hill                   Blues   
7659                          Lenine                   Blues   
7660                      Ben Harper                   Blues   
7661                       Alvin Lee                   Blues   
7662                 Manuel Barrueco                   Blues   

                       genre                                         title   
0                        Pop                                    Millennium  \
1                       Data                               Self Mutilation 

In [68]:
# Pre-processing step for the tracks table.
# Many values in the "Name" column contain a junk "??? /" marker while the rest
# of the string looks fine, so only the marker is removed here.
#
# The result has to be written back to the frame: str.replace returns a new
# string and never modifies the original. If the marker is left in, the name
# still contains "??" and check_mangled_code flags the whole row as mangled.
track_names = tracks["Name"]

# na=False leaves missing names alone; regex=False treats "?" literally.
has_marker = track_names.str.contains("??? /", regex=False, na=False)

# Only rows that actually carry the marker are rewritten.
tracks.loc[has_marker, "Name"] = track_names[has_marker].str.replace("??? /", "", regex=False)


## Methods

In [69]:
# Custom check used to spot mangled (mis-decoded) Unicode text.
def check_mangled_code(string):
    """Report whether ``string`` looks like mangled Unicode.

    Parameters
    ----------
    string : str
        Text to inspect. It is lower-cased and stripped of surrounding
        whitespace before checking.

    Returns
    -------
    bool
        True when the text is considered mangled (callers drop such rows),
        False when it looks acceptable. The text is considered mangled when
        it contains two question marks in a row, or when it *starts* with
        four consecutive characters outside ``[a-zA-Z0-9_]``.
    """
    normalized = string.lower().strip()

    # Two question marks in a row are treated as mangled code.
    if "??" in normalized:
        return True

    # Four non-alphanumeric characters in a row.
    # NOTE(review): re.match is anchored at the start of the string, so a run
    # later in the text is not detected. Switching to re.search would also flag
    # ordinary punctuation runs (e.g. "...,") -- confirm the intent before
    # changing it.
    mangled_prefix = re.compile("[^a-zA-Z0-9_][^a-zA-Z0-9_][^a-zA-Z0-9_][^a-zA-Z0-9_]")
    return mangled_prefix.match(normalized) is not None

In [70]:
# Sanity-check the detector on three flattened sample rows: a clean row, a row
# that starts with mangled text, and a row with a short non-ASCII run in the
# middle. The recorded output is False, True, False.
good_test = "BackstreetBoys,Blues,Pop,Millennium,1000,1999,10000"
bad_test = "Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å,Pop,Pop,Ã¦æ’â³ã£?â€žã¥â€¡âºã£?â®ã£?â€¹ã£?â€˜ã£â€šâ€°,1003,1989,100003"
another_test = "LesjaskogTrekkspillklubb,Folk,Gammeldans,TrekkspilltreffISã¸ristua,1273,1998,100282"

for sample in (good_test, bad_test, another_test):
    print(check_mangled_code(sample))

False
True
False


In [71]:
def unit_code_repair(df):
    """Find the rows of ``df`` whose flattened text looks like mangled Unicode.

    Each row is rendered as one comma-separated string with all spaces and
    tabs removed, then passed to ``check_mangled_code``. Every flagged row is
    printed so the reader can see what will be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scan. It is not modified.

    Returns
    -------
    tuple
        ``(df, rows_to_drop)``: the same ``df`` object that was passed in, and
        a list of 0-based row *positions* of the flagged rows. Callers turn
        them into labels with ``df.index[rows_to_drop]``.
    """
    rows_to_drop = []

    # enumerate() supplies the row position directly. The index label from
    # iterrows() is only a valid position when the frame has a default
    # RangeIndex, so it is not used for positional lookup.
    for position, (_, row) in enumerate(df.iterrows()):
        # 1. Flatten the row: one value per line -> comma-separated, no spaces.
        row_str = row.to_string(header=False, index=False)
        row_str = row_str.replace(" ", "").replace('\n', ',').replace('\r', ',').replace('\t', '').strip()

        # 2. Record (and show) the row when it looks mangled.
        if check_mangled_code(row_str):
            rows_to_drop.append(position)
            print(row_str)

    return df, rows_to_drop
    

## Running the method

In [72]:
# Tip: for a quick smoke test, call unit_code_repair(data.head(3)) before the
# full run and check how many rows it flags.

# Scan both tables. Each flagged row is printed as it is found; the dashed
# line separates the album rows from the track rows in the output.
clean_data, rr1 = unit_code_repair(data)
print("------------------------------------------------")
clean_tracks, rr2 = unit_code_repair(tracks)

# Summarise how many rows were flagged in each table. Nothing has been
# dropped yet at this point -- rr1 / rr2 only list the rows to drop.
for label, flagged in (("Data removed: ", rr1), ("Tracks removed: ", rr2)):
    print(label + str(len(flagged)) + " rows")

Ã¤â¸â­ã¦?â€˜ã©â€ºâ€¦ã¤â¿å,Pop,Pop,Ã¦æ’â³ã£?â€žã¥â€¡âºã£?â®ã£?â€¹ã£?â€˜ã£â€šâ€°,1003,1989,100003
Ã¥â±â±ã¥?â£ã§â„¢â¾ã¦?âµ,Jpop,Jpop,Disc13Ã¨å â±ã£?â€“ã£?â€¹ã£â€šå,1027,2003,100030
Ã¥?â€šã¦å“â¬ã§å“å¸ã§â¶â¾,Pop,Pop,Lucy,1030,2001,100034
Ã¥â±â±ã¥â´å½ã£?â¾ã£?â€¢ã£â€šë†ã£?â€”,Jpop,Jpop,Home,1047,1997,100051
Ã¤ã®ã«ã¥ã°ã¤Ã¹ã¬Ã¤ã¢ã¸ã¥ã¡,Data,Data,Ã¤ã£ã¢Ã°ã§ã¹,1049,2019,100054
Ã©ë†â´ã¦å“â¨ã£?â€šã£?â¿,Jpop,Jpop,InfinityEighteenVol.1,1052,2000,100057
Ã§å¸â¢ã¦â²â¢ã¦â°â¸ã¥?â€°,Rock,Rock,SuperLiveÃ¦â€”â¥ã¦å“â¬ã¦â­â¦ã©?â€œã©â¤â¨Disc1,1054,1990,100059
Â¿ã€ããÿâ±ã¢ãˆâ¹,ContemporaryChristianMusic,ContemporaryChristianMusic,Â»ãµâºâ®â±ã¢âµâµâ¸â¦Ã€â§ã‡ã‘Â°ã¦ã€â½â¾ã‡6,1065,2002,100071
Ã±ã¡ã®ã°ã­ã¨ãª,Pop,Pop,Absolutãã›ã‰Ã•ãˆã’2,1068,2003,100074
Ã£?â€¢ã£?â ã£?â¾ã£?â€¢ã£?â€”,Folk,Folk,Ã¤â¸â‚¬ã¤âºâºã§â„¢â¾ã¦â­å’Ã¥â·â»ã¤âºå’,1113,2019,100120
VariousArtists,Folk,Folk,Ã¤â¸â­ã¥â³â¶ã£?â¿ã£â€šâ€ ã£??SongLibrary1,1146,1997,100153
Â°ãâ·ã§â±j,Folk,Folk,Â¤sâ¨â£â¤ã«â©ãº,1177,1981,100184
Ã€ã«ã¥ãªã±ã ã­ã¤ã°Ãã®ã¢ã¨ãªã®ã¢,Fo

In [73]:
# Row count before dropping.
print(len(clean_data))

# rr1 / rr2 are used as row positions here: translate them to index labels,
# then drop in place.
# NOTE: the drop mutates the frames, so this cell is not idempotent --
# running it a second time drops whichever rows now sit at those positions.
data_labels = clean_data.index[rr1]
track_labels = clean_tracks.index[rr2]
clean_data.drop(index=data_labels, inplace=True)
clean_tracks.drop(index=track_labels, inplace=True)

# Row count after dropping.
print(len(clean_data))

7663
7458


In [74]:
# Rename the "tracks" column to "album id", the column name the later cells
# use when looking albums up.
clean_data = clean_data.rename({"tracks": "album id"}, axis="columns")

In [75]:
# Dropping rows may have introduced an album dependency failure (an album
# whose tracks are all gone), so we need to check for that.
def album_dependancy_check(df, tracks):
    """Return the album ids in ``df`` that have no matching row in ``tracks``.

    Parameters
    ----------
    df : pandas.DataFrame
        Album table. The album id is read by position from the fifth column
        (position 4).
    tracks : pandas.DataFrame
        Track table with an ``'Album Id'`` column.

    Returns
    -------
    list
        One entry per row of ``df`` (in row order, duplicates kept) whose
        album id does not occur in ``tracks['Album Id']``. Every row is
        checked, including the one with index label 0.
    """
    # Positional lookup through .iloc: plain integer keys on a labelled row
    # (row[4]) are deprecated in recent pandas.
    album_ids = df.iloc[:, 4]

    # One vectorised membership test instead of filtering `tracks` once per row.
    has_tracks = album_ids.isin(tracks['Album Id'])

    return album_ids[~has_tracks].tolist()

In [76]:
# Album ids in clean_data that have no rows left in clean_tracks.
Albumns_with_no_songs_idx = album_dependancy_check(clean_data, clean_tracks)
print("Before found: " + str(len(Albumns_with_no_songs_idx)) + " out of " + str(len(clean_data)))

# Drop every orphaned album in one step. This avoids a loop variable named
# `id`, which would shadow the builtin for the rest of the kernel session.
orphaned = clean_data['album id'].isin(Albumns_with_no_songs_idx)
clean_data.drop(index=clean_data.index[orphaned], inplace=True)

# Re-run the check to confirm no album is left without tracks.
Albumns_with_no_songs_idx_2 = album_dependancy_check(clean_data, clean_tracks)
print("After found: " + str(len(Albumns_with_no_songs_idx_2)) + " out of " + str(len(clean_data)))


Before found: 1 out of 7458
After found: 0 out of 7457


## Export to TSV

In [78]:
# Write the cleaned tables for the next stage.
# Export clean_tracks, not tracks: `tracks` only holds the cleaned rows while
# both names refer to the same object, so writing the cleaned name keeps the
# export correct even if an earlier cell starts returning a copy.
clean_data.to_csv('../../cleaning_stages/7_unit_code_repair/cddb_5.tsv', sep="\t", index=False)
clean_tracks.to_csv('../../cleaning_stages/7_unit_code_repair/cddb_tracks_2.tsv', sep="\t", index=False)