# Data pre-processing notebook

This notebook pre-processes the data used throughout our project. Each step that was followed is described in detail below.

In [1]:
import pandas as pd
import os
import json
import numpy as np
import unidecode
from tqdm import tqdm
from music21 import *
import editdistance
import pickle

## Scores filtering

We start by keeping only the scores for which a lyrics file was provided. To this end, we use the metadata CSV file, which keeps track of each score XML file and its respective lyrics file.

In [3]:
# Path of one transliterated lyrics file.
# NOTE(review): `data` is not used by the other cells shown here - presumably it was loaded
# only to inspect the structure of a lyrics file; confirm before removing.
test = 'arab-andalusian-music/lyrics/transliterated/json/0d1d3cff-e68e-403d-bb3f-caca302bf5ab.json'

# The encoding is given explicitly so that reading the (non-ASCII) lyrics does not depend
# on the platform's default encoding.
with open(test, "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Metadata table: one row per recording (MBID, titles, archive URL, LYRICS, notes)
metadata = pd.read_csv('arab-andalusian-music/metadata-all-nawbas.csv')

In [5]:
# Preview of the metadata table.
# NOTE(review): the stored output below also lists rows 109-112, which `head()` would not
# return - it looks like the output of displaying `metadata` itself; re-run to refresh it.
metadata.head()

Unnamed: 0,RECORDING_MBID,RECORDING_TITLE,RECORDING_TITLE_TRANSLITERATED,RECORDING_INTERNET_ARCHIVE_URL,LYRICS,MISC_NOTES
0,04b42450-8838-4a20-9e79-d42f03f8cc51,قدام الاصبهان,quddām al-iṣbahān,https://archive.org/details/TetouanOrchestra_S...,,
1,263f2386-1081-4307-b686-1c99c2f31fa1,قدام الاصبهان,quddām al-iṣbahān,https://archive.org/details/OrchestraOfTheTeto...,,
2,59fcb1d1-61f4-4cdc-9f7f-56757bf2400e,قدام الاصبهان,quddām al-iṣbahān,https://archive.org/details/BrihiOrchestra_RTM...,,
3,36822512-b1c8-42aa-9e55-fe9d10560371,بسيط الاصبهان,basīṭ al-iṣbahān,https://archive.org/details/BrihiOrchestra_RTM...,,
4,689a2401-00b8-44f5-8107-76bfe75a36fe,قدام الاصبهان,quddām al-iṣbahān,https://archive.org/details/RTMOrchestra_RTM19...,,
...,...,...,...,...,...,...
109,e22549ae-4a0c-43ef-87f4-e0f81ed49d58,قائم ونصف الاستهلال,qā’im waniṣf al-istihlal,https://archive.org/details/RTMOrchestra_RTM19...,,
110,f461045b-50bc-4b20-a731-66fbd3a264ae,قدام الاستهلال,quddām al-istihlal,https://archive.org/details/OrchestraOfTheTeto...,,
111,9b546274-eea6-459f-a0c2-918f0997fa2b,بطايحي الاستهلال,bṭāyḥī al-istihlal,https://archive.org/details/OrchestraOfTheTeto...,not available,
112,97223154-d5c2-4c37-8e6c-4c998056a674,بطايحي الاستهلال,bṭāyḥī al-istihlal,https://archive.org/details/TetouanOrchestra_S...,not available,


In [6]:
# Keep only the recordings (identified by their MBID) whose lyrics are available.
# In the metadata table, LYRICS is empty (NaN) for those rows and holds a note such as
# 'not available' for the others.

final_metadata = metadata.loc[metadata['LYRICS'].isnull()]

## Score to lyrics matching step

Now that we have selected the right scores to work on, we match each poem annotation in each score file to its corresponding poem in the lyrics file. To do so, we combine a longest-common-substring function with the edit distance between the poem annotation from the score and the first lines of the poem in the lyrics file.

In [9]:
# Function to select only the mbid name from a filename 

def get_mbid_name(filename):
    """Return the mbid name contained in a file name.

    Everything from the first '.json' onwards is dropped, then only the part
    after the last '/' is kept.
    """
    without_extension = filename.partition('.json')[0]
    return without_extension.rpartition('/')[2]

In [10]:
# Function returning the longest common sub-string of two strings

def lcs(S,T):
    """Return every longest common substring of S and T.

    The result is a list without duplicates (in no particular order); it is
    empty when the two strings share no character.
    """
    rows, cols = len(S), len(T)
    # table[a + 1][b + 1] = length of the common run ending at S[a] and T[b]
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    best_length = 0
    best_substrings = set()

    for a, char_s in enumerate(S):
        for b, char_t in enumerate(T):
            if char_s != char_t:
                continue

            run = table[a][b] + 1
            table[a + 1][b + 1] = run

            if run < best_length:
                continue
            if run > best_length:
                # strictly longer run: forget the shorter candidates
                best_length = run
                best_substrings = set()
            best_substrings.add(S[a - run + 1:a + 1])

    return list(best_substrings)

In [11]:
# Function that carries out the matching step

def get_matching(mbid_name):
    """Match each text annotation of a score to a poem of its lyrics file (dict format).

    NOTE: a later cell of this notebook defines another ``get_matching`` that
    returns a list; once that cell has run, it replaces this definition.

    Parameters
    ----------
    mbid_name : str
        Base name (without extension) shared by the score in
        'arab-andalusian-music/scores-musicxml/' and the lyrics file in
        'arab-andalusian-music/lyrics/transliterated/json/'.

    Returns
    -------
    dict
        Maps each annotation text to ``[identifier, poem_index, measure_number]``.
        Skipped or unmatched annotations map to
        ``['Not Retrieved/Instrumental', None, measure_number]``.
        Annotations with the same text share one key, so only the last one is kept.
    """

    # a candidate poem needs a common substring of at least this many characters
    min_common_length = 3
    # score given to discarded candidates; only scores strictly below it count as a match
    discard_score = 100

    # loads the xml file
    test_score = converter.parse('arab-andalusian-music/scores-musicxml/'+mbid_name+'.xml')

    # list of the annotations that will be skipped during the matching step (instrumentals or musical notations)
    list_forbidden = ['accel.', 'rit.', 'MSHALIA', 'Tawshiyya', 'INSHAD', 'Tawshiya']

    # we keep track of every TextExpression annotation of the score and its measure number
    list_contents = []
    for expression in test_score.recurse().getElementsByClass(expressions.TextExpression):
        list_contents.append([expression.content, expression.measureNumber])

    # we open the corresponding lyrics file (explicit encoding: the content is not plain ASCII)
    test_lyrics_path = 'arab-andalusian-music/lyrics/transliterated/json/'+mbid_name+'.json'

    with open(test_lyrics_path, "r", encoding="utf-8") as read_file:
        test_lyrics = json.load(read_file)

    # we create the final dictionary containing the matching results
    matching = {}

    # for each annotation extracted from the music score
    for content, measure_number in list_contents:

        # we discard elements that we chose to skip
        if content in list_forbidden:
            matching[content] = ['Not Retrieved/Instrumental', None, measure_number]
            continue

        split_expression = content.split(' ')
        edit_distances = []

        # we iterate over the lyrics file's content
        for poem in test_lyrics:

            # the identifier is transliterated to plain ASCII before the comparison
            lyrics = unidecode.unidecode(poem['identifier'])

            # length of the longest common substring (computed once per candidate)
            common = lcs(lyrics, content)
            max_sub_string = len(max(common, key=len)) if common else 0

            # threshold on the lcs: too little in common, the candidate is discarded
            if max_sub_string < min_common_length:
                edit_distances.append(discard_score)
                continue

            # similarity = smallest word-to-word edit distance + 1/lcs
            # (the 1/lcs term favours the candidate with the longer common substring)
            closest = min(
                editdistance.eval(lyrics_word, expression_word)
                for lyrics_word in lyrics.split(' ')
                for expression_word in split_expression
            )
            edit_distances.append(closest + 1/max_sub_string)

        # if the best similarity is under the discard score, we save the identifier of the
        # poem, its index in the lyrics file and the measure number of the annotation.
        # An empty lyrics file leaves nothing to compare with: the annotation is unmatched.
        if edit_distances and np.min(edit_distances) < discard_score:
            best_index = np.argmin(edit_distances)
            matching[content] = [test_lyrics[best_index]['identifier'], best_index, measure_number]
        else:
            matching[content] = ['Not Retrieved/Instrumental', None, measure_number]

    return matching

In [12]:
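# Same matching procedure as in the previous cell, but the results are returned as a list of rows
# instead of a dict. This cell reuses the name `get_matching`, so it replaces the earlier definition.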

def get_matching(mbid_name):
    """Match each text annotation of a score to a poem of its lyrics file (list format).

    Parameters
    ----------
    mbid_name : str
        Base name (without extension) shared by the score in
        'arab-andalusian-music/scores-musicxml/' and the lyrics file in
        'arab-andalusian-music/lyrics/transliterated/json/'.

    Returns
    -------
    list
        One row per annotation, in the order the annotations are found in the score:
        ``[content, identifier, poem_index, measure_number]``. Skipped or unmatched
        annotations give ``[content, 'Not Retrieved/Instrumental', None, measure_number]``.
    """

    # a candidate poem needs a common substring of at least this many characters
    min_common_length = 3
    # score given to discarded candidates; only scores strictly below it count as a match
    discard_score = 100

    test_score = converter.parse('arab-andalusian-music/scores-musicxml/'+mbid_name+'.xml')

    # annotations that are skipped during the matching step
    list_forbidden = ['accel.', 'rit.', 'MSHALIA', 'Tawshiyya', 'INSHAD', 'Tawshiya']

    # every TextExpression annotation of the score, with its measure number
    list_contents = []
    for expression in test_score.recurse().getElementsByClass(expressions.TextExpression):
        list_contents.append([expression.content, expression.measureNumber])

    # corresponding lyrics file (explicit encoding: the content is not plain ASCII)
    test_lyrics_path = 'arab-andalusian-music/lyrics/transliterated/json/'+mbid_name+'.json'

    with open(test_lyrics_path, "r", encoding="utf-8") as read_file:
        test_lyrics = json.load(read_file)

    matching = []

    for content, measure_number in list_contents:

        # skipped annotations are still reported, flagged as not retrieved
        if content in list_forbidden:
            matching.append([content,'Not Retrieved/Instrumental', None, measure_number])
            continue

        split_expression = content.split(' ')
        edit_distances = []

        for poem in test_lyrics:

            # the identifier is transliterated to plain ASCII before the comparison
            lyrics = unidecode.unidecode(poem['identifier'])

            # length of the longest common substring (computed once per candidate)
            common = lcs(lyrics, content)
            max_sub_string = len(max(common, key=len)) if common else 0

            # too little in common: the candidate is discarded with a high score
            if max_sub_string < min_common_length:
                edit_distances.append(discard_score)
                continue

            # similarity = smallest word-to-word edit distance + 1/lcs
            # (the 1/lcs term favours the candidate with the longer common substring)
            closest = min(
                editdistance.eval(lyrics_word, expression_word)
                for lyrics_word in lyrics.split(' ')
                for expression_word in split_expression
            )
            edit_distances.append(closest + 1/max_sub_string)

        # An empty lyrics file leaves nothing to compare with: the annotation is unmatched.
        if edit_distances and np.min(edit_distances) < discard_score:
            best_index = np.argmin(edit_distances)
            matching.append([content, test_lyrics[best_index]['identifier'], best_index, measure_number])
        else:
            matching.append([content,'Not Retrieved/Instrumental', None, measure_number])

    return matching

In [13]:
def clean_sections(list_sections):
    """Flatten a list of sequences, leaving out the first element of each one."""
    return [
        list_sections[row][position]
        for row in range(len(list_sections))
        for position in range(1, len(list_sections[row]))
    ]

In [None]:
# The code below calls the matching function and, for each score, saves the retrieved poems
# (matching results and lyrics) as a pickled list in outputs/<mbid>.txt


# we iterate over the filtered scores
# the output folder must exist before the first result is written
os.makedirs("outputs", exist_ok=True)

# we iterate over the filtered scores
for mbid_name in tqdm(final_metadata['RECORDING_MBID']):

    test_original_path = 'arab-andalusian-music/lyrics/original/json/'+mbid_name+'.json'
    test_lyrics_path = 'arab-andalusian-music/lyrics/transliterated/json/'+mbid_name+'.json'
    score_path = 'arab-andalusian-music/scores-musicxml/'+mbid_name+'.xml'
    output_path = "outputs/"+mbid_name+".txt"

    # check if all the necessary files are available
    if not (os.path.exists(test_original_path) and os.path.exists(test_lyrics_path) and os.path.exists(score_path)):
        continue

    # if the file has been created already, we skip it
    if os.path.exists(output_path):
        continue

    # call to the matching function (list format: the poem index is the third item of a row)
    matching = get_matching(mbid_name)

    # opens both lyrics files to get the lyrics of the poems that were found
    # (explicit encoding: the lyrics are not plain ASCII)
    with open(test_original_path, "r", encoding="utf-8") as read_file:
        test_original = json.load(read_file)

    with open(test_lyrics_path, "r", encoding="utf-8") as read_file:
        test_lyrics = json.load(read_file)

    for row in matching:
        index = row[2]

        # rows without a matched poem are left as they are
        if index is None:
            continue

        # appended in this order: original sections, transliterated sections, 'poem' field
        for poems in (test_original, test_lyrics):
            sections = poems[index]['sections']
            if sections == []:
                row.append('')
            else:
                row.append(clean_sections(sections[0]))

        row.append(test_lyrics[index]['poem'])

    # creates and saves the final file with all the data (a pickle, despite the .txt extension)
    with open(output_path, "wb") as fp:
        pickle.dump(matching, fp)

    print(mbid_name, 'ok')