In [30]:
from collections import Counter
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

In [2]:
# --- Translation tables -----------------------------------------------------
# Flat spelling -> sharp spelling it is rewritten to.
flat_to_sharp_dict = {
    'Db': 'C#',
    'Eb': 'D#',
    'Gb': 'F#',
    'Ab': 'G#',
    'Bb': 'A#',
    'Cb': 'B',
}
# Pitch-class number -> note name (sharps listed first, then naturals).
pitch_class = {
    1: 'C#', 3: 'D#', 6: 'F#', 8: 'G#', 10: 'A#',
    0: 'C', 2: 'D', 4: 'E', 5: 'F', 7: 'G', 9: 'A', 11: 'B',
}
# Chord stems. The order matters to sepExt, which returns the first stem that
# fits: within each letter the longer spellings come before the shorter ones.
chord_base = [
    'C#m', 'C#', 'Cm', 'C',
    'D#m', 'D#', 'Dm', 'D',
    'Em', 'E',
    'F#m', 'F#', 'Fm', 'F',
    'G#m', 'G#', 'Gm', 'G',
    'A#m', 'A#', 'Am', 'A',
    'Bm', 'B',
]

# --- Input tables -----------------------------------------------------------
chord_map = pd.read_csv('ChordDistanceMap.csv', index_col="Chords")
spotify_df = pd.read_csv('playlist_tracks.csv')
tab_df = pd.read_csv('track_with_tabs.csv')

In [3]:
# Display the table read from track_with_tabs.csv (song names and their tab strings).
tab_df

Unnamed: 0,song_name,tabs
0,Back In Black,"E,D,A/C#,E,D,A/C#,E,D,A/C#,E,D,A/C#,E,D,A/C#,A..."
1,Paradise City,"G,C,F,C,G,G5,F5,C5,Bb5,C5,C5,Bb5,G,F,G,G,G,C,C..."
2,Dream On,"Fm,Fm6,Bbm6,Fm,C7sus,Fm,Fm,Fm7,Fm6,Bbm6,Fm,Fm7..."
3,Creep,"G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G..."
4,Don't Stop Believin',"E,B,C#m,A,E,B,G#m,A,E,B,C#m,A,E,B,G#m,A,E,B,C#..."
...,...,...
1940,Never Ever Worry,"A,D,A,E,A,D,A,E,A,D,A,E,A,D,A,E,A,A,E,Bm,A,A,B..."
1941,Mathilda,"C,C,C,C,D,E,F,F,F,F,E,D,C,C,F,C,G7,C,F,C,G7,C,..."
1942,Rum & Coca Cola,"F,Em,G,D,C,F,E,G,D,F,C,G,D,G,E,C,D,G,E,C,D,F,E..."
1943,London is the Place for Me,"A,F#m,Bm,E,A,A/G#,A/F#,A,F#m,Bm,E,A,A/F#,C#dim..."


In [4]:
# List the columns available in the table read from playlist_tracks.csv.
spotify_df.columns

Index(['Unnamed: 0', 'playlist_name', 'playlist_id', 'playlist_genre',
       'track_name', 'track_id', 'track_artist_name', 'track_artist_id',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'acousticness',
       'valence', 'tempo'],
      dtype='object')

In [5]:
# Keep the identifying columns plus the audio features, and rename the `key`
# column to `spotify_key` so it is distinguishable from the keys computed below.
kept_columns = [
    'track_name', 'track_id', 'track_artist_name',
    'danceability', 'energy', 'acousticness', 'valence', 'key',
]
final_df = spotify_df[kept_columns].rename(columns={'key': 'spotify_key'})

In [6]:
def convertToSharp(chord, flat_to_sharp_dict):
    """Rewrite every flat note name found in ``chord`` as its sharp spelling.

    ``chord`` is a chord-name string; ``flat_to_sharp_dict`` maps a flat
    spelling (e.g. 'Bb') to its replacement (e.g. 'A#').  The substitutions
    are applied one after another in the dict's iteration order, each on the
    result of the previous one, and the final string is returned.
    """
    converted = chord
    for flat_name, sharp_name in flat_to_sharp_dict.items():
        # str.replace returns the string unchanged when flat_name is absent.
        converted = converted.replace(flat_name, sharp_name)
    return converted

In [7]:
# Load the key table: each line of the text file becomes one list of chord
# names, split on single spaces.
keyTable = []
# `with` closes the file handle as soon as the table is read; the previous
# bare open() left it open for the lifetime of the kernel.
with open('key_table_UTF-8.txt') as keyTableFile:
    for line in keyTableFile:
        chords = line.split(' ')
        # The last field is discarded, as before -- presumably it is the line
        # terminator left behind by a trailing space (TODO confirm against the
        # file; a final line without that trailing field would lose a chord).
        keyTable.append(chords[:-1])
keyTable

[['C', 'Dm', 'Em', 'F', 'G', 'G7', 'Am', 'Bdim'],
 ['C#', 'D#m', 'Fm', 'F#', 'G#', 'G#7', 'A#m', 'Cdim'],
 ['D', 'Em', 'F#m', 'G', 'A', 'A7', 'Bm', 'C#dim'],
 ['D#', 'Fm', 'Gm', 'G#', 'A#', 'A#7', 'Cm', 'Ddim'],
 ['E', 'F#m', 'G#m', 'A', 'B', 'B7', 'C#m', 'D#dim'],
 ['F', 'Gm', 'Am', 'A#', 'C', 'C7', 'Dm', 'Edim'],
 ['F#', 'G#m', 'A#m', 'B', 'C#', 'C#7', 'D#m', 'Fdim'],
 ['G', 'Am', 'Bm', 'C', 'D', 'D7', 'Em', 'F#dim'],
 ['G#', 'A#m', 'Cm', 'C#', 'D#', 'D#7', 'Fm', 'Gdim'],
 ['A', 'Bm', 'C#m', 'D', 'E', 'E7', 'F#m', 'G#dim'],
 ['A#', 'Cm', 'Dm', 'D#', 'F', 'F7', 'Gm', 'Adim'],
 ['B', 'C#m', 'D#m', 'E', 'F#', 'F#7', 'G#m', 'A#dim']]

In [8]:
"""
Ben Ma
Python 3.x
Contains the utility function findTonicNumNo7.
"""

import copy #for deep copy

def findTonicNumNo7(songChords, keyTable): #songChords is a list, keyTable is a list of lists
    """Return the index of the keyTable row that best matches the song.

    Every "7" is first removed from each chord name (so "G7" is compared as
    "G").  Each chord equal to an entry of a key row adds to that row's score:
    0.9 when the entry sits at position 1, 2 or 7 of the row (the ii, iii and
    vii), 1 otherwise.  The row with the strictly highest score wins, so the
    earliest row is kept on a tie and 0 is returned when nothing matches.
    The input list is left unmodified.
    """
    reducedWeightSlots = (1, 2, 7)  # tiebreaker: ii, iii and vii count for less
    strippedChords = [name.replace("7", "") for name in songChords]

    bestKey = 0  # 0 thru 11 for C thru B
    bestScore = 0
    for keyIndex, keyChords in enumerate(keyTable):
        # Weight contributed by each chord of this key.  Only the first
        # occurrence of a chord within the row is recorded, which mirrors
        # stopping the scan at the first match.
        weightOf = {}
        for slot, keyChord in enumerate(keyChords):
            if keyChord not in weightOf:
                weightOf[keyChord] = 0.9 if slot in reducedWeightSlots else 1

        # Plain running total in song order (kept as an explicit loop so the
        # floating-point additions happen in exactly the same sequence).
        score = 0
        for name in strippedChords:
            if name in weightOf:
                score += weightOf[name]

        if score > bestScore:
            bestScore = score
            bestKey = keyIndex
    return bestKey  # key with the highest score for the chords in the song

In [9]:
def convertNoteToNum(note, pitch_class, chord_base):
    """Return the pitch_class key whose note name is the root of ``note``.

    The chord is reduced to its stem with sepExt, every 'm' is removed from
    that stem, and the key of the first pitch_class entry whose value equals
    the result is returned.  None is returned when no value matches.
    """
    stem, _ext = sepExt(note, chord_base)
    root = stem.replace('m', '')
    return next((number for number, name in pitch_class.items() if root == name), None)
    

In [10]:
def sepExt(chord, chord_base):
    """Split a chord name into its stem and the remaining extension.

    Parameters
    ----------
    chord : str
        Chord name such as 'Am7', 'Fmaj7' or 'A/C#'.
    chord_base : iterable of str
        Candidate stems, tried in order; the first one that fits is used, so
        longer stems ('C#m') must be listed before their prefixes ('C#', 'C').

    Returns
    -------
    tuple of (str, str) or None
        ``(stem, ext)`` where ``ext`` is what is left of ``chord`` once the
        stem is removed, or None when no stem occurs in ``chord`` at all.

    Notes
    -----
    A stem is skipped when the text following it starts with 'aj': that is a
    minor stem such as 'Fm' wrongly matching a major-seventh name like
    'Fmaj7', and the plain stem 'F' is picked up later in the scan instead.
    """
    # Preferred match: the stem must sit at the very start of the chord, and
    # only that leading occurrence is removed.  A substring test would let the
    # bass note of 'Am/C' (or the 'C#' of 'A/C#') be taken for the root
    # whenever it appears earlier in chord_base.
    for note in chord_base:
        if chord.startswith(note):
            ext = chord[len(note):]
            if ext[:2] == 'aj':
                continue
            return note, ext

    # Fallback: the original "stem occurs anywhere" search, kept so that
    # tokens which do not begin with a known stem resolve exactly as before
    # instead of starting to fail in the callers.
    for note in chord_base:
        if note in chord:
            ext = chord.replace(note, '')
            if ext[:2] == 'aj':
                continue
            return note, ext

    return None
                


In [11]:
# Turn each song's comma-separated tab string into a list of chords in sharp
# spelling.  Two views are kept: one flat list of every chord token, and a
# per-song lookup keyed by song name (a repeated name overwrites the earlier
# entry).
chords_token = []
song_tab_dict = {}

for song_name, raw_tabs in zip(tab_df['song_name'], tab_df['tabs']):
    sharp_tabs = [convertToSharp(token, flat_to_sharp_dict) for token in raw_tabs.split(',')]
    chords_token.extend(sharp_tabs)
    song_tab_dict[song_name] = sharp_tabs

# Distinct chord spellings seen across all songs.
chord_types = list(set(chords_token))
token_total = len(chords_token)
type_total = len(chord_types)
print('# of Chord Tokens: {}'.format(token_total))
print('# of Chord Types: {}'.format(type_total))
#print(chord_types)

# of Chord Tokens: 185024
# of Chord Types: 378


In [12]:
# For every song that has tabs, attach to the tracks of the same name: the key
# estimated from the whole chord list, the pitch class of the first chord, and
# the cleaned tab string.
for song, song_tabs in song_tab_dict.items():
    rows = final_df.track_name == song  # one mask serves all three columns
    final_df.loc[rows, 'greer_key'] = findTonicNumNo7(song_tabs, keyTable)
    final_df.loc[rows, 'first_note_key'] = convertNoteToNum(song_tabs[0], pitch_class, chord_base)
    final_df.loc[rows, 'clean_tabs'] = ','.join(song_tabs)

# Row-wise agreement between the key estimates, stored as plain boolean arrays.
final_df['spotify_greer_match'] = (final_df["spotify_key"] == final_df["greer_key"]).values
final_df['greer_firstNote_match'] = (final_df["first_note_key"] == final_df["greer_key"]).values
# Discard every track that still has a missing value in any column.
final_df = final_df.dropna()
final_df

Unnamed: 0,track_name,track_id,track_artist_name,danceability,energy,acousticness,valence,spotify_key,greer_key,first_note_key,clean_tabs,spotify_greer_match,greer_firstNote_match
0,Back In Black,08mG3Y1vljYA6bvDt4Wqkj,AC/DC,0.310,0.700,0.0110,0.763,9,4.0,4.0,"E,D,A/C#,E,D,A/C#,E,D,A/C#,E,D,A/C#,E,D,A/C#,A...",False,True
1,Paradise City,3YBZIN3rekqsKxbJc9FZko,Guns N' Roses,0.273,0.952,0.0169,0.472,11,0.0,7.0,"G,C,F,C,G,G5,F5,C5,A#5,C5,C5,A#5,G,F,G,G,G,C,C...",False,False
3,Dream On,5MxNLUsfh7uzROypsoO5qe,Aerosmith,0.307,0.433,0.3880,0.224,1,8.0,5.0,"Fm,Fm6,A#m6,Fm,C7sus,Fm,Fm,Fm7,Fm6,A#m6,Fm,Fm7...",False,False
4,Creep,70LcF31zb1H0PyJoS1Sx1r,Radiohead,0.515,0.430,0.0097,0.104,7,0.0,7.0,"G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G,B,C,Cm,G...",False,False
5,Don't Stop Believin',4bHsxqR3GMrXTxEPLuK5ue,Journey,0.500,0.748,0.1270,0.514,4,4.0,4.0,"E,B,C#m,A,E,B,G#m,A,E,B,C#m,A,E,B,G#m,A,E,B,C#...",True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2794,Never Ever Worry,4Acjs07xTgVV4ql6lYoZDP,Lord Pretender,0.690,0.708,0.3930,0.964,10,9.0,9.0,"A,D,A,E,A,D,A,E,A,D,A,E,A,D,A,E,A,A,E,Bm,A,A,B...",False,True
2795,Mathilda,1kP9Mmmcl0tGpqKeOBHZ0b,King Radio,0.625,0.353,0.8670,0.925,0,0.0,0.0,"C,C,C,C,D,E,F,F,F,F,E,D,C,C,F,C,G7,C,F,C,G7,C,...",True,True
2798,Rum & Coca Cola,1sJIFhjOaBmlodsqybKqiF,Calypso Rose,0.659,0.331,0.8210,0.852,0,7.0,5.0,"F,Em,G,D,C,F,E,G,D,F,C,G,D,G,E,C,D,G,E,C,D,F,E...",False,False
2799,London is the Place for Me,1pETAnznla509bUKb34xnS,Lord Kitchener,0.653,0.371,0.8940,0.738,2,9.0,9.0,"A,F#m,Bm,E,A,A/G#,A/F#,A,F#m,Bm,E,A,A/F#,C#dim...",False,True


In [13]:
# Fraction of tracks whose chord-based key equals Spotify's key, followed by
# the fraction whose first-chord key equals the chord-based key.
for match_column in ('spotify_greer_match', 'greer_firstNote_match'):
    print(final_df[match_column].mean())

0.23266806722689076
0.36869747899159666


In [14]:
# Spot-check the computed keys for one well-known track.
is_target_track = final_df['track_name'] == 'Smells Like Teen Spirit'
final_df[is_target_track]

Unnamed: 0,track_name,track_id,track_artist_name,danceability,energy,acousticness,valence,spotify_key,greer_key,first_note_key,clean_tabs,spotify_greer_match,greer_firstNote_match
150,Smells Like Teen Spirit,5ghIJDpPoe3CfHMGu71E6T,Nirvana,0.502,0.912,2.5e-05,0.72,1,4.0,4.0,"E,A,G,C,E,A,G,C,E,A,G,C,E,A,G,C,E,A,G,C,E,A,G,...",False,True


In [15]:
def buildNormChord(center, chord, chord_map, chord_base):
    """Express one chord relative to a tonal centre.

    Parameters
    ----------
    center : str
        Note name of the tonal centre; first-level key of ``chord_map``.
    chord : str
        Chord name, e.g. 'Am7'.
    chord_map : mapping
        Two-level lookup; ``chord_map[center][root]`` is the distance recorded
        for ``root`` under ``center``.
    chord_base : list of str
        Known chord stems, handed to sepExt to split the chord.

    Returns
    -------
    tuple of (str, str, str)
        ``(full, partial, bare)``.  ``bare`` is the distance as text;
        ``partial`` is ``bare`` followed by 'X1' for a major stem or 'X0' for
        a minor stem; ``full`` is ``partial`` followed by 'X' and the
        extension when the chord has one, otherwise it equals ``partial``.
    """
    chord_stem, ext = sepExt(chord, chord_base)

    # The quality has to be read from the stem itself.  It used to be tested
    # on the root *after* every 'm' had been stripped, which can never end in
    # 'm', so every chord was flagged as major.
    isMajor = '0' if chord_stem.endswith('m') else '1'

    chord_root = chord_stem.replace('m', '')
    distance = chord_map[center][chord_root]

    norm_chord_bare = str(distance)
    norm_chord_partial = norm_chord_bare + 'X{}'.format(isMajor)
    if len(ext) > 0:
        norm_chord_full = norm_chord_partial + 'X{}'.format(ext)
    else:
        norm_chord_full = norm_chord_partial

    return norm_chord_full, norm_chord_partial, norm_chord_bare

In [16]:
"""
#chord_types = ['D#maj']
chord_exts = []
for chord_type in chord_types:
    if '/' in chord_type:
        chord_type_split = chord.split('/')
    else:
        chord_type_split = [chord_type]
    for chord in chord_type_split:
        for note in chord_base:
            if note in chord:
                #print('{}, {}'.format(note, chord))
                if chord.replace(note, '')[:2] == 'aj':
                    continue
                else:
                    ext = chord.replace(note, '')
                    if ext not in chord_exts:
                        #print('{}, {}, {}, {}'.format(chord, note, chord[len(note):len(note)+2], ext))
                        chord_exts.append(ext)
                    #chord_exts.append(chord.replace(note, ''))
                    break

            #pattern = note+'[^aA]?.*'
            #match = re.findall(pattern, chord)
            #if len(match) > 0:
            #    break

#print(set(chord_exts))
#print(len(list(set(chord_exts))))
"""

"\n#chord_types = ['D#maj']\nchord_exts = []\nfor chord_type in chord_types:\n    if '/' in chord_type:\n        chord_type_split = chord.split('/')\n    else:\n        chord_type_split = [chord_type]\n    for chord in chord_type_split:\n        for note in chord_base:\n            if note in chord:\n                #print('{}, {}'.format(note, chord))\n                if chord.replace(note, '')[:2] == 'aj':\n                    continue\n                else:\n                    ext = chord.replace(note, '')\n                    if ext not in chord_exts:\n                        #print('{}, {}, {}, {}'.format(chord, note, chord[len(note):len(note)+2], ext))\n                        chord_exts.append(ext)\n                    #chord_exts.append(chord.replace(note, ''))\n                    break\n\n            #pattern = note+'[^aA]?.*'\n            #match = re.findall(pattern, chord)\n            #if len(match) > 0:\n            #    break\n\n#print(set(chord_exts))\n#print(len(list(

In [17]:
# Read the chord-distance lookup table (rows labelled by the "Chords" column)
# and display it.
chord_map = pd.read_csv('ChordDistanceMap.csv', index_col="Chords")
chord_map

Unnamed: 0_level_0,C,C#,D,D#,E,F,F#,G,G#,A,A#,B
Chords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C,0,11,10,9,8,7,6,5,4,3,2,1
C#,1,0,11,10,9,8,7,6,5,4,3,2
D,2,1,0,11,10,9,8,7,6,5,4,3
D#,3,2,1,0,11,10,9,8,7,6,5,4
E,4,3,2,1,0,11,10,9,8,7,6,5
F,5,4,3,2,1,0,11,10,9,8,7,6
F#,6,5,4,3,2,1,0,11,10,9,8,7
G,7,6,5,4,3,2,1,0,11,10,9,8
G#,8,7,6,5,4,3,2,1,0,11,10,9
A,9,8,7,6,5,4,3,2,1,0,11,10


In [24]:
def buildTabDict(center_col):
    """Normalise every song's chords relative to that song's tonal centre.

    Reads the notebook-level ``song_tab_dict``, ``final_df``, ``pitch_class``,
    ``chord_map`` and ``chord_base``.

    Parameters
    ----------
    center_col : str
        Name of the ``final_df`` column holding each track's pitch-class
        number (a key of ``pitch_class``).

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(full, partial, bare)``; each maps a song name to the list of its
        chords in the corresponding buildNormChord form.  For a slash chord
        the first two '/'-separated parts are normalised separately and
        re-joined with '/'.

    Notes
    -----
    A song is skipped when ``final_df`` has no row with its name, or when the
    value found in ``center_col`` is not a key of ``pitch_class`` (e.g. NaN).
    Any other failure, such as a wrong ``center_col``, is raised rather than
    silently producing empty results.
    """
    tab_full_dict = {}
    tab_partial_dict = {}
    tab_bare_dict = {}
    for song, song_chords in song_tab_dict.items():
        # The mask is built from final_df itself.  It used to come from
        # spotify_df and only worked through pandas' index alignment.
        center_values = final_df.loc[final_df.track_name == song, center_col]
        if len(center_values) == 0:
            continue  # song has tabs but no surviving track row
        center = pitch_class.get(center_values.iloc[0])
        if center is None:
            continue  # no usable tonal centre recorded for this track

        full_list = []
        partial_list = []
        bare_list = []
        for chord in song_chords:
            # A plain chord yields one part; a slash chord yields the chord
            # and its bass note (anything after a second '/' is ignored, as
            # before).
            parts = chord.split('/')[:2]
            variants = [buildNormChord(center, part, chord_map, chord_base) for part in parts]
            full_parts, partial_parts, bare_parts = zip(*variants)
            full_list.append('/'.join(full_parts))
            partial_list.append('/'.join(partial_parts))
            bare_list.append('/'.join(bare_parts))

        tab_full_dict[song] = full_list
        tab_partial_dict[song] = partial_list
        tab_bare_dict[song] = bare_list
    return tab_full_dict, tab_partial_dict, tab_bare_dict


In [25]:
# Build the three normalised chord dictionaries, using each track's
# `first_note_key` column as its tonal centre.
tab_full_dict, tab_partial_dict, tab_bare_dict = buildTabDict('first_note_key')

In [32]:
def trainTestModel(model, dict_to_model, pred_attribute, ngram_low, ngram_high, binary, train_perc = 0.8, token_pattern = r'(?u)\S+'):
    """Fit ``model`` on chord n-gram counts and return its test-set MAE.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    dict_to_model : dict
        Song name -> list of normalised chord strings.
    pred_attribute : str
        Column of the notebook-level ``spotify_df`` used as the target.  The
        first track whose ``track_name`` equals the song name supplies it.
    ngram_low, ngram_high : int
        Bounds of the n-gram range given to CountVectorizer.
    binary : bool
        Passed to CountVectorizer: presence/absence instead of counts.
    train_perc : float, default 0.8
        Share of the songs used for training.  The split is positional: the
        first ``train_perc`` of the dict's iteration order trains, the rest
        tests; nothing is shuffled.
    token_pattern : str, default r'(?u)\\S+'
        Regular expression defining a token.  The default treats every
        whitespace-separated chord as one token.

    Returns
    -------
    float
        Mean absolute error of the predictions on the held-out songs.

    Raises
    ------
    ValueError
        If the split leaves the training or the test side empty.
    IndexError
        If a song name has no matching row in ``spotify_df``.
    """
    tab_X = []
    tab_y = []
    for song, chords in dict_to_model.items():
        tab_X.append(' '.join(chords))
        tab_y.append(spotify_df.loc[spotify_df.track_name == song, pred_attribute].iloc[0])

    train_break = int(len(tab_X)*train_perc)
    tab_X_train, tab_X_test = tab_X[:train_break], tab_X[train_break:]
    tab_y_train, tab_y_test = tab_y[:train_break], tab_y[train_break:]
    if not tab_X_train or not tab_X_test:
        raise ValueError(
            'train/test split of {} songs at train_perc={} leaves one side empty'.format(len(tab_X), train_perc))

    # token_pattern is set explicitly because CountVectorizer's default only
    # keeps tokens of two or more word characters and treats punctuation as a
    # separator: single-digit chords such as '0' or '7' were dropped, and
    # chords containing '/' or '#' were split into pieces.
    vectorizer = CountVectorizer(max_features=10000, ngram_range=(ngram_low,ngram_high), lowercase=True,
                                 strip_accents=None, binary=binary, token_pattern=token_pattern)
    X_train = vectorizer.fit_transform(tab_X_train)
    X_test = vectorizer.transform(tab_X_test)

    model.fit(X_train, tab_y_train)
    pred = model.predict(X_test)
    # Canonical argument order is (y_true, y_pred); MAE is symmetric, so the
    # value is the same as with the arguments swapped.
    mae = mean_absolute_error(tab_y_test, pred)
    return mae


In [35]:
# Sweep every combination of model, chord representation, target attribute,
# n-gram range and binary flag, recording the test MAE of each run.
RESULT_COLUMNS = ['ModelType', 'CasterType', 'PredAttribute', 'LowN', 'HighN', 'Binary', 'MAE']
RANDOM_SEED = 42  # pins the forest's randomness so the MAE values are repeatable

ols_model = LinearRegression()
rf_model = RandomForestRegressor(min_samples_leaf=5, random_state=RANDOM_SEED)

models = {'OLS':ols_model, 'RandomForest':rf_model}
model_dicts = {'Full': tab_full_dict, 'Partial': tab_partial_dict, 'Bare': tab_bare_dict}
nGramSize = 5
binary = [True, False]
pred_attributes = ['danceability', 'energy', 'acousticness', 'valence']

# Every (low, high) pair with 1 <= low <= high <= nGramSize, in the same order
# the nested loops used to visit them.
ngram_ranges = [(n_low, n_high) for n_high in range(1, nGramSize+1) for n_low in range(1, n_high+1)]

# Rows are collected in a list and turned into a frame once.  Growing a
# DataFrame with .append() copies it on every run, and the method no longer
# exists in pandas 2.
result_rows = []
model_cnt = 0
try:
    for model in models:
        for model_dict in model_dicts:
            for attribute in pred_attributes:
                for n_low, n_high in ngram_ranges:
                    for item in binary:
                        mae = trainTestModel(models[model], model_dicts[model_dict], attribute, n_low, n_high, item)
                        result_rows.append([model, model_dict, attribute, n_low, n_high, item, mae])
                        model_cnt += 1
                        print(model_cnt)
finally:
    # Built in `finally` so that an interrupted or failed sweep still leaves
    # the runs completed so far in result_df.
    result_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
                    
#result_df
                
 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [36]:
# Save the sweep results next to the notebook.
result_df.to_csv('FirstDraftResults.csv')

In [None]:
# Best run (lowest MAE) for each predicted attribute, shown as one complete
# row per attribute.  groupby(...).min() took the minimum of every column
# independently, so the model type, n-gram bounds and MAE it displayed could
# come from different runs.
(
    result_df
    .sort_values('MAE')
    .drop_duplicates('PredAttribute')  # keeps the first, i.e. lowest-MAE, row
    .set_index('PredAttribute')
    .sort_index()
)

In [None]:
# Print each held-out song next to its predicted and actual target value.
# NOTE(review): `model_in_use`, `train_break`, `pred` and `tab_y_test` are not
# assigned at notebook level in any visible cell (the last three exist only as
# locals of trainTestModel), so this cell presumably relied on state from an
# earlier session -- confirm before running on a fresh kernel.
test_dict = list(model_in_use.items())[train_break:]
for i in range(len(pred)):
    print('({}) Pred: {} Actual: {}'.format(test_dict[i][0], pred[i], tab_y_test[i]))

In [None]:
# Show the fitted vocabulary of the vectorizer.
# NOTE(review): in the visible cells `vectorizer` is only created inside
# trainTestModel and never bound at notebook level -- confirm where this name
# is expected to come from.
vectorizer.vocabulary_

In [None]:
def analyze_weights(learned_model, vocab, num_to_print, printZero=True):
    """Print the largest and the smallest coefficients of a fitted model.

    ``learned_model`` must expose ``coef_``; ``vocab`` maps a feature name to
    its column index.  The ``num_to_print`` highest weights are printed from
    highest downwards, then a blank line, then the ``num_to_print`` lowest
    weights from lowest upwards.  Each line is the weight with five decimals,
    a tab, and the feature name.  Weights equal to zero are left out unless
    ``printZero`` is true.
    """
    feature_name = {index: term for term, index in vocab.items()}
    weights = learned_model.coef_
    ascending = np.argsort(weights)

    def show(indices):
        # One "<weight>\t<feature>" line per index, honouring printZero.
        for idx in indices:
            weight = weights[idx]
            if printZero or weight != 0:
                print ("%.5f\t%s" % (weight, feature_name[idx] ))

    show(ascending[-num_to_print:][::-1])
    print()
    show(ascending[:num_to_print])


In [None]:
# Count the non-zero coefficients of `model`, then print its 5 highest and
# 5 lowest non-zero weights.
# NOTE(review): at notebook level `model` is last bound by the sweep loop
# `for model in models:`, i.e. to a key of the `models` dict rather than to an
# estimator, and `vectorizer` is only created inside trainTestModel --
# presumably this cell relied on state from an earlier session; confirm which
# fitted model and vectorizer are meant before running.
count=0
for val in model.coef_:
    count+=1 if val != 0 else 0

print("Nonzero features: %s\n" % count)
analyze_weights(model, vectorizer.vocabulary_, 5, printZero=False)