In [89]:
import pandas as pd
from tqdm import tqdm
from scipy.sparse import dok_matrix
import os
import sys
from stemming.porter2 import stem
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
def lyrics_to_bow(lyrics):
    """
    Main function to stem and create bag of words.
    It is what we used for the musiXmatch dataset.
    It is heavily oriented towards English lyrics, we apologize for that.
    INPUT
        lyrics as a string
    RETURN
        dictionary stemmed word -> count
        or None if three or fewer distinct words remain
        (e.g. instrumental tracks)
    """
    # flatten to one lower-cased line; pad with spaces so the space-delimited
    # patterns below also match at the very start and end of the text
    lyrics_flat = lyrics.replace('\r', '\n').replace('\n', ' ').lower()
    lyrics_flat = ' ' + lyrics_flat + ' '
    # special cases (English...); order matters: the specific forms
    # (" ain't ", " he's ", ...) must be rewritten before the generic
    # "n't " and "'s " rules consume them
    contractions = (
        ("'m ", " am "),
        ("'re ", " are "),
        ("'ve ", " have "),
        ("'d ", " would "),
        ("'ll ", " will "),
        (" he's ", " he is "),
        (" she's ", " she is "),
        (" it's ", " it is "),
        (" ain't ", " is not "),
        ("n't ", " not "),
        ("'s ", " "),
    )
    for short_form, expanded in contractions:
        lyrics_flat = lyrics_flat.replace(short_form, expanded)
    # remove boring punctuation and weird signs
    punctuation = (',', "'", '"', ';', ':', '.', '?', '!', '(', ')',
                   '{', '}', '/', '\\', '_', '|', '-', '@', '#', '*')
    for p in punctuation:
        lyrics_flat = lyrics_flat.replace(p, '')
    # stem every non-blank token and count the stems
    bow = {}
    for token in lyrics_flat.split(' '):
        if token.strip() == '':
            continue
        w = stem(token)
        bow[w] = bow.get(w, 0) + 1
    # remove special words that are wrong. Build a new dict instead of popping
    # while iterating over bow.keys(): in Python 3 that is a live view and
    # raises "dictionary changed size during iteration".
    fake_words = ('>', '<', 'outro~')
    bow = {w: c for w, c in bow.items()
           if w not in fake_words and '[' not in w and ']' not in w}
    # not big enough? remove instrumental ones among others
    if len(bow) <= 3:
        return None
    # done
    return bow

### Song ID mapping

In [82]:
# Load the match file into a DataFrame with one row per '<SEP>'-separated record.
# NOTE(review): assumes the file is UTF-8; the encoding is now explicit so the
# result does not depend on the platform default.
with open('../data/raw/mxm_779k_matches.txt', 'r', encoding='utf-8') as f:
    match = f.read()

match = match.split('\n')
# line 7 holds the '|'-separated column names; strip '#' and spaces from them
match_header = [el.replace('#','').replace(' ','') for el in match[7].split('|')]
# skip the first 18 lines and drop blank lines: a trailing newline would
# otherwise turn into a bogus row whose remaining columns are all None
match = [line for line in match[18:] if line]

matchdf = pd.DataFrame(
    [line.split('<SEP>') for line in tqdm(match)],
    columns=match_header,
)

100%|█████████████████████████████████████████████████████████████████████████| 779057/779057 [00:02<00:00, 346485.62it/s]


### Build the TF-IDF matrix from the song bag-of-words data

In [8]:
# Read the whole training file into memory as a list of lines.
with open('../data/raw/mxm_dataset_train.txt', 'r') as f:
    data = f.read().split('\n')

# Line 17 carries the comma-separated vocabulary behind a one-character prefix;
# everything from line 18 onwards is kept as the song records.
vocab_line, data = data[17], data[18:]
header = vocab_line[1:].split(',')

In [12]:
N = 200  # number of song records (rows) taken from `data`

sparse_matrix = dok_matrix((N, 5000), dtype=int)
id1, id2 = [], []
for row, record in enumerate(tqdm(data[:N])):
    fields = record.split(',')
    # the first two fields are identifiers; the rest are "index:count" pairs
    id1.append(fields[0])
    id2.append(fields[1])

    for pair in fields[2:]:
        word_index, word_count = pair.split(':')
        # the file's index is shifted down by one to give the matrix column
        sparse_matrix[row, int(word_index) - 1] = int(word_count)


100%|█████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1792.41it/s]


In [99]:
# Fit the TF-IDF weighting on the song count matrix, then densify the result
# so it can be used with np.dot in the matching cells.
tfidf_transformer = TfidfTransformer()
song_tfidf_sparse = tfidf_transformer.fit_transform(sparse_matrix)
tfidf_matrix = song_tfidf_sparse.toarray()

### Match a conversation with a song

In [123]:
# future: Whisper speech2text
# Hard-coded sample transcript standing in for real speech-to-text output.
speech2text = 'I saw a dog barking at a cat the other day. I am a cat lady, baby, a cat cat lady'
# Stemmed word -> count dictionary for the transcript. Note that lyrics_to_bow
# returns None when three or fewer distinct words remain.
convo_sparse = lyrics_to_bow(speech2text)

In [124]:
# Project the conversation's word counts onto the dataset vocabulary;
# words that do not appear in `header` are ignored.
if convo_sparse is None:
    # lyrics_to_bow returns None for very short inputs; fail with a clear
    # message instead of an AttributeError on None.items()
    raise ValueError('lyrics_to_bow returned None: the conversation has too few distinct words to match')

# Column of the first occurrence of every vocabulary word, built once instead
# of scanning `header` twice (`in` + `.index`) for each conversation word.
word_to_col = {}
for col, word in enumerate(header):
    word_to_col.setdefault(word, col)

convo_bow = np.zeros(5000)
for key, val in convo_sparse.items():
    col = word_to_col.get(key)
    if col is not None:
        convo_bow[col] = val

# map conversation bow with the same tf-idf transformation as the song dataset
convo_tfidf = tfidf_transformer.transform(convo_bow.reshape(1,-1))

In [125]:
# Score every song by the dot product of its TF-IDF row with the conversation's
# TF-IDF vector, then display the match-table row(s) of the best-scoring song.
convo_column = convo_tfidf.reshape(-1,1).toarray()
tfidf_scores = np.dot(tfidf_matrix, convo_column)
maxarg = np.argmax(tfidf_scores)
matchdf[matchdf['tid'] == id1[maxarg]]

Unnamed: 0,tid,artistname,title,mxmtid,artist_name,title.1
343592,TRAADYI128E078FB38,Lost Boyz,Certain Things We Do,1305577,Lost Boyz,Certain Things We Do


In [126]:
# Same lookup on the raw word counts (no TF-IDF weighting), for comparison.
count_scores = np.dot(sparse_matrix.todense(), convo_bow)
maxarg = np.argmax(count_scores)
matchdf[matchdf['tid'] == id1[maxarg]]

Unnamed: 0,tid,artistname,title,mxmtid,artist_name,title.1
343434,TRAAJJO128F426B6AE,Mott The Hoople,American Pie,8229559,Mott the Hoople,American Pie


In [1]:
import whisper
import torch

In [8]:
# Load the "medium" Whisper model on the CPU.
# NOTE(review): the progress bar in this cell's output shows a 1.42G transfer,
# presumably the one-off checkpoint download — confirm it is cached afterwards.
whisper_model = whisper.load_model("medium", device='cpu')

100%|█████████████████████████████████████| 1.42G/1.42G [02:41<00:00, 9.47MiB/s]


In [9]:
%%time
# Run Whisper on a local recording with task='translate' and display the
# returned dict ('text' plus per-segment details).
# NOTE(review): the path is absolute and user-specific, so this cell will not
# run on another machine as written.
whisper_model.transcribe('/Users/danielsvendsen/Desktop/visnakker.m4a', task = 'translate')



CPU times: user 30.9 s, sys: 7.31 s, total: 38.2 s
Wall time: 15.8 s


{'text': " So, Jonathan, I'm just sitting here talking to some women. What are we talking about at the same time? Yeah, I don't know. We can also try to make some noise in the background while we're talking. So, what do you think about Donald Trump?",
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 5.0,
   'text': " So, Jonathan, I'm just sitting here talking to some women.",
   'tokens': [407,
    11,
    15471,
    11,
    286,
    478,
    445,
    3798,
    510,
    1417,
    281,
    512,
    2266,
    13],
   'temperature': 0.0,
   'avg_logprob': -0.4662783525992131,
   'compression_ratio': 1.4352941176470588,
   'no_speech_prob': 0.2605545222759247},
  {'id': 1,
   'seek': 0,
   'start': 5.0,
   'end': 6.0,
   'text': ' What are we talking about at the same time?',
   'tokens': [708, 366, 321, 1417, 466, 412, 264, 912, 565, 30],
   'temperature': 0.0,
   'avg_logprob': -0.4662783525992131,
   'compression_ratio': 1.4352941176470588,
   'no_speech_prob': 0.260554

In [13]:
%%time
# Same recording and task as the previous run, but with temperature=1.2.
# NOTE(review): presumably the decoding/sampling temperature — confirm against
# the Whisper docs. The path is absolute and user-specific.
whisper_model.transcribe('/Users/danielsvendsen/Desktop/visnakker.m4a', task = 'translate', temperature=1.2)



CPU times: user 38 s, sys: 9.43 s, total: 47.4 s
Wall time: 18.6 s


{'text': " Ok... So Johardhand? I'm currently probably..... Maybe we might say something together? Alright. Fine though, we could maybe make some noise one last time... I saw a... Sony sent this to Donald Trop",
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 3.0,
   'text': ' Ok... So Johardhand?',
   'tokens': [3477, 485, 407, 19180, 515, 5543, 30],
   'temperature': 1.2,
   'avg_logprob': -3.327481951032366,
   'compression_ratio': 1.2236024844720497,
   'no_speech_prob': 0.2605545222759247},
  {'id': 1,
   'seek': 0,
   'start': 3.0,
   'end': 5.0,
   'text': " I'm currently probably.....",
   'tokens': [286, 478, 4362, 1391, 13556],
   'temperature': 1.2,
   'avg_logprob': -3.327481951032366,
   'compression_ratio': 1.2236024844720497,
   'no_speech_prob': 0.2605545222759247},
  {'id': 2,
   'seek': 0,
   'start': 5.0,
   'end': 7.0200000000000005,
   'text': ' Maybe we might say something together?',
   'tokens': [2704, 321, 1062, 584, 746, 1214, 30],
   'tempera