In [229]:
import pandas as pd
import sqlite3
import json
import pickle

Create connection to database and look at structure

In [230]:
conn_lyrics = sqlite3.connect('../Data/mxm_dataset.db')
res = conn_lyrics.execute("Select * FROM sqlite_master where type = 'table'")
res.fetchall()

[('table', 'words', 'words', 2, 'CREATE TABLE words (word TEXT PRIMARY KEY)'),
 ('table',
  'lyrics',
  'lyrics',
  4,
  'CREATE TABLE lyrics (track_id, mxm_tid INT, word TEXT, count INT, is_test INT, FOREIGN KEY(word) REFERENCES words(word))')]

In [231]:
# create dataframe from lyrics. 
%time track_info = pd.read_sql("SELECT track_id, word, count FROM lyrics", con = conn_lyrics)

Wall time: 34.8 s


In [232]:
# get unique track
tracks = track_info.drop_duplicates(subset='track_id')
# reset index since it retains original ind
tracks.index=range(len(tracks))
# look at the first five records. We don't care about word or count
tracks.head()

Unnamed: 0,track_id,word,count
0,TRAAAAV128F421A322,i,6
1,TRAAABD128F429CF47,i,10
2,TRAAAED128E0783FAB,i,28
3,TRAAAEF128F4273421,i,5
4,TRAAAEW128F42930C0,i,4


In [233]:
len(tracks['track_id'])

237662

In [234]:
# Expand the words by count
# come back to this if the extra spaces matter.

my_dict = {}

## Change the second number in range to len(tracks['track_id']) for all the records. 
## Keeping it shorter for now to reduce processing time.

for i in range(0,100): 
    # assign the value of the track at current index to current track
    current_track = tracks.track_id[i]

    # pull the lyrics for that track and store it in a list
    res = conn_lyrics.execute("SELECT word, count FROM lyrics WHERE track_id = ?", [current_track])
    results = res.fetchall()
    #word = (results[0][0] + ' ') * results[0][1]
    # extract the word from results (otherwise it returns it AS ('word',))

    li = [(x[0] + ' ') * x[1] for x in results]

    # get rid of commas between words
    li = str(li).replace(',','')
    # get rid of quotes between words
    li = str(li).replace("'",'')
    # add track and lyrics to dictionary
    my_dict[current_track] = li

my_dict

{'TRAAAAV128F421A322': '[i i i i i i  the the the the  you you  to to  and and and and and  a a a  me  it  my  is is  of of of  your  that  are are  we we  am am  will will  for for for for  be  have have  so  this  like like  de  up  was was  if  got  would  been  these these  seem  someon  understand  pass  river  met  piec  damn  worth  flesh  grace  poor poor  somehow  ignor  passion  tide  season  seed  resist  order order  piti  fashion  grant  captur captur  ici  soil  patienc  social social  highest highest  slice  leaf  lifeless  arrang  wilder  shark  devast  element ]',
 'TRAAABD128F429CF47': '[i i i i i i i i i i  you you you you you you you you you you you you you you you you you  to to to to to to to to  and and  a a  me  it it it  not not  in in in  my my my my  is is is  your your your your your your your  that that that that that  do do do do do  are are are are are are  for for for for  no  have have have have have have  so so  know know know know know  but but but  w

In [180]:
'''
# this one just has unique words

my_dict = {}

## Change the second number in range to len(tracks['track_id']) for all the records. 
## Keeping it shorter for now to reduce processing time.

for i in range(0,100): 
    # assign the value of the track at current index to current track
    current_track = tracks.track_id[i]

    # pull the lyrics for that track and store it in a list
    res = conn_lyrics.execute("SELECT word FROM lyrics WHERE track_id = ?", [current_track])
    results = res.fetchall()
    # extract the word from results (otherwise it returns it AS ('word',))
    li = [x[0]for x in results]
    # get rid of commas between words
    li = str(li).replace(',','')
    # get rid of quotes between words
    li = str(li).replace("'",'')
    # add track and lyrics to dictionary
    my_dict[current_track] = li
    #my_dict = {current_track : li}
    
'''

How should this data be stored? http://stackoverflow.com/questions/7100125/storing-python-dictionaries

In [239]:
# save to json file in same directory
import json
with open('lyrics_dict.json', 'w') as fp:
    # arguments can include indent=n or None, sort_keys = True
    json.dump(my_dict, fp, indent=None)

In [240]:
with open('lyrics_dict.json', 'r') as fp:
    data_json = json.load(fp)

In [241]:
data_json

{'TRAAAAV128F421A322': '[i i i i i i  the the the the  you you  to to  and and and and and  a a a  me  it  my  is is  of of of  your  that  are are  we we  am am  will will  for for for for  be  have have  so  this  like like  de  up  was was  if  got  would  been  these these  seem  someon  understand  pass  river  met  piec  damn  worth  flesh  grace  poor poor  somehow  ignor  passion  tide  season  seed  resist  order order  piti  fashion  grant  captur captur  ici  soil  patienc  social social  highest highest  slice  leaf  lifeless  arrang  wilder  shark  devast  element ]',
 'TRAAABD128F429CF47': '[i i i i i i i i i i  you you you you you you you you you you you you you you you you you  to to to to to to to to  and and  a a  me  it it it  not not  in in in  my my my my  is is is  your your your your your your your  that that that that that  do do do do do  are are are are are are  for for for for  no  have have have have have have  so so  know know know know know  but but but  w

In [242]:
# save dictionary to pickle
import pickle
with open('lyrics_dict.p', 'wb') as fp:
    pickle.dump(my_dict, fp)

In [243]:
with open('lyrics_dict.p', 'rb') as fp:
    data_pickle = pickle.load(fp)

In [244]:
conn_lyrics.close()