In [150]:
import pandas as pd
import sqlite3
import json
import pickle

Create a connection to the database and look at its structure

In [151]:
# Open the lyrics database and list every table recorded in its schema catalog.
conn_lyrics = sqlite3.connect('../Data/mxm_dataset.db')
table_listing_sql = "Select * FROM sqlite_master where type = 'table'"
res = conn_lyrics.cursor().execute(table_listing_sql)
# Last expression of the cell: displays one tuple per table, including its CREATE statement.
res.fetchall()

[('table', 'words', 'words', 2, 'CREATE TABLE words (word TEXT PRIMARY KEY)'),
 ('table',
  'lyrics',
  'lyrics',
  4,
  'CREATE TABLE lyrics (track_id, mxm_tid INT, word TEXT, count INT, is_test INT, FOREIGN KEY(word) REFERENCES words(word))')]

In [152]:
# Load the (track_id, word, count) columns of every lyrics row into a DataFrame.
lyrics_sql = "SELECT track_id, word, count FROM lyrics"
track_info = pd.read_sql(lyrics_sql, con=conn_lyrics)

In [153]:
# Keep only the first row seen for each track_id, then renumber the rows 0..n-1
# (drop_duplicates would otherwise keep the row labels of the original frame).
tracks = track_info.drop_duplicates(subset='track_id').reset_index(drop=True)
# Preview the first five records; only track_id matters here, the word and
# count columns are just whatever the first row for that track contained.
tracks.head()

Unnamed: 0,track_id,word,count
0,TRAAAAV128F421A322,i,6
1,TRAAABD128F429CF47,i,10
2,TRAAAED128E0783FAB,i,28
3,TRAAAEF128F4273421,i,5
4,TRAAAEW128F42930C0,i,4


In [154]:
# Number of rows in `tracks`, i.e. one per distinct track_id after de-duplication.
len(tracks['track_id'])

237662

In [155]:
# Number of tracks to pull lyrics for. Set to None to process every record in
# `tracks`; kept small for now to reduce processing time.
N_TRACKS = 100

# The track ids to process, in the order they appear in `tracks`. Slicing by
# position (iloc) means this does not rely on the index labels of `tracks`
# being 0..n-1, and it simply yields fewer ids if `tracks` is shorter than N_TRACKS.
track_ids = tracks['track_id'].iloc[:N_TRACKS]

# Maps each track_id to the list of words stored for it in the lyrics table.
my_dict = {}
for current_track in track_ids:
    # Pull the lyrics rows for this track; the id is bound as a query
    # parameter rather than formatted into the SQL string.
    res = conn_lyrics.execute("SELECT word FROM lyrics WHERE track_id = ?", [current_track])
    # Each row comes back as a 1-tuple ('word',), so unpack the word itself.
    my_dict[current_track] = [row[0] for row in res.fetchall()]

In [156]:
# Number of tracks whose lyrics were collected into the dictionary.
len(my_dict)

100

How should this data be stored? See this discussion of options for persisting Python dictionaries: http://stackoverflow.com/questions/7100125/storing-python-dictionaries

In [157]:
# Write the dictionary to a JSON file in the current working directory.
# json.dump also accepts options such as indent=n (pretty-printing) or
# sort_keys=True; indent=None keeps the output on a single line.
with open('lyrics_dict.json', 'w') as json_out:
    json.dump(my_dict, json_out, indent=None)

In [158]:
# Read the JSON file back to confirm the dictionary round-trips.
with open('lyrics_dict.json', 'r') as json_in:
    data_json = json.load(json_in)

In [159]:
# Don't include this cell's output if posting to GitHub (it prints the full lyrics dictionary)

#data_json

{'TRAAAAV128F421A322': ['i',
  'the',
  'you',
  'to',
  'and',
  'a',
  'me',
  'it',
  'my',
  'is',
  'of',
  'your',
  'that',
  'are',
  'we',
  'am',
  'will',
  'for',
  'be',
  'have',
  'so',
  'this',
  'like',
  'de',
  'up',
  'was',
  'if',
  'got',
  'would',
  'been',
  'these',
  'seem',
  'someon',
  'understand',
  'pass',
  'river',
  'met',
  'piec',
  'damn',
  'worth',
  'flesh',
  'grace',
  'poor',
  'somehow',
  'ignor',
  'passion',
  'tide',
  'season',
  'seed',
  'resist',
  'order',
  'piti',
  'fashion',
  'grant',
  'captur',
  'ici',
  'soil',
  'patienc',
  'social',
  'highest',
  'slice',
  'leaf',
  'lifeless',
  'arrang',
  'wilder',
  'shark',
  'devast',
  'element'],
 'TRAAABD128F429CF47': ['i',
  'you',
  'to',
  'and',
  'a',
  'me',
  'it',
  'not',
  'in',
  'my',
  'is',
  'your',
  'that',
  'do',
  'are',
  'for',
  'no',
  'have',
  'so',
  'know',
  'but',
  'what',
  'when',
  'time',
  'can',
  'there',
  'la',
  'get',
  'got',
  'ne

In [160]:
# Serialize the dictionary to a pickle file in the current working directory.
# The file is opened in binary mode because pickle writes bytes.
with open('lyrics_dict.p', 'wb') as pickle_out:
    pickle.dump(my_dict, pickle_out)

In [161]:
# Read the pickle file back to confirm the dictionary round-trips.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from
# a trusted source.
with open('lyrics_dict.p', 'rb') as pickle_in:
    data_pickle = pickle.load(pickle_in)

In [162]:
# Don't include this cell's output if posting to GitHub (it prints the full lyrics dictionary)

#data_pickle

{'TRAAAAV128F421A322': ['i',
  'the',
  'you',
  'to',
  'and',
  'a',
  'me',
  'it',
  'my',
  'is',
  'of',
  'your',
  'that',
  'are',
  'we',
  'am',
  'will',
  'for',
  'be',
  'have',
  'so',
  'this',
  'like',
  'de',
  'up',
  'was',
  'if',
  'got',
  'would',
  'been',
  'these',
  'seem',
  'someon',
  'understand',
  'pass',
  'river',
  'met',
  'piec',
  'damn',
  'worth',
  'flesh',
  'grace',
  'poor',
  'somehow',
  'ignor',
  'passion',
  'tide',
  'season',
  'seed',
  'resist',
  'order',
  'piti',
  'fashion',
  'grant',
  'captur',
  'ici',
  'soil',
  'patienc',
  'social',
  'highest',
  'slice',
  'leaf',
  'lifeless',
  'arrang',
  'wilder',
  'shark',
  'devast',
  'element'],
 'TRAAABD128F429CF47': ['i',
  'you',
  'to',
  'and',
  'a',
  'me',
  'it',
  'not',
  'in',
  'my',
  'is',
  'your',
  'that',
  'do',
  'are',
  'for',
  'no',
  'have',
  'so',
  'know',
  'but',
  'what',
  'when',
  'time',
  'can',
  'there',
  'la',
  'get',
  'got',
  'ne

In [163]:
# Close the SQLite connection that was opened on the lyrics database.
conn_lyrics.close()