# Data Formatting

Cleaning and formatting of the lyrics collection in MongoDB using pymongo.

In [5]:
# import pandas as pd
import numpy as np
import json
import csv
from pymongo import MongoClient
from bson.objectid import ObjectId
import re
import pprint

In [2]:
# Open a client with pymongo's default connection settings, then drill down
# to the `lyrics` collection inside the `proj4` database.
client = MongoClient()
p4_db = client["proj4"]
lyrics = p4_db["lyrics"]

To get an idea of what we might need to do in the cleaning process...

In [3]:
# Draw 20 random songs whose Year falls in 1965-1975 (both ends inclusive)
# and pretty-print each one for a visual inspection of the raw fields.
year_window = {"$match": {'Year': {"$gte": 1965, "$lte": 1975}}}
random_twenty = {"$sample": {"size": 20}}

sample = lyrics.aggregate([year_window, random_twenty])

for record in sample:
    pprint.pprint(record)

{'Artist': 'freda payne',
 'Instrumental': False,
 'Lyrics': ' now that youre gone all thats left is a band of gold all thats '
           'left of the dreams i hold is a band of gold and the memories of '
           'what love could be if you were still here with meyou took me from '
           'the shelter of my mother i had never known or loved any other we '
           'kissed after taking vows but that night on our honeymoon we stayed '
           'in separate roomsi wait in the darkness of my lonely room filled '
           'with sadness filled with gloom hoping soon that youll walk back '
           'through that door and love me like you tried beforesince youve '
           'been gone all thats left is a band of gold all thats left of the '
           'dreams i hold is a band of gold and the dream of what love could '
           'be if you were still here with meohhhdont you know that i wait in '
           'the darkness of my lonely room filled with sadness filled with '
     

**Potential Problems**

1. An absence of lyrics is sometimes noted with 'NA' in the lyrics field.
2. Instrumentals have 'instrumental' or 'instrumental ' (with a trailing space) in the lyrics field.

### DB Operations

No need to run these more than once!

In [None]:
# # remove Lyrics field if 'NA'
# One-off migration, kept commented out so that re-running the notebook does
# not touch the database again. The filter is an exact match on the placeholder
# string "NA"; matching documents have the Lyrics field dropped via $unset
# (MongoDB ignores the value supplied to $unset -- only the field name matters).

# result_1 = lyrics.update_many({"Lyrics": "NA"},
#                        {"$unset": {"Lyrics": True}}
#                       )

# result_1.modified_count

In [None]:
# # add an Instrumental field to all documents, set to False as default
# One-off migration, kept commented out. The empty filter {} selects every
# document in the collection, so each one gets Instrumental = False; the cells
# that follow flip the flag to True for the instrumental tracks.

# result_2 = lyrics.update_many({},
#                               {"$set": {"Instrumental": False}}
#                              ) 

In [7]:
# # set Instrumental field to True if Lyrics field is 'instrumental'
# NOTE(review): the pattern below is unanchored, so it matches any Lyrics value
# that merely *contains* the substring "instrumental" (the trailing \s* adds no
# constraint). If only the 'instrumental' / 'instrumental ' placeholders are
# intended, an anchored pattern such as r"^\s*instrumental\s*$" would be
# stricter -- confirm before re-running.

# result_3 = lyrics.update_many({"Lyrics": {"$regex": r"instrumental\s*"}},
#                               {"$set": {"Instrumental": True}},
#                               # {"$unset": {"Lyrics": "$"}}
#                              )
# print(result_3.matched_count)
# print(result_3.modified_count)

# # 17 songs
# NOTE(review): the saved output of this cell reads 2 / 2 rather than 17 --
# presumably left over from a later re-run; verify.

2
2


In [None]:
# # remove Lyrics field if 'instrumental'
# NOTE(review): this filter is an exact string match, so a value with trailing
# whitespace ('instrumental ') is NOT unset here -- possibly why this cell
# reports 15 songs while the Instrumental flag was set on 17; confirm.
# The value passed to $unset ("$" here, True in the 'NA' cell) is ignored by
# MongoDB; only the field name matters.

# result_4 = lyrics.update_many({"Lyrics": "instrumental"},
#                               {"$unset": {"Lyrics": "$"}}
#                              )
# print(result_4.matched_count)
# print(result_4.modified_count)

# # 15 songs

In [4]:
# Spot-check on the 1966 number-one song: the record should now carry an
# Instrumental field and no Lyrics field (its lyrics were 'NA').
top_of_1966 = {'Rank': 1, 'Year': 1966}
lyrics.find_one(top_of_1966)

{'Artist': 'ssgt barry sadler',
 'Instrumental': False,
 'Rank': 1,
 'Song': 'ballad of the green berets',
 'Source': 'NA',
 'Year': 1966,
 '_id': ObjectId('59f903150f3e05a46c01d7ec')}

In [5]:
# Spot-check on any one instrumental: Instrumental should be True and the
# Lyrics field should be absent.
flagged_instrumental = {"Instrumental": True}
lyrics.find_one(flagged_instrumental)

{'Artist': 'ramsey lewis trio',
 'Instrumental': True,
 'Rank': 18,
 'Song': 'the in crowd',
 'Source': 3,
 'Year': 1965,
 '_id': ObjectId('59f903140f3e05a46c01d799')}

**Another problem discovered later:** some songs missing lyrics had only one or two spaces in the Lyrics field instead of NA:

In [13]:
# List every song whose Lyrics field is empty or contains only whitespace.
# TODO: correct for known instrumentals:
#   * soul finger
blank_lyrics = {"Lyrics": {"$regex": r"^\s*$"}}
spaces = lyrics.find(blank_lyrics)
for song in spaces:
    print(song)

{'_id': ObjectId('59f903140f3e05a46c01d78e'), 'Rank': 3, 'Song': 'i cant get no satisfaction', 'Artist': 'the rolling stones', 'Year': 1965, 'Lyrics': '  ', 'Source': 1, 'Instrumental': False, 'Lyrics_clean': ''}
{'_id': ObjectId('59f903150f3e05a46c01d839'), 'Rank': 78, 'Song': 'love is a hurtin thing', 'Artist': 'lou rawls', 'Year': 1966, 'Lyrics': '  ', 'Source': 1, 'Instrumental': False, 'Lyrics_clean': ''}
{'_id': ObjectId('59f903150f3e05a46c01d83b'), 'Rank': 80, 'Song': 'gloria', 'Artist': 'shadows of knight', 'Year': 1966, 'Lyrics': '  ', 'Source': 1, 'Instrumental': False, 'Lyrics_clean': ''}
{'_id': ObjectId('59f903150f3e05a46c01d84d'), 'Rank': 98, 'Song': 'zorba the greek', 'Artist': 'herb alpert and the tijuana brass', 'Year': 1966, 'Lyrics': '  ', 'Source': 1, 'Instrumental': False, 'Lyrics_clean': ''}
{'_id': ObjectId('59f903150f3e05a46c01d85f'), 'Rank': 16, 'Song': 'kind of a drag', 'Artist': 'the buckinghams', 'Year': 1967, 'Lyrics': '  ', 'Source': 1, 'Instrumental': Fal

Correction, to run once:

In [15]:
# One-off fix, kept commented out: drop the Lyrics field wherever it is empty
# or whitespace-only (the ^\s*$ pattern), mirroring what was done for 'NA'.
# results = lyrics.update_many({"Lyrics": {"$regex": r"^\s*$"}}, 
#                              {"$unset": {"Lyrics": True}})

In [18]:
# Number of documents changed by the whitespace-only cleanup (44 when it ran).
# results.modified_count

44

### Random Queries

In [32]:
# Cursor over every song by James Brown; it is consumed by the loop in the next cell.
james_brown_filter = {"Artist": "james brown"}
foo = lyrics.find(james_brown_filter)

In [33]:
# Pretty-print every document selected by the `foo` query.
# Iterate over a fresh clone instead of `foo` itself: a pymongo cursor can only
# be walked once, so looping over `foo` directly would print nothing if this
# cell were re-run without first re-running the cell that builds the query.
for song in foo.clone():
    pprint.pprint(song)

{'Artist': 'james brown',
 'Instrumental': False,
 'Lyrics': ' come here sister papas in the swing he aint too hip about that '
           'new breed babe he aint no drag papas got a brand new bagcome here '
           'mama and dig this crazy scene hes not too fancy but his line is '
           'pretty clean he aint no drag papas got a brand new baghes doing '
           'the jerk hes doing the fly dont play him cheap cause you know he '
           'aint shy hes doing the monkey the mashed potatoes jump back jack '
           'see you later alligatorcome here sister papas in the swing he aint '
           'too hip now but i can dig that new breed babe he aint no drag hes '
           'got a brand new bag ',
 'Lyrics_clean': 'come here sister papas in the swing he int too hip about '
                 'that new breed babe he int no drag papas got a brand new bag '
                 'come here mama and dig this crazy scene hes not too fancy '
                 'but his line is pretty clean

In [6]:
# Fetch a single document directly by its primary key.
song_id = ObjectId('59f903140f3e05a46c01d7a8')
lyrics.find_one({"_id": song_id})

{'Artist': 'james brown',
 'Instrumental': False,
 'Lyrics': ' come here sister papas in the swing he aint too hip about that new breed babe he aint no drag papas got a brand new bagcome here mama and dig this crazy scene hes not too fancy but his line is pretty clean he aint no drag papas got a brand new baghes doing the jerk hes doing the fly dont play him cheap cause you know he aint shy hes doing the monkey the mashed potatoes jump back jack see you later alligatorcome here sister papas in the swing he aint too hip now but i can dig that new breed babe he aint no drag hes got a brand new bag ',
 'Lyrics_clean': 'come here sister papas in the swing he int too hip about that new breed babe he int no drag papas got a brand new bag come here mama and dig this crazy scene hes not too fancy but his line is pretty clean he int no drag papas got a brand new bashes doing the jerk hes doing the fly dont play him cheap cause you know he int shy hes doing the monkey the mashed potatoes jump ba

In [13]:
# Fetch one document by its (lower-cased) song title.
title_filter = {"Song": "back stabbers"}
lyrics.find_one(title_filter)

{'Artist': 'the ojays',
 'Instrumental': False,
 'Lyrics': ' what they do they smile in your face all the time they want to take your place the back stabbers back stabbers they smile in your face all the time they want to take your place the back stabbers back stabbersall you fellows who have someone and you really care yeah yeah then its all of you fellows who better beware yeah yeah somebodys out to get your lady a few of your buddies they sure look shady blades are long clenched tight in their fist aimin straight at your back and i dont think theyll misswhat they do they smile in your face all the time they want to take your place the back stabbers back stabbers i keep gettin all these visits from my friends yeah what they doin to me they come to my house again and again and again and again yeah so are they there to see my woman i dont even be home but they just keep on comin what can i do to get on the right track i wish theyd take some of these knives off my backwhat they do they 