# Analyzing the Music of The Beatles

# Table of Contents
  * [Importing the Packages](#imports)
  * [Getting the Song List and All Lyrics](#songgather)
  * [Getting the Album Meta Data](#albummeta)
  * [Getting the Singers List for Each Song](#singerslist)
  * [Getting the List of All Songs on Each Album](#albumslist)
  * [Merging and Cleaning the Tables](#merging)
  * [Masking Word Clouds](#masking)
  * [Songs Written by Each Artist WordCloud](#writtenby)
  * [Songs Sung by Each Artist WordCloud](#sungby)
  * [Answering Simple Questions](#questions)
  * [For the Future](#future)

## Importing the Packages <a id="imports"></a>

In [None]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import string
import codecs
from django.utils.encoding import smart_str
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cStringIO
from scipy.misc import imread
from PIL import Image
import PIL.ImageOps    

## Getting the Song List and All Lyrics <a id="songgather"></a>

In [None]:
def get_songlist(urlname):
    """Scrape the artist page at ``urlname`` into a table of song links.

    The page is expected to contain a ``div`` with class
    ``maincont floatfix`` whose ``td.colfirst`` cells each hold one
    ``<a href="/..." title="... Lyrics">`` link.

    Returns a DataFrame with two columns for (at most) the first 200 links:
    ``source`` -- absolute lyricsfreak URL of the song page, and
    ``Title``  -- the link title with the trailing ``Lyrics`` removed.
    """
    # Fetch the page we were asked for (previously the global allsongsurl
    # was read here and the urlname argument was silently ignored).
    page = urllib.urlopen(urlname).read()
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find("div", {"class": "maincont floatfix"})

    # One anchor per song row; only the first 200 rows are kept.
    anchors = [cell.find('a')
               for cell in table.findAll('td', {"class": "colfirst"})]
    # Store the anchors as HTML text up front so the regexes below always
    # operate on strings rather than on bs4 Tag objects.
    songnames = pd.DataFrame({0: [str(anchor) for anchor in anchors[:200]]},
                             dtype=object)

    # extract URLs in source column: first quoted value is the href
    songnames['source'] = songnames[0].str.extract(r'(".*?")', expand=False)
    songnames['source'] = songnames['source'].str.replace(
        r'("\/)', 'http://www.lyricsfreak.com/')
    songnames['source'] = songnames['source'].str.replace(r'(")', '')

    # extract title in Title column
    songnames['Title'] = songnames[0].str.extract(r'(title\=".*?")',
                                                  expand=False)
    songnames['Title'] = songnames['Title'].str.replace(r'(title\=")', '')
    songnames['Title'] = songnames['Title'].str.replace(r'(Lyrics")', '')

    # drop the raw-HTML helper column
    return songnames.drop([0], axis=1)

def get_lyrics(list):
    """Download the lyrics for every song produced by ``get_songlist``.

    ``list`` is a DataFrame with ``Title`` and ``source`` columns (the
    parameter name shadows the builtin and is kept only for compatibility
    with existing callers).

    Returns a DataFrame with columns ``song`` (title stripped, lower-cased
    and reduced to letters, digits, spaces and newlines), ``source`` (the
    song URL) and ``lyrics`` (the text of the page's ``div#content_h`` with
    its wrapping markup replaced by spaces).  One HTTP request is made per
    row.
    """
    lyrics = pd.DataFrame()
    lyrics['song'] = list['Title']
    lyrics['source'] = list['source']

    # Markup that surrounds / separates the lyric lines on the song page.
    markup = ('<br>', '</br>', '<div class="dn" id="content_h">', '</div>')

    fulllyrics = []
    for song in list['source']:
        page = urllib.urlopen(song).read()
        soup = BeautifulSoup(page, 'html.parser')
        text = smart_str(soup.find("div", {"id": "content_h"}))
        # str.replace instead of the deprecated string.replace function
        for fragment in markup:
            text = text.replace(fragment, ' ')
        fulllyrics.append(text)
    lyrics['lyrics'] = fulllyrics

    # normalise song names: trim, lower-case, drop punctuation
    lyrics['song'] = lyrics['song'].str.strip()
    lyrics['song'] = lyrics['song'].str.lower()
    lyrics['song'] = lyrics['song'].str.replace('[^a-zA-Z0-9\n ]', '')
    return lyrics

def getothersongs(url):
    """Collect the songs the artist page only ships inside inline script data.

    The page text is searched for a ``songs_non_display`` entry; that text
    is split on ``colfirst`` and, for each piece, the quoted ``"/..."`` path
    and the ``title="..."`` attribute are pulled out.

    Returns a DataFrame with columns ``song`` and ``source`` (the path
    prefixed with ``http://www.lyricsfreak.com``).  The first piece (the
    text before the first ``colfirst``) is dropped.

    Raises ValueError when the page has no ``songs_non_display`` entry.
    """
    # The notebook's import cell does not import re, so bring it in here.
    import re

    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')
    # NOTE(review): the argument to soup.decode() is kept exactly as it was;
    # confirm against the bs4 docs what the first positional parameter means.
    found = re.search(r'(songs\_non\_display.*)', soup.decode('utf-8'),
                      re.I | re.U)
    if found is None:
        raise ValueError("no songs_non_display data found at %s" % url)
    pieces = smart_str(found.groups()).split('colfirst')

    # Characters left over from the list/tuple repr and the HTML quoting.
    leftovers = re.compile(r"[\"()\[\],']")

    sources = []
    titles = []
    for piece in pieces:
        unescaped = piece.replace('\\', '')

        # song URL: first quote becomes the site prefix, the rest is noise
        link = smart_str(re.findall(r'(\"\/.*?\")', unescaped))
        link = re.sub('"', "http://www.lyricsfreak.com", link, count=1)
        link = leftovers.sub('', link)
        sources.append(link)

        # song title
        title = smart_str(re.findall(r'(title\=\".*?\")', unescaped))
        title = re.sub('Lyrics', '', title)
        title = re.sub(r'title\="', '', title)
        title = leftovers.sub('', title)
        # any backslash still present is turned into an apostrophe; this
        # must run after the quote characters themselves were removed
        title = re.sub(r'\\', "'", title)
        titles.append(title)

    othersonglist = pd.DataFrame()
    othersonglist['song'] = titles
    othersonglist['source'] = sources
    # skip the text that preceded the first 'colfirst' marker
    return othersonglist[1:]

def getotherlyrics(list):
    """Download the lyrics for every song produced by ``getothersongs``.

    ``list`` is a DataFrame with ``song`` and ``source`` columns (the
    parameter name shadows the builtin and is kept only for compatibility
    with existing callers).

    Returns a DataFrame with columns ``song`` (stripped, lower-cased and
    reduced to letters, digits, spaces and newlines), ``source`` and
    ``lyrics`` (the text of the page's ``div#content_h`` with its wrapping
    markup replaced by spaces).  One HTTP request is made per row.
    """
    lyrics = pd.DataFrame()
    lyrics['song'] = list['song']
    lyrics['source'] = list['source']

    # Markup that surrounds / separates the lyric lines on the song page.
    markup = ('<br>', '</br>', '<div class="dn" id="content_h">', '</div>')

    fulllyrics = []
    for song in list['source']:
        page = urllib.urlopen(song).read()
        soup = BeautifulSoup(page, 'html.parser')
        text = smart_str(soup.find("div", {"id": "content_h"}))
        # str.replace instead of the deprecated string.replace function
        for fragment in markup:
            text = text.replace(fragment, ' ')
        fulllyrics.append(text)
    lyrics['lyrics'] = fulllyrics

    # normalise song names: trim, lower-case, drop punctuation
    lyrics['song'] = lyrics['song'].str.strip()
    lyrics['song'] = lyrics['song'].str.lower()
    lyrics['song'] = lyrics['song'].str.replace('[^a-zA-Z0-9\n ]', '')
    return lyrics

# Artist page that lists the songs (visible table plus script-embedded rest).
allsongsurl = 'http://www.lyricsfreak.com/b/beatles/'
songs = get_songlist(allsongsurl)
alllyrics = get_lyrics(songs)
othersongs = getothersongs(allsongsurl)
otherlyrics = getotherlyrics(othersongs)

# Stack both lyric tables into one; pd.concat replaces the deprecated
# DataFrame.append and renumbers the rows the same way.
alllyrics = pd.concat([alllyrics, otherlyrics], ignore_index=True)


## Getting the Album Meta Data <a id="albummeta"></a>

In [None]:
def get_albums(urlname):
    """Scrape the album table (chart peaks and certifications) at ``urlname``.

    Reads the first ``table.wikitable.plainrowheaders`` on the page, skipping
    its first two rows.  Rows with exactly nine ``td`` cells contribute the
    release, seven chart-peak and certification values.

    Returns a DataFrame holding the peak columns, a normalised ``album``
    name (lower-case, punctuation removed) and one ``<BODY>Cert`` column per
    certification body.

    Any error raised while fetching the page is reported and re-raised.
    """
    try:
        sock = urllib.urlopen(urlname).read()
    except Exception:
        # Report which URL failed, then let the error propagate instead of
        # carrying on with an undefined page.
        print("Error with %s, please retry using another url object" % urlname)
        raise
    soup = BeautifulSoup(sock, 'html.parser')
    table = soup.find("table", {"class": "wikitable plainrowheaders"})

    peak_columns = ['UK Peak', 'AUS Peak', 'CAN Peak', 'FRA Peak',
                    'GER Peak', 'NOR Peak', 'US Peak']
    albumnames = []
    releases = []
    peaks = dict((column, []) for column in peak_columns)
    certifications = []

    # fill parameters for dataframe
    for row in table.findAll('tr')[2:]:
        albumnames.append(row.select('i'))
        meta = row.findAll('td')
        if len(meta) == 9:
            # first and last cell: keep the first child tag (as HTML text);
            # cells 1-7: keep the first text node
            releases.append(str(meta[0].find(text=False)))
            for column, cell in zip(peak_columns, meta[1:8]):
                peaks[column].append(cell.find(text=True))
            certifications.append(str(meta[8].find(text=False)))

    # create dataframe; only the first 28 rows are kept (the row after them
    # references a footnote)
    albums = pd.DataFrame(albumnames[:28])
    albums['Release'] = releases
    for column in peak_columns:
        albums[column] = peaks[column]
    albums['Certifications'] = certifications

    # fill missing album names due to multiple release dates with the
    # previous album name, then work on the HTML text of the name
    albums[0] = albums[0].ffill()
    albums[0] = albums[0].astype(str)

    # create new column for album names without tags
    albums['album'] = albums[0].str.extract(r'("\>.*\<\/)', expand=False)
    # fall back to the raw text for albums whose name is not inside a link
    albums['album'] = albums['album'].fillna(albums[0])
    # remove leftover markup from album names
    for pattern in (r'(\<\/a\>\<\/)', r'("\>)', r'(\<i\>)', r'(\<\/i\>)'):
        albums['album'] = albums['album'].str.replace(pattern, '')

    # remove tags from Release column
    albums['Release'] = albums['Release'].str.replace(r'(\<.*?\>)', '')
    albums['Release'] = albums['Release'].str.replace(r'(\n)', ' ')

    # remove tags from Certifications column; footnote markers such as [12]
    # become the ' |' separator used for splitting below
    albums['Certifications'] = albums['Certifications'].str.replace(
        r'(\<.*?\>)', '')
    albums['Certifications'] = albums['Certifications'].str.replace(
        r'(\[.*?\])', ' |')
    albums['Certifications'] = albums['Certifications'].str.replace(
        r'(\n)', ' ')

    # split certifications by | into one column per certification body
    for body in ('BPI', 'ARIA', 'MC', 'RIAA', 'BVMI', 'SNEP'):
        column = body + 'Cert'
        albums[column] = albums['Certifications'].str.extract(
            r'(%s.*?\|)' % body, expand=False)
        albums[column] = albums[column].str.replace(r'(%s\:)' % body, '')
        albums[column] = albums[column].str.replace(r'(\|)', '')

    # rename some albums to match other dataframes
    albums['album'] = albums['album'].str.replace('A Hard', 'Hard')
    albums['album'] = albums['album'].str.replace(
        'The Beatles', 'The Beatles (The White Album)')
    albums['album'] = albums['album'].str.lower()
    albums['album'] = albums['album'].str.replace('[^a-zA-Z0-9\n ]', '')

    # NOTE(review): columns are dropped by position, exactly as before; this
    # presumably targets the raw name, 'Release' and 'Certifications'
    # columns, which only holds if the name frame has a single column.
    albums = albums.drop(albums.columns[[0, 1, 9]], axis=1)
    return albums

my_url3 = "https://en.wikipedia.org/wiki/The_Beatles_discography#Albums"
albums = get_albums(my_url3)

## Getting the Singers List for Each Song <a id="singerslist"></a>

In [None]:
def get_singers(urlname):
    """Scrape songwriter and lead-vocal credits for each song at ``urlname``.

    Reads the first ``table.wikitable.collapsible.sortable`` on the page,
    skipping its first row.  For every remaining row the song name comes
    from the link in the first cell (or the cell's first text node when
    there is no link), the songwriter from the fourth cell and the lead
    vocals from the fifth.

    Returns a DataFrame with columns ``Songwriter``, ``LeadVocals`` and
    ``song`` (tags and punctuation removed, lower-cased).

    Any error raised while fetching the page is reported and re-raised.
    """
    try:
        sock = urllib.urlopen(urlname).read()
    except Exception:
        # Report which URL failed, then let the error propagate instead of
        # carrying on with an undefined page.
        print("Error with %s, please retry using another url object" % urlname)
        raise
    soup = BeautifulSoup(sock, 'html.parser')
    table = soup.find("table", {"class": "wikitable collapsible sortable"})

    names = []
    writers = []
    vocals = []
    for row in table.findAll('tr')[1:]:
        meta = row.findAll('td')
        link = meta[0].find('a')
        if link is None:
            # some songs are not linked: fall back to the cell's plain text
            names.append(meta[0].find(string=True))
        else:
            names.append(str(link))
        writers.append(meta[3].find(text=True))
        vocals.append(meta[4].find(text=True))

    # Only the columns that are returned are built; the explicit column
    # list fixes their order.
    singers = pd.DataFrame(
        {'Songwriter': writers, 'LeadVocals': vocals, 'song': names},
        columns=['Songwriter', 'LeadVocals', 'song'])

    # remove tag information from songs, then normalise the name
    singers['song'] = singers['song'].str.replace(r'(\<a.*?\>)', '')
    singers['song'] = singers['song'].str.replace(r'(\<\/a\>)', '')
    singers['song'] = singers['song'].str.replace(r'(")', '')
    singers['song'] = singers['song'].str.replace('[^a-zA-Z0-9\n ]', '')
    singers['song'] = singers['song'].str.lower()
    return singers

singersurl = 'https://en.wikipedia.org/wiki/List_of_songs_recorded_by_the_Beatles'
singerslist = get_singers(singersurl)

## Getting the List of All Songs on Each Album <a id="albumslist"></a>

In [None]:
def get_albumsonglist(urlname):
    """Scrape which songs appear on which album from the page at ``urlname``.

    Album headings are the page's ``h3`` elements; song names are the link
    texts inside the page's tables, ignoring the first two tables.  The
    n-th remaining table is paired with the n-th heading.

    Returns a DataFrame with one row per (song, album) pair and columns
    ``song``, ``albumnum`` (table position), ``album`` and ``year`` (the
    parenthesised digits found in the heading).  ``song`` and ``album`` are
    lower-cased with punctuation removed.
    """
    page = urllib.urlopen(urlname).read()
    soup = BeautifulSoup(page, 'html.parser')

    # album headings (as HTML text) and the per-album song tables
    headings = [str(heading) for heading in soup.findAll('h3')]
    tables = soup.findAll('table')[2:]

    # one row per album table, numbered by position
    albumlist = pd.DataFrame({'albumnum': list(range(len(tables)))})
    albumlist['album'] = pd.Series(headings, dtype=object)
    albumlist['album'] = albumlist['album'].astype(str)
    albumlist['album'] = albumlist['album'].str.replace(r'\<.*?\>', '')
    albumlist['year'] = albumlist['album'].str.extract(r'(\(\d*?\))',
                                                       expand=False)
    albumlist['year'] = albumlist['year'].str.replace(r'(\()', "")
    albumlist['year'] = albumlist['year'].str.replace(r'(\))', "")
    albumlist['album'] = albumlist['album'].str.replace(r'(\(\d*?\))', '')

    # get a list of the songs, remembering which table each came from
    titles = []
    albumnums = []
    for num, table in enumerate(tables):
        for link in table.findAll('a'):
            if link.string is None:
                # anchors without a single text child carry no song title
                continue
            # collapse runs of whitespace inside the title
            titles.append(' '.join(link.string.split()))
            albumnums.append(num)
    songsalbums = pd.DataFrame(
        {'song': titles,
         'albumnum': albumnums,
         })

    fullalbumslist = pd.merge(songsalbums, albumlist, on='albumnum')

    # remove extra whitespace / punctuation and lower-case the names
    fullalbumslist['album'] = fullalbumslist['album'].str.strip()
    fullalbumslist['song'] = fullalbumslist['song'].str.replace('[^a-zA-Z0-9\n ]', '')
    fullalbumslist['song'] = fullalbumslist['song'].str.lower()
    fullalbumslist['album'] = fullalbumslist['album'].str.replace('[^a-zA-Z0-9\n ]', '')
    fullalbumslist['album'] = fullalbumslist['album'].str.lower()
    return fullalbumslist

albumsongsurl = "http://www.brianhartzog.com/beatles/beatles-list-of-all-lyrics-by-album.htm"
albumsongs = get_albumsonglist(albumsongsurl)

## Merging and Cleaning the Tables <a id="merging"></a>

Note:
Yes, this process is incredibly hacky and there are easier ways to do it but I didn't do it those ways.

In [None]:
def _replace_each(column, substitutions):
    """Apply every (pattern, replacement) pair to a string column, in order."""
    for pattern, replacement in substitutions:
        column = column.str.replace(pattern, replacement)
    return column


# Step 1: album metadata joined onto every (song, album) pair.
completeset = pd.merge(albumsongs, albums, on='album')

# Step 2: song names differ between sources and must be aligned before the
# lyrics can be merged.  To find remaining mismatches compare
# set(completeset['song']) - set(alllyrics['song']).
alllyrics['song'] = _replace_each(alllyrics['song'], [
    ('got to get it into my life', 'got to get you into my life'),
    ('i want you', 'i want you shes so heavy'),
    ('follow the sun', 'ill follow the sun'),
    ('kansa city', 'medley kansas city  hey hey hey hey'),
    ('^norwegian wood$', 'norwegian wood this bird has flown'),
    ('revolution$', 'revolution 1'),
    ('she is leaving home', 'shes leaving home'),
    ('^money$', 'money thats what i want'),
    ('sergeant', 'sgt'),
    ('you never give me your money thats what i want', 'you never give me your money'),
    ('when im sixtyfour', 'when im sixty four'),
    ('^i saw her standing$', 'i saw her standing there'),
    ('dr robert', 'doctor robert'),
])
completeset['song'] = completeset['song'].str.replace(' reprise', '')

completeset = pd.merge(completeset, alllyrics, on='song')

# Step 3: same alignment for the singers table.  To find remaining
# mismatches compare set(completeset['song']) - set(singerslist['song']).
singerslist['song'] = _replace_each(singerslist['song'], [
    ('kansas city', 'medley kansas city  hey hey hey hey'),
    ('please mr postman', 'please mister postman'),
    ('when im sixtyfour', 'when im sixty four'),
    ('youve really got a hold on me', 'you really got a hold on me'),
])

completeset = pd.merge(completeset, singerslist, on='song')

# Step 4: LeadVocals -- reduce each credit to the member surnames in
# alphabetical order.  Longer phrases come before their prefixes.
singlist = set(completeset['LeadVocals'])
completeset['LeadVocals'] = _replace_each(completeset['LeadVocals'], [
    (u'Harrison\, with Lennon and McCartney', 'Harrison Lennon McCartney'),
    (u'Lennon and Harrison', 'Harrison Lennon'),
    (u'Lennon and McCartney', 'Lennon McCartney'),
    (u'Lennon\, McCartney and Harrison', 'Harrison Lennon McCartney'),
    (u'Lennon\, McCartney\, Harrison', 'Harrison Lennon McCartney'),
    (u'Lennon\, with McCartney', 'Lennon McCartney'),
    (u'Lennon\, with Starkey', 'Lennon Starkey'),
    (u'McCartney\, with Lennon\, Harrison and Starkey', 'Harrison Lennon McCartney Starkey'),
    (u'McCartney\, with Lennon', 'Lennon McCartney'),
    (u'Starkey \(Best\)', 'Starkey'),
])

# Step 5: Songwriter -- outside writers collapse to 'Other', member credits
# to alphabetical surnames.  To inspect the raw values use
# set(completeset['Songwriter']).
completeset['Songwriter'] = _replace_each(completeset['Songwriter'], [
    (u'Alexander', 'Other'),
    (u'Bacharach', 'Other'),
    (u'Berry', 'Other'),
    (u'Dixon', 'Other'),
    (u'Dobbins\, Garrett\, ', 'Other'),
    (u'Drapkin \(aka Ricky Dee\)', 'Other'),
    (u'Goffin', 'Other'),
    (u'Gordy', 'Other'),
    (u'Harrison\, with uncredited contribution from Lennon', 'Harrison'),
    (u'Holly', 'Other'),
    (u'Johnson', 'Other'),
    (u'Leiber\, Stoller', 'Other'),
    (u'Lennon and McCartney\, with Starkey', 'Lennon McCartney Starkey'),
    (u'Lennon and McCartney', 'Lennon McCartney'),
    (u'Lennon with McCartney', 'Lennon McCartney'),
    (u'Lennon\, with McCartney\, Harrison and Starkey', 'Harrison Lennon McCartney Starkey'),
    (u'Lennon\, with McCartney', 'Lennon McCartney'),
    (u'Lennon\, with Ono and Harrison', 'Harrison Lennon Ono'),
    (u'McCartney\, with Lennon', 'Lennon McCartney'),
    (u'Medley', 'Other'),
    (u'Perkins', 'Other'),
    (u'Robinson', 'Other'),
    (u'Russell', 'Other'),
    (u'Scott', 'Other'),
    (u'Starkey\, with uncredited assistance from Harrison', 'Starkey'),
    (u'Traditional\, arr\. Lennon\, McCartney\, Harrison\, Starkey', 'Harrison Lennon McCartney Starkey'),
    (u'Williams', 'Other'),
    (u'Willson', 'Other'),
    (u'Other\, ', 'Other'),
])

# Step 6: lyrics -- ellipses become a space, every other symbol is removed,
# and the text is lower-cased.
completeset['lyrics'] = _replace_each(completeset['lyrics'], [
    ('\.\.\.', ' '),
    ('[^a-zA-Z0-9\n ]', ''),
]).str.lower()

# Step 7: one flag column per person; the value is the position of the name
# in the credit, or -1 when absent.  Ono gets no column of her own.
for _prefix, _needle in [('Harrison', 'Harrison'), ('Lennon', 'Lennon'),
                         ('McCartney', 'McCartney'), ('Starkey', 'Starkey'),
                         ('Other', 'Other')]:
    completeset[_prefix + 'Write'] = completeset['Songwriter'].str.find(_needle)

for _prefix, _needle in [('Harrison', 'Harrison'), ('Lennon', 'Lennon'),
                         ('McCartney', 'McCartney'), ('Starkey', 'Starkey'),
                         ('Other', 'N/A')]:
    completeset[_prefix + 'Sing'] = completeset['LeadVocals'].str.find(_needle)

## Masking Word Clouds  <a id="masking"></a>
### Songs Written by Each Artist  <a id="writtenby"></a>

In [None]:
def createwc(url, lyrics):
    """Build a word cloud of ``lyrics`` shaped by the image at ``url``.

    The image is downloaded and converted to greyscale.  For the image at
    ``georgepicurl`` only, it is additionally thresholded at 128 to pure
    black/white and inverted before being used as the mask.

    Returns ``(mask array, generated WordCloud)``.
    """
    downloaded = cStringIO.StringIO(urllib.urlopen(url).read())
    image_mask = Image.open(downloaded).convert("L")

    if url == georgepicurl:
        image_mask = image_mask.point(lambda x: 0 if x<128 else 255)
        image_mask = PIL.ImageOps.invert(image_mask)

    image_mask = np.array(image_mask)
    cloud = WordCloud(
        background_color="white",
        max_words=2000,
        mask=image_mask,
        stopwords=set(STOPWORDS),
    )
    return image_mask, cloud.generate(lyrics)
    
# Word clouds of the lyrics each member (co-)wrote.  A '...Write' flag of -1
# means the member is not in the songwriter credit.
# The lyrics are joined with spaces: str(list(...)) would hand WordCloud the
# list's repr, in which every newline is the two characters '\n', gluing a
# stray 'n' onto the following word.
georgepicurl = 'https://github.com/chrisgmartin/DATA605/raw/master/1_george.png'
georgedf = completeset.loc[completeset['HarrisonWrite'] != -1, 'lyrics']
georgedf = ' '.join(georgedf.dropna())
george_mask, georgewc = createwc(georgepicurl, georgedf)

paulpicurl = 'https://github.com/chrisgmartin/DATA605/raw/master/2_paul.png'
pauldf = completeset.loc[completeset['McCartneyWrite'] != -1, 'lyrics']
pauldf = ' '.join(pauldf.dropna())
paul_mask, paulwc = createwc(paulpicurl, pauldf)

ringopicurl = 'https://github.com/chrisgmartin/DATA605/raw/master/3_ringo.png'
stardf = completeset.loc[completeset['StarkeyWrite'] != -1, 'lyrics']
stardf = ' '.join(stardf.dropna())
star_mask, starwc = createwc(ringopicurl, stardf)

johnpicurl = 'https://github.com/chrisgmartin/DATA605/raw/master/4_john.png'
lennondf = completeset.loc[completeset['LennonWrite'] != -1, 'lyrics']
lennondf = ' '.join(lennondf.dropna())
lennon_mask, lennonwc = createwc(johnpicurl, lennondf)

# One row of four panels; titles are set on each panel explicitly.
fig = plt.figure()
pic1 = fig.add_subplot(141)
pic1.imshow(georgewc)
pic1.axis("off")
pic1.set_title('Harrison Wrote')
pic2 = fig.add_subplot(142)
pic2.imshow(paulwc)
pic2.axis("off")
pic2.set_title('McCartney Wrote')
pic3 = fig.add_subplot(143)
pic3.imshow(starwc)
pic3.axis("off")
# labelled by surname like the other panels (was 'Star Wrote')
pic3.set_title('Starkey Wrote')
pic4 = fig.add_subplot(144)
pic4.imshow(lennonwc)
pic4.axis("off")
pic4.set_title('Lennon Wrote')

### Songs Sung by Each Artist  <a id="sungby"></a>

In [None]:
# Word clouds of the lyrics each member sang lead on.  A '...Sing' flag of
# -1 means the member is not in the lead-vocals credit.
# The lyrics are joined with spaces: str(list(...)) would hand WordCloud the
# list's repr, in which every newline is the two characters '\n', gluing a
# stray 'n' onto the following word.
georgedf = completeset.loc[completeset['HarrisonSing'] != -1, 'lyrics']
georgedf = ' '.join(georgedf.dropna())
george_mask, georgewc = createwc(georgepicurl, georgedf)

pauldf = completeset.loc[completeset['McCartneySing'] != -1, 'lyrics']
pauldf = ' '.join(pauldf.dropna())
paul_mask, paulwc = createwc(paulpicurl, pauldf)

stardf = completeset.loc[completeset['StarkeySing'] != -1, 'lyrics']
stardf = ' '.join(stardf.dropna())
star_mask, starwc = createwc(ringopicurl, stardf)

lennondf = completeset.loc[completeset['LennonSing'] != -1, 'lyrics']
lennondf = ' '.join(lennondf.dropna())
lennon_mask, lennonwc = createwc(johnpicurl, lennondf)

# One row of four panels; titles are set on each panel explicitly.
fig = plt.figure()
pic1 = fig.add_subplot(141)
pic1.imshow(georgewc)
pic1.axis("off")
pic1.set_title('Harrison Sung')
pic2 = fig.add_subplot(142)
pic2.imshow(paulwc)
pic2.axis("off")
pic2.set_title('McCartney Sung')
pic3 = fig.add_subplot(143)
pic3.imshow(starwc)
pic3.axis("off")
# labelled by surname like the other panels (was 'Star Sung')
pic3.set_title('Starkey Sung')
pic4 = fig.add_subplot(144)
pic4.imshow(lennonwc)
pic4.axis("off")
pic4.set_title('Lennon Sung')

## Answering Simple Questions  <a id="questions"></a>

In [None]:
#Let's try to answer a few questions.
#Each table is printed explicitly: a bare expression is discarded unless it
#is the last statement of a notebook cell, so only one table used to show.

#Who wrote the most songs?
print(pd.crosstab(index=completeset['Songwriter'], columns='count'))
#Who sang the most songs?
print(pd.crosstab(index=completeset['LeadVocals'], columns='count'))

#How many songs are on each US Chart-topping album (by chart ranking)?
print(pd.crosstab(index=completeset['album'], columns=completeset['US Peak']))
#How many songs are on each UK Chart-topping album (by chart ranking)?
print(pd.crosstab(index=completeset['album'], columns=completeset['UK Peak']))
#How many songs are on each BPI Certified album (by BPI Certification)?
print(pd.crosstab(index=completeset['album'], columns=completeset['BPICert']))
#How many songs are on each RIAA Certified album (by RIAA Certification)?
print(pd.crosstab(index=completeset['album'], columns=completeset['RIAACert']))
#How many songs were released by year (by album)?
print(pd.crosstab(index=completeset['album'], columns=completeset['year']))


## For the Future  <a id="future"></a>

Overall, I found this to be a fun exercise in web scraping and in figuring out how to merge various Pandas DataFrames when columns and rows do not match. There are certainly several ways this can be improved; it is incredibly 'hacky' as it stands. Some suggestions:
  * Use Parallel Processing to improve speed
  * Better utilization of WordCloud (i.e. stopwords)
  * Better formatting of 'Questions' section
  * Detailed explanations of each step
  * Improved WordCloud masking (i.e. font colours, image sizes, etc.)