In [1]:

%matplotlib  inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re


from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

from collections import Counter

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Unit 4 Capstone
The dataset is a collection of 57650 songs from 600 plus artists. The data was collected from Kaggle and has a variety of applications like predicting the artist by song or sentiment analysis to cluster songs into genres. 

In [2]:
lyric_data = pd.read_csv('C:\\Users\\david\Desktop\\thinkful datasets\\songdata.csv')

In [3]:
lyric_data.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
#removing the uneeded column "Link"
lyric_data = lyric_data.drop(["link"], 1)

In [5]:
lyric_data.head(5)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [6]:
lyric_data.shape

(57650, 3)

In [7]:
lyric_data.artist.unique()

array(['ABBA', 'Ace Of Base', 'Adam Sandler', 'Adele', 'Aerosmith',
       'Air Supply', 'Aiza Seguerra', 'Alabama', 'Alan Parsons Project',
       'Aled Jones', 'Alice Cooper', 'Alice In Chains', 'Alison Krauss',
       'Allman Brothers Band', 'Alphaville', 'America', 'Amy Grant',
       'Andrea Bocelli', 'Andy Williams', 'Annie', 'Ariana Grande',
       'Ariel Rivera', 'Arlo Guthrie', 'Arrogant Worms', 'Avril Lavigne',
       'Backstreet Boys', 'Barbie', 'Barbra Streisand', 'Beach Boys',
       'The Beatles', 'Beautiful South', 'Beauty And The Beast',
       'Bee Gees', 'Bette Midler', 'Bill Withers', 'Billie Holiday',
       'Billy Joel', 'Bing Crosby', 'Black Sabbath', 'Blur', 'Bob Dylan',
       'Bob Marley', 'Bob Rivers', 'Bob Seger', 'Bon Jovi', 'Boney M.',
       'Bonnie Raitt', 'Bosson', 'Bread', 'Britney Spears',
       'Bruce Springsteen', 'Bruno Mars', 'Bryan White', 'Cake',
       'Carly Simon', 'Carol Banawa', 'Carpenters', 'Cat Stevens',
       'Celine Dion', 'Chaka Khan

In [8]:
#examining the number of unique artists
len(lyric_data.artist.unique())

643

In [9]:
#examining how many songs each artist has in the dataset
lyric = lyric_data.artist.value_counts().nlargest(25)

lyric_pd = pd.DataFrame({'artist':lyric.index ,'count':lyric.values})
lyric_pd

lyric_pd

Unnamed: 0,artist,count
0,Donna Summer,191
1,Gordon Lightfoot,189
2,Bob Dylan,188
3,George Strait,188
4,Alabama,187
5,Cher,187
6,Loretta Lynn,187
7,Reba Mcentire,187
8,Chaka Khan,186
9,Dean Martin,186


In [10]:
lyric_pd['artist_per'] =  lyric_pd['count'].sum()/lyric_pd['count']

In [11]:
lyric_pd

Unnamed: 0,artist,count,artist_per
0,Donna Summer,191,24.120419
1,Gordon Lightfoot,189,24.375661
2,Bob Dylan,188,24.505319
3,George Strait,188,24.505319
4,Alabama,187,24.636364
5,Cher,187,24.636364
6,Loretta Lynn,187,24.636364
7,Reba Mcentire,187,24.636364
8,Chaka Khan,186,24.768817
9,Dean Martin,186,24.768817


## Examining the Artists the Most Songs 

Since the list is so big I am going to examine the aritsts with the most songs. I will look at the top 25 artists in the list. It looks like Donna Summer has the most songs with 191 songs in the dataset and the 25th ranking artist is Ray Orbison with 178 songs. 



In [12]:
Top25_df = pd.DataFrame(columns= ['artist'])

In [13]:
top_25_artists = lyric_pd.artist
Top25_df['artist'] = top_25_artists

In [14]:
top25_songs = pd.merge(Top25_df, lyric_data, on='artist', how='inner')

In [15]:
top25_songs

Unnamed: 0,artist,song,text
0,Donna Summer,A Runner With The Pack,He's got a second side he's got sixth sense \...
1,Donna Summer,All Through The Night,"You know, boy \nYou're so busy reaching for a..."
2,Donna Summer,Any Way At All,Ooo \nHey baby \nOoo \n \nWe never know \...
3,Donna Summer,Autumn Changes,This love of ours \nIs gradually fading \nSo...
4,Donna Summer,Back In Love Again,Lately I'm feeling glad I'm alive \n'Cause wh...
5,Donna Summer,Back Where You Belong,Ooh I've been down there a million times \nIf...
6,Donna Summer,Bad Girls,Bad girls \nTalking about the sad girls \nSa...
7,Donna Summer,Bad Reputation,"When I was young, my momma always told me \nI..."
8,Donna Summer,Be Myself Again,Let me introduce myself \nI'm a woman that yo...
9,Donna Summer,Black Lady,"It was bad, really mean \nShe had the kind of..."


In [16]:
len(top25_songs)

4607

In [17]:
len(top25_songs.artist.unique())

25

In [18]:
top25_songs.artist.value_counts()

Donna Summer         191
Gordon Lightfoot     189
George Strait        188
Bob Dylan            188
Reba Mcentire        187
Alabama              187
Loretta Lynn         187
Cher                 187
Dean Martin          186
Chaka Khan           186
Neil Young           185
Hank Williams Jr.    185
Indigo Girls         184
Nazareth             184
Cliff Richard        184
America              184
Johnny Cash          183
Kiss                 183
Chris Rea            182
Bon Jovi             181
Fleetwood Mac        180
Dolly Parton         180
Rolling Stones       179
Deep Purple          179
Roy Orbison          178
Name: artist, dtype: int64

## Cleaning the Data
Here I clean the data removing any unnecessary punctuation

In [19]:
#shuffle the order
top25_songs = top25_songs.sample(frac=1).reset_index(drop=True)

In [20]:
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?:-[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [21]:
import nltk
artist_name = []
clean_text = []
twords = []
track_name = []


for i in range(0,top25_songs.shape[0]):  
    z = top25_songs.artist[i]
    artist_name.append(z)
    
    y =top25_songs.song[i]
    track_name.append(y)
    
    x = top25_songs.text[i]
    clean = text_cleaner(x)
    clean_text.append(clean)    
    
    token_words = nltk.word_tokenize(clean)
    twords.append(token_words)

In [22]:
finalsongs = pd.DataFrame()
finalsongs['artist'] = artist_name
finalsongs['cleaned_text'] = clean_text
finalsongs['tokens'] = twords
finalsongs['song name']= track_name

In [23]:
finalsongs.head()

Unnamed: 0,artist,cleaned_text,tokens,song name
0,Donna Summer,Night after night I keep holding on You say yo...,"[Night, after, night, I, keep, holding, on, Yo...",Protection
1,Hank Williams Jr.,There's a big old smokin' crater Where our hou...,"[There, 's, a, big, old, smokin, ', crater, Wh...",It's A Start
2,Bon Jovi,"And there I stood, just like a soldier I was t...","[And, there, I, stood, ,, just, like, a, soldi...",Edge Of A Broken Heart
3,Reba Mcentire,"Hey girl, it's me, just called to tell you hi ...","[Hey, girl, ,, it, 's, me, ,, just, called, to...",My Sister
4,George Strait,She was stormin' through the house that day An...,"[She, was, stormin, ', through, the, house, th...",Give It Away


In [24]:

num_words = []
for i in range(0,finalsongs.shape[0]):
    # Add song length as a new feature
    numw = len(finalsongs.tokens[i])
    num_words.append(numw)
    
finalsongs['number of words'] = num_words

In [25]:
unique_words = []
for i in range(0,finalsongs.shape[0]):
    # Add unique words as a new feature
    unw = len(set(finalsongs.tokens[i]))
    unique_words.append(unw)
    
finalsongs['unique words'] = unique_words

In [26]:
finalsongs['percent unique words'] = finalsongs['unique words']/finalsongs['number of words']

In [27]:
finalsongs.head()

Unnamed: 0,artist,cleaned_text,tokens,song name,number of words,unique words,percent unique words
0,Donna Summer,Night after night I keep holding on You say yo...,"[Night, after, night, I, keep, holding, on, Yo...",Protection,319,115,0.360502
1,Hank Williams Jr.,There's a big old smokin' crater Where our hou...,"[There, 's, a, big, old, smokin, ', crater, Wh...",It's A Start,238,111,0.466387
2,Bon Jovi,"And there I stood, just like a soldier I was t...","[And, there, I, stood, ,, just, like, a, soldi...",Edge Of A Broken Heart,440,123,0.279545
3,Reba Mcentire,"Hey girl, it's me, just called to tell you hi ...","[Hey, girl, ,, it, 's, me, ,, just, called, to...",My Sister,265,151,0.569811
4,George Strait,She was stormin' through the house that day An...,"[She, was, stormin, ', through, the, house, th...",Give It Away,283,134,0.473498


In [28]:
finalsongs.shape

(4607, 7)

In [29]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [30]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [31]:

# this picks the most common words in each speech:
allwords = []
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    for word in text:
        word = word.lower()
        word = wordnet_lemmatizer.lemmatize(word)
        if word.isalnum() == True:
            if word not in stopwords.words('english'):
                allwords.append(word)
            else:
                continue
        else:
            continue
            
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(50)]

In [32]:

# this goes through the whole song list in the dataframe to find the BOW# this g 
all_common_words = []

for i in range(0,finalsongs.shape[0]):
    z = bag_of_words(finalsongs.tokens[i])
    all_common_words.append(z)
    z = []
    
# Can flatten list of lists    [[word, word],[word,word]]  ~  sum(list_of_lists, [])

In [33]:

# this finds the common words among all songs top words:# this f 

cw=[]

for i in range(0,finalsongs.shape[0]):
    for word in all_common_words[i]:
        if word not in cw:
            cw.append(word)
        else:
            continue

In [34]:
print(cw)

['protection', 'need', 'baby', 'love', 'ya', 'want', 'ca', 'night', 'keep', 'think', 'believin', 'live', 'without', 'well', 'home', 'away', 'holding', 'say', 'leave', 'lonely', 'believe', 'word', 'sayin', 'evil', 'game', 'playin', 'still', 'day', 'long', 'got', 'confession', 'help', 'obsession', 'wait', 'telephone', 'call', 'house', 'knock', 'door', 'rush', 'stair', 'open', 'together', 'put', 'arm', 'around', 'sweep', 'confusion', 'surround', 'mind', 'ai', 'thing', 'done', 'enough', 'get', 'heart', 'least', 'better', 'hell', 'start', 'feel', 'old', 'back', 'much', 'could', 'time', 'worst', 'first', 'came', 'feelin', 'name', 'edge', 'broken', 'im', 'man', 'run', 'ill', 'stand', 'turn', 'walk', 'know', 'like', 'oh', 'tell', 'sister', 'friend', 'used', 'wa', 'give', 'said', 'big', 'never', 'reason', 'girl', 'yeah', 'wo', 'come', 'trust', 'ah', 'many', 'tree', 'go', 'called', 'made', 'set', 'sweetheart', 'marina', 'del', 'rey', 'going', 'see', 'part', 'clear', 'tonight', 'across', 'america

In [35]:
print(len(cw))

155


In [36]:

wordcount  = pd.DataFrame(columns=cw)
wordcount['song_lyrics'] = finalsongs.cleaned_text
wordcount.loc[:, cw] = 0

list_of_words = []
for i in range(0,finalsongs.shape[0]):
    
    for word in finalsongs.tokens[i]:
        word = word.lower()
        word = wordnet_lemmatizer.lemmatize(word)
        if word.isalnum() == True:
            if word not in stopwords.words('english'):      
                if word in cw:
                    list_of_words.append(word)

                  
    # Populate the row with word counts.
    for w in list_of_words:
        wordcount.loc[i, w] += 1
    
    # reset list again
    list_of_words = []

In [37]:
word_sum = pd.DataFrame()
a = wordcount.sum()
word_sum['wordcount'] = a

word_sum = word_sum.drop(word_sum.index[len(word_sum)-1])

word_sum['wordcount'] = word_sum['wordcount'].astype(str).astype(int)

word_sum['word']=word_sum.index
word_sum = word_sum.reset_index(drop=True)

word_sum.sort_values(['wordcount'], ascending=[False])

Unnamed: 0,wordcount,word
3,8801,love
80,5450,know
29,5029,got
65,4250,time
81,4079,like
87,4059,wa
101,3745,go
82,3663,oh
2,3585,baby
129,3581,one


In [38]:
d = {}
for a, x in word_sum.values:
    d[x] = a

In [39]:
d

{'protection': 26,
 'need': 1776,
 'baby': 3585,
 'love': 8801,
 'ya': 432,
 'want': 2816,
 'ca': 2644,
 'night': 2562,
 'keep': 1453,
 'think': 1522,
 'believin': 18,
 'live': 768,
 'without': 592,
 'well': 2206,
 'home': 1673,
 'away': 2080,
 'holding': 169,
 'say': 2699,
 'leave': 777,
 'lonely': 621,
 'believe': 908,
 'word': 649,
 'sayin': 95,
 'evil': 59,
 'game': 265,
 'playin': 91,
 'still': 1456,
 'day': 2479,
 'long': 1623,
 'got': 5029,
 'confession': 8,
 'help': 618,
 'obsession': 7,
 'wait': 375,
 'telephone': 64,
 'call': 800,
 'house': 283,
 'knock': 135,
 'door': 629,
 'rush': 68,
 'stair': 32,
 'open': 256,
 'together': 489,
 'put': 580,
 'arm': 469,
 'around': 1356,
 'sweep': 11,
 'confusion': 20,
 'surround': 32,
 'mind': 1335,
 'ai': 1667,
 'thing': 1690,
 'done': 563,
 'enough': 600,
 'get': 3103,
 'heart': 2605,
 'least': 78,
 'better': 878,
 'hell': 323,
 'start': 555,
 'feel': 1904,
 'old': 1308,
 'back': 2197,
 'much': 1067,
 'could': 2226,
 'time': 4250,
 'wor

In [40]:

# Need to make my BOWs into integers# Need t 
bow_integers = list(wordcount.columns)
bow_integers = bow_integers[:-1]
wordcount[bow_integers] = wordcount[bow_integers].astype(str).astype(int)

In [41]:
wordcount.head()

Unnamed: 0,protection,need,baby,love,ya,want,ca,night,keep,think,...,life,world,little,right,eye,hey,would,every,look,song_lyrics
0,16,16,13,10,4,4,3,2,2,2,...,0,0,0,0,0,0,0,0,0,Night after night I keep holding on You say yo...
1,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,There's a big old smokin' crater Where our hou...
2,0,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,"And there I stood, just like a soldier I was t..."
3,0,0,0,0,0,0,1,1,0,2,...,0,0,0,0,0,1,0,0,0,"Hey girl, it's me, just called to tell you hi ..."
4,0,0,0,3,0,1,3,2,0,0,...,0,0,0,1,0,0,0,0,0,She was stormin' through the house that day An...


In [42]:
result = pd.concat([wordcount, finalsongs], axis=1)

In [43]:
result.head()

Unnamed: 0,protection,need,baby,love,ya,want,ca,night,keep,think,...,every,look,song_lyrics,artist,cleaned_text,tokens,song name,number of words,unique words,percent unique words
0,16,16,13,10,4,4,3,2,2,2,...,0,0,Night after night I keep holding on You say yo...,Donna Summer,Night after night I keep holding on You say yo...,"[Night, after, night, I, keep, holding, on, Yo...",Protection,319,115,0.360502
1,0,0,0,0,0,0,0,0,0,2,...,0,0,There's a big old smokin' crater Where our hou...,Hank Williams Jr.,There's a big old smokin' crater Where our hou...,"[There, 's, a, big, old, smokin, ', crater, Wh...",It's A Start,238,111,0.466387
2,0,1,1,1,0,0,0,0,0,1,...,0,0,"And there I stood, just like a soldier I was t...",Bon Jovi,"And there I stood, just like a soldier I was t...","[And, there, I, stood, ,, just, like, a, soldi...",Edge Of A Broken Heart,440,123,0.279545
3,0,0,0,0,0,0,1,1,0,2,...,0,0,"Hey girl, it's me, just called to tell you hi ...",Reba Mcentire,"Hey girl, it's me, just called to tell you hi ...","[Hey, girl, ,, it, 's, me, ,, just, called, to...",My Sister,265,151,0.569811
4,0,0,0,3,0,1,3,2,0,0,...,0,0,She was stormin' through the house that day An...,George Strait,She was stormin' through the house that day An...,"[She, was, stormin, ', through, the, house, th...",Give It Away,283,134,0.473498


In [44]:
result.describe()

Unnamed: 0,protection,need,baby,love,ya,want,ca,night,keep,think,...,little,right,eye,hey,would,every,look,number of words,unique words,percent unique words
count,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,...,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0,4607.0
mean,0.005644,0.3855,0.778164,1.910354,0.09377,0.611244,0.573909,0.55611,0.31539,0.330367,...,0.360538,0.394183,0.307358,0.188626,0.340135,0.292815,0.292815,225.896028,96.267853,0.448767
std,0.242944,1.704974,2.377493,3.877577,0.74748,1.871278,1.487288,1.555546,1.19184,1.139926,...,1.36039,1.144391,0.860528,1.360625,1.01594,1.000433,0.933758,92.985078,35.14642,0.114915
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,15.0,0.071642
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,159.0,73.0,0.36772
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,209.0,92.0,0.444444
75%,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275.0,113.0,0.527778
max,16.0,80.0,36.0,67.0,24.0,32.0,23.0,33.0,24.0,28.0,...,29.0,20.0,15.0,30.0,15.0,18.0,16.0,931.0,395.0,0.834711


In [45]:
artist_cleaned_txt_df =  result[['artist','cleaned_text']]

## Splitting data

Make sure artists are distributed porportionally after the split

In [46]:
artist_cleaned_txt_df.head()

Unnamed: 0,artist,cleaned_text
0,Donna Summer,Night after night I keep holding on You say yo...
1,Hank Williams Jr.,There's a big old smokin' crater Where our hou...
2,Bon Jovi,"And there I stood, just like a soldier I was t..."
3,Reba Mcentire,"Hey girl, it's me, just called to tell you hi ..."
4,George Strait,She was stormin' through the house that day An...


In [47]:
newX =result['cleaned_text']
newy = artist_cleaned_txt_df['artist']

In [48]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
sss.get_n_splits(newX, newy)

5

In [49]:
for train_index, test_index in sss.split(newX, newy):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = newX[train_index], newX[test_index]
    y_train, y_test = newy[train_index], newy[test_index]

TRAIN: [4298 4217 3089 ... 3626 1447  337] TEST: [2408 3464 4576 ... 2357  254 1409]
TRAIN: [ 809 1915  188 ... 2441 3148 2400] TEST: [4416 2605 2786 ... 1456 1343 4409]
TRAIN: [2012 4318 4172 ... 4049 1691 2097] TEST: [3521 1916 1199 ... 4511 3461 2066]
TRAIN: [1263 3466 3049 ... 2231 3619 1654] TEST: [ 404  741  164 ...  972 1273 3893]
TRAIN: [2775 1236  724 ... 2494 1245  138] TEST: [1057 3636 2502 ... 4503 2372 1786]


In [50]:
y_test.value_counts()

Donna Summer         48
Cher                 47
George Strait        47
Loretta Lynn         47
Dean Martin          47
Reba Mcentire        47
Alabama              47
Bob Dylan            47
Gordon Lightfoot     47
Hank Williams Jr.    46
Kiss                 46
Johnny Cash          46
Cliff Richard        46
America              46
Neil Young           46
Nazareth             46
Indigo Girls         46
Chaka Khan           46
Bon Jovi             45
Fleetwood Mac        45
Rolling Stones       45
Deep Purple          45
Dolly Parton         45
Chris Rea            45
Roy Orbison          44
Name: artist, dtype: int64

## BoW Feature

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [52]:
train_data_bow = count_vect.fit_transform(X_train)
test_data_bow = count_vect.transform(X_test)

In [54]:
train_data_bow

<3455x14150 sparse matrix of type '<class 'numpy.int64'>'
	with 289531 stored elements in Compressed Sparse Row format>

In [56]:
test_data_bow

<1152x14150 sparse matrix of type '<class 'numpy.int64'>'
	with 94364 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn import ensemble
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random Forest Classifier
rfc_bow1 = ensemble.RandomForestClassifier(random_state=42)
rfc_bow1.fit(train_data_bow,y_train)
cv_bow1 = cross_val_score(rfc_bow1, train_data_bow, y_train, cv=5)


#Test
y_true, y_pred = y_test, rfc_bow1.predict(test_data_bow)
print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


print("Training set Cross validation = {}".format(cv_bow1))
print("mean = {:.3}".format(cv_bow1.mean()))
print(pd.crosstab(y_true, y_pred))

Test Set score = 0.105 
Training set Cross validation = [0.09728183 0.12931034 0.10693642 0.10334789 0.10132159]
mean = 0.108
col_0              Alabama  America  Bob Dylan  Bon Jovi  Chaka Khan  Cher  \
artist                                                                       
Alabama                  5        0          4         2           1     4   
America                  2        9          6         1           5     1   
Bob Dylan                1        1          5         4           0     1   
Bon Jovi                 2        0          1        15           3     2   
Chaka Khan               3        3          1         1           6     5   
Cher                     2        2          3         2           6     1   
Chris Rea                3        6          1         1           3     0   
Cliff Richard            3        3          5         3           3     1   
Dean Martin              4        4          2         0           4     3   
Deep Purple     

[25 rows x 25 columns]


In [60]:
df_bow_rfc_pct = pd.crosstab(y_true, y_pred, margins = True)

rfc_bow_df = pd.DataFrame(columns = ['artist','rfc_bow_correct','rfc_lsa_svm_total','rfc_bow_pct_correct'])
rfc_bow_correct = []
rfc_bow_total = []
pct_correct = []
all_correct = 0
for artist in df_bow_rfc_pct:
        artist_correct = df_bow_rfc_pct.loc[artist,artist]
        rfc_bow_correct.append(artist_correct)
        t = df_bow_rfc_pct.loc[artist,'All']   
        rfc_bow_total.append(t)
        all_correct += artist_correct
        a = round(artist_correct/t*100,0)
        pct_correct.append(a)
        
rfc_bow_df['rfc_bow_correct'] = rfc_bow_correct
rfc_bow_df['rfc_lsa_svm_total']= rfc_bow_total
rfc_bow_df['artist'] = lyric_pd
rfc_bow_df['rfc_bow_pct_correct'] = pct_correct

rfc_bow_df.sort_values('rfc_bow_pct_correct',ascending = False)

Unnamed: 0,artist,rfc_bow_correct,rfc_lsa_svm_total,rfc_bow_pct_correct
25,,1152,1152,100.0
3,George Strait,15,45,33.0
6,Loretta Lynn,9,45,20.0
1,Gordon Lightfoot,9,46,20.0
8,Chaka Khan,8,47,17.0
18,Chris Rea,7,46,15.0
4,Alabama,6,46,13.0
16,Johnny Cash,6,46,13.0
10,Neil Young,6,45,13.0
11,Hank Williams Jr.,6,48,12.0


## Support Vector Machine- BoW

In [66]:
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVC
svm_bow = SVC(kernel='linear')
svm_bow.fit(train_data_bow,y_train)
svm_bow_cv = cross_val_score(svm_bow, train_data_bow, y_train, cv=5)


print("Report:")
y_true, y_pred = y_test, svm_bow.predict(test_data_bow)
#print(classification_report(y_true, y_pred))

print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


print("Training set Cross validation = {}".format(svm_bow_cv))
print("mean = {:.3}".format(svm_bow_cv.mean()))
pd.crosstab(y_true, y_pred)

Report:
Test Set score = 0.203 
Training set Cross validation = [0.19885551 0.21264368 0.19219653 0.20815138 0.23054332]
mean = 0.208


col_0,Alabama,America,Bob Dylan,Bon Jovi,Chaka Khan,Cher,Chris Rea,Cliff Richard,Dean Martin,Deep Purple,...,Hank Williams Jr.,Indigo Girls,Johnny Cash,Kiss,Loretta Lynn,Nazareth,Neil Young,Reba Mcentire,Rolling Stones,Roy Orbison
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,3,2,0,1,2,2,3,2,1,0,...,4,0,2,2,0,1,2,2,1,2
America,1,11,1,0,2,3,2,2,1,1,...,0,0,0,2,2,0,5,4,1,1
Bob Dylan,0,2,12,1,1,1,1,2,2,1,...,0,7,0,1,1,1,3,1,0,0
Bon Jovi,2,1,1,15,1,3,0,2,0,2,...,2,3,0,2,0,1,1,0,1,0
Chaka Khan,0,1,1,1,9,4,1,3,2,0,...,0,1,1,3,2,0,1,0,1,1
Cher,4,0,2,2,5,6,2,2,1,0,...,0,2,2,2,2,1,1,3,2,0
Chris Rea,0,0,2,0,2,0,27,1,2,4,...,1,0,0,0,1,0,1,1,1,2
Cliff Richard,1,2,1,1,4,3,2,4,1,4,...,0,3,2,4,0,2,1,1,1,0
Dean Martin,0,2,1,1,3,0,5,4,8,2,...,3,0,2,0,1,1,3,3,1,3
Deep Purple,0,5,5,0,0,0,2,1,2,7,...,0,1,1,3,1,5,3,0,0,1


In [67]:
df_bow_svm_pct = pd.crosstab(y_true, y_pred, margins = True)

svm_df = pd.DataFrame(columns = ['artist','svm_correct','svm_total','svm_pct_correct'])
svm_correct = []
svm_total = []
pct_correct = []
all_correct = 0
for artist in df_bow_svm_pct:
        artist_correct = df_bow_svm_pct.loc[artist,artist]
        svm_correct.append(artist_correct)
        t = df_bow_svm_pct.loc[artist,'All']   
        svm_total.append(t)
        all_correct += artist_correct
        a = round(artist_correct/t*100,0)
        pct_correct.append(a)
        
svm_df['svm_correct'] = svm_correct
svm_df['svm_total']= svm_total
svm_df['artist'] = lyric_pd
svm_df['svm_pct_correct'] = pct_correct

svm_df.sort_values('svm_pct_correct',ascending = False)

Unnamed: 0,artist,svm_correct,svm_total,svm_pct_correct
25,,1152,1152,100.0
6,Loretta Lynn,27,45,60.0
16,Johnny Cash,19,46,41.0
14,Indigo Girls,16,47,34.0
3,George Strait,15,45,33.0
19,Bon Jovi,13,47,28.0
15,America,13,46,28.0
2,Bob Dylan,12,47,26.0
22,Rolling Stones,12,47,26.0
1,Gordon Lightfoot,11,46,24.0


## Tfid Feature Selection

In [68]:
vectorizer = TfidfVectorizer(max_df=0.4, # drop words that occur in more than half the paragraphs
                             min_df=1, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
song_tfidf=vectorizer.fit_transform(newX)
print("Number of features: %d" % song_tfidf.get_shape()[1])

#splitting into training and test sets. Reserving 25% of my corpus as a test set.



Number of features: 15845


In [69]:
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(song_tfidf, 
                                                                newy,
                                                                test_size=0.25,
                                                                random_state=0)


## Latent Semantic Analysis
Let's use Latent Semantic Analysis to reduce the tf-idf into a lower dimensional space. This is similar to Principal Component Analysis (PCA), where we reduce the number of rows and combine them into new dimensions.

In [70]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(2500)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

songs_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(10):
    print('Component {}:'.format(i))
    print(songs_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 97.5211495757385
Component 0:
cleaned_text
Don't tell me you're sorry for thing you caused me And from from now on you'll be true And don't don't you ask me to forgive you again I'm fresh out of forgiveness for you Now I, I gave you everything I knew how to give Now I've done done everything you've ever asked me to do But even even a fool gets so tired being fool I'm fresh out of forgiveness for you No I I can't I just can't I can't forgive you this time if I did oh I know you do Well you're just too just too wrong and you hurt me hurt me again And I'm fresh out of forgiveness for you I'm fresh out of forgiveness for you                                                                                                                                                                                                                                                                                                                                       

Name: 0, dtype: float64
Component 1:
cleaned_text
When I feel you dreaming I think of sunsets How high my high gets Chorus: I wanna give to you an everlasting love I fill your life with a satisfying love All you need is an everlasting love All you want is a satisfying (mystifying) love Sun breezes, moonlight teases Friendly invasion Late night persuasion Chorus Each time you pass me by You saw me fade away I'll give you more each day                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

Name: 1, dtype: float64
Component 2:
cleaned_text
I'm flyin' in a 747, I'm passing by the pearly gates And I'm comin' real close to Heaven And my guitar just can't wait, it just can't wait In France really had the chance, yeah, there was plenty romance I've been to England too, there wasn't much to do One thing I know is true, what I would rather do is Rockin' in the U.S.A. Nowhere else I'd rather stay Rockin' and a-rollin', rockin' and a-rollin' Rockin' in the U.S.A. Germany was really neat, Japan had lots to eat And Denmark was great, but I just can't wait, rockin' in the U.S.A. Rockin' in the U.S.A. (rockin' in the U.S.A.) Nowhere else I'd rather stay (rockin' in the U.S.A.) Rockin' and a-rollin' (rockin' and rollin') Rockin' and a-rollin' (rockin' and rollin') Rockin' in the U.S.A. Rockin' in the U.S.A. (rockin' in the U.S.A.) Nowhere else I'd rather stay (rockin' in the U.S.A.) Rockin' and a-rollin' (rockin' and rollin') Rockin' and a-rollin' (rockin' and rollin') Rockin' in the U

Name: 2, dtype: float64
Component 3:
cleaned_text
In for the night With greg allman Snuggle with me mama like you used to Weather's kind of cold, but I don't care Slip off your old red flannel nightgown Getting to feel like fall Hey we're a fallen pair [Chorus:] Well there's a bluebird Flying home to Mobile Camping in your cornfield for a while Seems he just backed into a square meal And he's in for the night See that fine yellow moon a rising Through the frost along the window pane All of your shooting stars Are in the wrong direction Well I love you darling But you just seem to change [Chorus:] Strut with me mama like you used to Weather's kind of cold, but I don't care Slip off your old red flannel nightgown Getting to feel like falling with a fallin' fare [Chorus:]                                                                                                                                                                                                                             

Name: 3, dtype: float64
Component 4:
cleaned_text
If you see her, say hello, she might be in Tangier She left here last early spring, is livin' there, I hear Say for me that I'm all right though things get kind of slow She might think that I've forgotten her, don't tell her it isn't so. We had a falling-out, like lovers often will And to think of how she left that night, it still brings me a chill And though our separation, it pierced me to the heart She still lives inside of me, we've never been apart. If you get close to her, kiss her once for me I always have respected her for busting out and gettin' free Oh, whatever makes her happy, I won't stand in the way Though the bitter taste still lingers on from the night I tried to make her stay. I see a lot of people as I make the rounds And I hear her name here and there as I go from town to town And I've never gotten used to it, I've just learned to turn it off Either I'm too sensitive or else I'm gettin' soft. Sundown, yellow moon, I r

Name: 4, dtype: float64
Component 5:
cleaned_text
I'm flyin' in a 747, I'm passing by the pearly gates And I'm comin' real close to Heaven And my guitar just can't wait, it just can't wait In France really had the chance, yeah, there was plenty romance I've been to England too, there wasn't much to do One thing I know is true, what I would rather do is Rockin' in the U.S.A. Nowhere else I'd rather stay Rockin' and a-rollin', rockin' and a-rollin' Rockin' in the U.S.A. Germany was really neat, Japan had lots to eat And Denmark was great, but I just can't wait, rockin' in the U.S.A. Rockin' in the U.S.A. (rockin' in the U.S.A.) Nowhere else I'd rather stay (rockin' in the U.S.A.) Rockin' and a-rollin' (rockin' and rollin') Rockin' and a-rollin' (rockin' and rollin') Rockin' in the U.S.A. Rockin' in the U.S.A. (rockin' in the U.S.A.) Nowhere else I'd rather stay (rockin' in the U.S.A.) Rockin' and a-rollin' (rockin' and rollin') Rockin' and a-rollin' (rockin' and rollin') Rockin' in the U

Name: 5, dtype: float64
Component 6:
cleaned_text
Baby's in the back seat, it's so real Gotta love the feel of the automobile You get a certain style when you're so mobile And responsibility you can avoid that trial Walkin' on the wild side Lookin' at the down side Just a little crazy Got a kind of strange pride Standin' at the back door, mercy me Ain't no place for the man to be It's a private love, no publicity And the woman that he's waitin' for is bad news, he'll see Goin' to a go-go Slippin' to a life low Goin' for the fake tan Dreamin' up a fashion plan Loverman He's a very superficial man He likes the girls that go there He's a dude with a master plan So young but why should he care? Drivin' into downtown right on time Smokin' like a pistol aimed on line Waitin' for the tingles in his back All those dangerous feelings getting' ready to attack Walkin' on the barbed wire Playin' with desire Goin' for the night plan Breakin' any heart he can Loverman He's a very superficial man He 

Name: 6, dtype: float64
Component 7:
cleaned_text
She said I don't recall Seeing you around here You must be new to this town Said, I'm just passin through But, girl from the looks of you I Could see me settling down The she smiled and said the invitations open Cause you look just like what I've been waiting on. So I said why don't we take This matter somewhere else And get to know this feeling that's so strong Lead on She said, I had a love once But he just up and left me I said I bet it broke your heart I had a love once to but I acted like a fool Oh what I'd give to be back in her arms Then she smiled and said the invitations open Cause you look just like what I've been waiting on. So I said why don't we take This matter somewhere else And pick up right where everything went wrong Lead on She said I don't recall Seeing you around here You must be new to this town                                                                                                                          

Name: 7, dtype: float64
Component 8:
cleaned_text
Sister I'm heading out of alabama So you better think fast The wind is gonna pick me up now And the rain won't slack I'm heading down to georgia So don't lose track Of where you've been and where you're going now Who you've seen and what they lack And why you come undone Every time you go there You come undone Yeah Yeah I know your heart's in danger And so is your life I said you learn to trust a stranger And stop and rest for the night Set your sight up in the headlight Well the moon won't be enough (sister) And light the embers of another And the night won't won't seem so rough Sister Ah There was a bloody beast of burden (i wish I was a mother) On a dark texas road (with a baby at my breast) A woman in a family way (i am just a farmer) And a car that lost control (with a milkcow and a mess) I'm studying the distance (i wish I was a mother) Between the blanket and the gun (with a baby at my breast) I got ten hours to natchez (i am jus

Name: 8, dtype: float64
Component 9:
cleaned_text
We blew off immigration The moon was sitting high We drove from lacandon into comitan And Gloria was singing And Cecelia closed her eyes And I saw them Drifting out over the night sky I said neuva senoritas Are you gone to brighter days Have you found your greener valley And the place where your heart stays I'm headed back to the flatland's And you're headed up to the hills Rain brings you home middle of July I guess I just got lonesome When I think about how you feel Six months gone And no one to dry your eyes I said nueva senoritas Are you gone to brighter days Have you found your greener valley And the place where your heart stays I said nueva senoritas Are you gone to brighter days Have you found your greener valley And the place where your heart stays                                                                                                                                                                                        

Name: 9, dtype: float64


## LSA Component Examination
### Component 1
- This component grouped songs by breakup. They were songs about past lovers and a yearning to get back with them. Here are some sample lyrics.
"I don't know why I ever let us drift apart"
" All the way I met you head on full speed At the heart the blue flame burns All the way I took the crash course impact"
"When you found somebody new I thought I never would Forget you"

### Component 2
- This next component was related to nature stuff. Although it was a continuing theme of being about love.
"When I feel you dreaming I think of sunsets"
"Oh, I'm sailin' away my own true love, I'm sailin' away in the morning."
"Red sun a-rising, over the hill I've had enough of this desert"

### Component 3
- The component mentions locations.
"Rockin' in the U.S.A. Germany was really neat, Japan had lots to eat And Denmark was great"
"I left a little town A little south of Hudson Bay"
" A prodigal returning to the girl in Saskatoon "

### Component 4
- This component took some time to read through and i had originally thought that it was just another grouping of love songs, which it is. But, in this case they talk about clothing items.
"I don't care Slip off your old red flannel nightgown"
"The blue western denim, the east coats razor edge"
"Momma's got her long dress on "

### Component 5
- This one was hard to decipher and it just looks like a mix of love songs.

Component 6 through 10 were a little harder to decipher and seemed like a hogdgepodge of songs mostly related to love. It seems that most of thes artists sing about love lost. This was a great exercise in truying to group songs by sentiment but did not do much in terms of grouping songs into artists.


## K-means Clustering (Sentiment Attempt)

In [97]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X_train_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [98]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:
Cluster 0:
 ve
 home
 got
 way
 goodbye
 time
 long
 seen
 say
 come
Cluster 1:
 away
 blue
 way
 day
 life
 easy
 like
 ah
 believe
 say
Cluster 2:
 oh
 christmas
 ll
 let
 time
 need
 like
 merry
 make
 night
Cluster 3:
 yeah
 woman
 la
 mind
 ooh
 tonight
 like
 honky
 tonk
 crazy
Cluster 4:
 like
 night
 time
 let
 come
 chorus
 say
 world
 ll
 feel
Cluster 5:
 gonna
 ll
 let
 come
 got
 chorus
 gotta
 baby
 cause
 like
Cluster 6:
 heart
 fool
 start
 say
 let
 ve
 break
 chorus
 ll
 stop
Cluster 7:
 ll
 time
 believe
 come
 ve
 think
 say
 way
 make
 won
Cluster 8:
 baby
 want
 come
 let
 got
 need
 like
 oh
 make
 yeah
Cluster 9:
 got
 man
 hey
 ain
 little
 good
 like
 said
 old
 way


Theses clusters based solely on lyrics came out pretty incoherent. The goal was to test if the model could create meaningful sentiment clusters.

## Classification Clustering Using LSA

### Using K-Means Clusters

In [99]:
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

# Normalize the data.
X_norm = normalize(X_train_lsa)

# Calculate predicted values.
y_pred = KMeans(n_clusters=25, random_state=42).fit_predict(X_norm)

# Check the solution against the data.
print('Comparing k-means clusters against artists:')
pd.crosstab(y_train, y_pred)

Comparing k-means clusters against artists:


col_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,2,4,8,51,3,0,16,2,12,8,...,1,5,6,0,4,2,8,1,4,0
America,0,4,5,59,0,2,9,5,8,2,...,0,6,5,4,2,0,3,3,2,0
Bob Dylan,1,6,7,51,3,2,10,1,26,4,...,3,6,6,0,1,0,4,0,5,0
Bon Jovi,0,7,2,39,4,2,18,6,2,12,...,1,8,2,8,2,1,0,2,7,1
Chaka Khan,0,11,4,37,4,1,21,1,7,8,...,1,5,3,0,2,3,0,1,8,1
Cher,1,9,7,29,4,1,28,7,5,6,...,6,12,4,3,2,2,1,0,9,2
Chris Rea,1,5,4,62,4,1,5,5,5,4,...,2,5,8,4,0,8,1,1,1,1
Cliff Richard,1,8,3,49,3,1,13,4,3,8,...,2,10,2,2,3,3,3,1,3,1
Dean Martin,0,4,5,58,3,1,5,1,4,2,...,1,10,3,12,4,1,0,1,8,1
Deep Purple,0,8,8,41,4,4,4,2,9,11,...,2,9,3,3,4,0,0,2,2,0


### Mean Shift

## Supervised Learning Models for Classification

### Random Forest Classifier

In [100]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

rfc_lsa1 = ensemble.RandomForestClassifier(random_state=42)
rfc_lsa1.fit(X_train_lsa,y_train)
cv_lsa1 = cross_val_score(rfc_lsa1, X_train_lsa, y_train, cv=5)


#Test
y_true, y_pred = y_test, rfc_lsa1.predict(X_test_lsa)
print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


print("Training set Cross validation = {}".format(cv_lsa1))
print("mean = {:.3}".format(cv_lsa1.mean()))
print(pd.crosstab(y_true, y_pred))


Test Set score = 0.0694 
Training set Cross validation = [0.04125178 0.06446991 0.06367583 0.04239766 0.05301915]
mean = 0.053
col_0              Alabama  America  Bob Dylan  Bon Jovi  Chaka Khan  Cher  \
artist                                                                       
Alabama                  2        4          1         3           4     5   
America                  8        5          5         8           3     1   
Bob Dylan                4        1          2         3           1     6   
Bon Jovi                 4        5          1         6           3     2   
Chaka Khan              10        5          4         3           9     0   
Cher                     4        1          8         1           2     3   
Chris Rea                5        2          1         3           1     2   
Cliff Richard            3        3          5         0           3     2   
Dean Martin              7        3          1         1           2     5   
Deep Purple    

[25 rows x 25 columns]


In [107]:
df_lsa_rfc_pct = pd.crosstab(y_true, y_pred, margins = True)

rfc_lsa_df = pd.DataFrame(columns = ['artist','rfc_lsa_correct','rfc_lsa_svm_total','rfc_lsa_pct_correct'])
rfc_lsa_correct = []
rfc_lsa_total = []
pct_correct = []
all_correct = 0
for artist in df_lsa_rfc_pct:
        artist_correct = df_lsa_rfc_pct.loc[artist,artist]
        rfc_lsa_correct.append(artist_correct)
        t = df_lsa_rfc_pct.loc[artist,'All']   
        rfc_lsa_total.append(t)
        all_correct += artist_correct
        a = round(artist_correct/t*100,0)
        pct_correct.append(a)
        
rfc_lsa_df['rfc_lsa_correct'] = rfc_lsa_correct
rfc_lsa_df['rfc_lsa_svm_total']= rfc_lsa_total
rfc_lsa_df['artist'] = lyric_pd
rfc_lsa_df['rfc_lsa_pct_correct'] = pct_correct

rfc_lsa_df.sort_values('rfc_lsa_pct_correct',ascending = False)

Unnamed: 0,artist,rfc_lsa_correct,rfc_lsa_svm_total,rfc_lsa_pct_correct
25,,1152,1152,100.0
4,Alabama,9,57,16.0
3,George Strait,6,50,12.0
12,Nazareth,7,60,12.0
8,Chaka Khan,6,53,11.0
22,Rolling Stones,4,35,11.0
14,Indigo Girls,5,45,11.0
13,Cliff Richard,5,51,10.0
6,Loretta Lynn,4,41,10.0
1,Gordon Lightfoot,5,55,9.0


Random forest Classifer performs very poorly here with a accuracy score of only 5%. It didn't predict any somngs correctly for Fleetwood Mac on the the test set and the band with the highest prediction accuracy was Alabama. Next I will use Support Vector Machine. 

## Support Vector Machine

In [108]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc_model = svc.fit(X_train_lsa,y_train)
print(svc_model)
print('Training set score:', svc_model.score(X_train_lsa,y_train))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Training set score: 0.8946454413892909


In [109]:
from sklearn.metrics import confusion_matrix

from sklearn.svm import SVC
svm_t200 = SVC(kernel='linear')
svm_t200.fit(X_train_lsa,y_train)
svm_t200_cv = cross_val_score(svm_t200, X_train_lsa, y_train, cv=5)


print("Report:")
y_true, y_pred = y_test, svm_t200.predict(X_test_lsa)
#print(classification_report(y_true, y_pred))

print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


print("Training set Cross validation = {}".format(svm_t200_cv))
print("mean = {:.3}".format(svm_t200_cv.mean()))
print(pd.crosstab(y_true, y_pred))



Report:
Test Set score = 0.214 
Training set Cross validation = [0.21621622 0.18767908 0.2170767  0.23684211 0.19587629]
mean = 0.211
col_0              Alabama  America  Bob Dylan  Bon Jovi  Chaka Khan  Cher  \
artist                                                                       
Alabama                  7        1          2         1           3     1   
America                  1        5          1         1           0     1   
Bob Dylan                1        0         12         1           1     1   
Bon Jovi                 3        0          4        18           1     2   
Chaka Khan               1        0          1         3           6     4   
Cher                     2        0          4         1           1     7   
Chris Rea                3        0          4         0           0     2   
Cliff Richard            2        1          1         2           2     3   
Dean Martin              3        2          1         0           0     3   
Deep Pur

[25 rows x 25 columns]


In [110]:
pd.crosstab(y_true, y_pred, margins = True)

col_0,Alabama,America,Bob Dylan,Bon Jovi,Chaka Khan,Cher,Chris Rea,Cliff Richard,Dean Martin,Deep Purple,...,Indigo Girls,Johnny Cash,Kiss,Loretta Lynn,Nazareth,Neil Young,Reba Mcentire,Rolling Stones,Roy Orbison,All
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,7,1,2,1,3,1,0,0,1,0,...,1,6,0,3,0,0,5,1,2,45
America,1,5,1,1,0,1,3,2,2,0,...,3,1,4,1,1,0,4,1,3,55
Bob Dylan,1,0,12,1,1,1,0,1,1,2,...,0,3,1,1,2,1,1,6,0,41
Bon Jovi,3,0,4,18,1,2,0,2,0,2,...,1,1,4,1,1,0,5,1,0,50
Chaka Khan,1,0,1,3,6,4,0,3,2,1,...,1,2,4,1,3,1,10,4,1,57
Cher,2,0,4,1,1,7,2,2,0,2,...,2,0,3,0,1,0,2,1,1,38
Chris Rea,3,0,4,0,0,2,17,2,1,2,...,0,0,0,0,0,0,1,0,1,41
Cliff Richard,2,1,1,2,2,3,3,3,4,2,...,1,3,4,4,1,1,4,2,1,51
Dean Martin,3,2,1,0,0,3,1,4,14,1,...,0,1,1,3,0,2,1,4,2,53
Deep Purple,0,1,1,0,1,1,5,3,0,16,...,2,2,1,0,1,0,1,1,2,48


In [112]:
df_lsa_svm_pct = pd.crosstab(y_true, y_pred, margins = True)

svm_df = pd.DataFrame(columns = ['artist','svm_correct','svm_total','svm_pct_correct'])
svm_correct = []
svm_total = []
pct_correct = []
all_correct = 0
for artist in df_lsa_svm_pct:
        artist_correct = df_lsa_svm_pct.loc[artist,artist]
        svm_correct.append(artist_correct)
        t = df_lsa_svm_pct.loc[artist,'All']   
        svm_total.append(t)
        all_correct += artist_correct
        a = round(artist_correct/t*100,0)
        pct_correct.append(a)
        
svm_df['svm_correct'] = svm_correct
svm_df['svm_total']= svm_total
svm_df['artist'] = lyric_pd
svm_df['svm_pct_correct'] = pct_correct

svm_df.sort_values('svm_pct_correct',ascending = False)

Unnamed: 0,artist,svm_correct,svm_total,svm_pct_correct
25,,1152,1152,100.0
22,Rolling Stones,16,35,46.0
15,America,20,44,45.0
6,Loretta Lynn,17,41,41.0
19,Bon Jovi,15,40,38.0
3,George Strait,18,50,36.0
9,Dean Martin,16,48,33.0
2,Bob Dylan,12,41,29.0
14,Indigo Girls,12,45,27.0
13,Cliff Richard,14,51,27.0


Now this performed markedly better than RFC but still only returned an accuracy score of 21%. The most accurate artist was the Rolling Stones with 46% accuracy. Oddly enough, the top performing artist in the RFC, Alabama, only had an accuracy of 11%. Let's tr Gradient Boosting Classifier.

## Gradient Boosting Classifier

In [113]:

# We'll make 500 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'random_state': 42,
          'max_depth': 2,
          'loss': 'deviance'}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train_lsa, y_train)


clf_cv = cross_val_score(clf, X_train_lsa, y_train, cv=5)

#Test
y_true, y_pred = y_test, clf.predict(X_test_lsa)

print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


print("Training set Cross validation = {}".format(clf_cv))
print("mean = {:.3}".format(clf_cv.mean()))
print(pd.crosstab(y_true, y_pred))

Test Set score = 0.0703 
Training set Cross validation = [0.05832148 0.06590258 0.05499276 0.06578947 0.06038292]
mean = 0.0611
col_0              Alabama  America  Bob Dylan  Bon Jovi  Chaka Khan  Cher  \
artist                                                                       
Alabama                  3        0          3         0           2     1   
America                  0        3          5         5           1     4   
Bob Dylan                0        5          1         0           1     0   
Bon Jovi                 0        2          1         5           1     5   
Chaka Khan               1        1          3         4           5     5   
Cher                     1        1          2         5           5     7   
Chris Rea                4        0          1         3           1     2   
Cliff Richard            2        1          2         5           1     2   
Dean Martin              0        1          3         3           0     3   
Deep Purple   

[25 rows x 25 columns]


This time Gradient Boosting only had an accuracy of 7%. Lastly, let's give Logisitc Regression a try.

## Logistic Regression

In [114]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lf_model = lr.fit(X_train_lsa,y_train)
print(lf_model)
print('Training set score:', lr.score(X_train_lsa,y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Training set score: 0.8370477568740955


In [115]:
y_true, y_pred = y_test, lr.predict(X_test_lsa)
#print(classification_report(y_true, y_pred))

print("Test Set score = {:.3} ".format(accuracy_score(y_true, y_pred)))


Test Set score = 0.228 


In [117]:
pd.crosstab(y_true, y_pred)

col_0,Alabama,America,Bob Dylan,Bon Jovi,Chaka Khan,Cher,Chris Rea,Cliff Richard,Dean Martin,Deep Purple,...,Hank Williams Jr.,Indigo Girls,Johnny Cash,Kiss,Loretta Lynn,Nazareth,Neil Young,Reba Mcentire,Rolling Stones,Roy Orbison
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,5,1,1,0,3,0,2,0,1,0,...,3,2,5,1,3,0,1,7,0,1
America,2,5,1,2,0,1,4,1,2,0,...,2,2,1,4,2,0,0,5,1,3
Bob Dylan,0,0,9,2,1,1,0,0,1,2,...,5,0,5,3,1,3,1,2,3,0
Bon Jovi,2,0,4,15,1,2,2,0,0,2,...,1,5,0,3,2,1,2,5,1,1
Chaka Khan,0,0,1,2,6,7,1,2,1,0,...,1,1,2,4,4,3,0,11,2,1
Cher,1,0,3,2,1,5,2,1,0,2,...,0,2,0,3,0,1,0,4,2,2
Chris Rea,1,0,2,0,0,0,23,1,1,2,...,1,1,0,0,0,0,0,2,0,0
Cliff Richard,2,1,1,1,0,3,4,1,5,1,...,3,1,1,4,3,0,1,12,3,1
Dean Martin,2,1,3,0,0,4,2,2,18,1,...,2,0,1,1,3,0,2,1,3,1
Deep Purple,0,0,0,1,0,0,7,3,1,15,...,4,1,2,3,0,0,1,1,2,1


In [119]:
df_lsa_lr_pct = pd.crosstab(y_true, y_pred, margins = True)

lr_lsa_df = pd.DataFrame(columns = ['artist','lr_correct','lr_total','lr_pct_correct'])
lr_correct = []
lr_total = []
pct_correct = []
all_correct = 0
for artist in df_lsa_lr_pct:
        artist_correct = df_lsa_lr_pct.loc[artist,artist]
        lr_correct.append(artist_correct)
        t = df_lsa_lr_pct.loc[artist,'All']   
        lr_total.append(t)
        all_correct += artist_correct
        a = round(artist_correct/t*100,0)
        pct_correct.append(a)
        
lr_lsa_df['lr_correct'] = svm_correct
lr_lsa_df['lr_total']= svm_total
lr_lsa_df['artist'] = lyric_pd
lr_lsa_df['lr_pct_correct'] = pct_correct

lr_lsa_df.sort_values('lr_pct_correct',ascending = False)

Unnamed: 0,artist,lr_correct,lr_total,lr_pct_correct
25,,1152,1152,100.0
22,Rolling Stones,16,35,57.0
6,Loretta Lynn,17,41,56.0
15,America,20,44,50.0
19,Bon Jovi,15,40,40.0
8,Chaka Khan,14,53,34.0
14,Indigo Girls,12,45,31.0
9,Dean Martin,16,48,31.0
3,George Strait,18,50,30.0
18,Chris Rea,11,50,28.0


Logisitic Regression was the top performing model here and had similar accuracies to the Support Vector Machine with The Rolling Stones being the most accurately predicted artist. 

##  Conclusion

Overall, the dataset cooperated with tf-idf better than Bag of Words. What may be deciphered from this practice is that these artists sing about similar subjects and the model had a difficult time classifying who sand what. This was evident in the Latent Semantic Analysis where the components were almost all related to love. Next time around, it might be better to use a sample of different artists. Clustering also proved to be ineffective in classifying artists and only returned one cluster when using mean-shift.

Logistic Regression on the TF-IDF model was the most accurate model at 22%. Random Forest had extremely inaccurate models both for BoW and TF-idf only returning accuracies of around 5%. 

BoW and TF-idf operated very differently in accurately predicitng artists. The top artist in nearly all the Tf-idf model was The Rolling Stones and the BoW models were better at predicting Loretta Lynn and George Strait. Perhaps the penalization strategy of tf-idf has a significantly different effect compared to BoW.