# Lyric Classifier
---

Music genre classifier (between bossa nova, funk, gospel and sertanejo styles) using lyrics.

References:
* http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* http://zacstewart.com/2015/04/28/document-classification-with-scikit-learn.html
* http://sebastianraschka.com/Articles/2014_naive_bayes_1.html
* http://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/
* _
* http://radimrehurek.com/data_science_python/
* https://spandan-madan.github.io/DeepLearningProject/

In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import metrics

## Step 1 - Data Wrangling

### Load and join the data lyrics

In [5]:
bossa_nova = pd.read_csv('input/lyrics/bossa_nova.csv')
bossa_nova['genre'] = 'bossa_nova'

funk = pd.read_csv('input/lyrics/funk.csv')
funk['genre'] = 'funk'

gospel = pd.read_csv('input/lyrics/gospel.csv')
gospel['genre'] = 'gospel'

sertanejo = pd.read_csv('input/lyrics/sertanejo.csv')
sertanejo['genre'] = 'sertanejo'

df = pd.concat([bossa_nova, funk, gospel, sertanejo], ignore_index=True)
df = df.reindex(np.random.permutation(df.index)) # Shuffle the data

In [21]:
# Data Analysis
#print( df.describe() )

print( df.head( n=10 ) )

#df['genre'].value_counts().plot(kind='bar')

                                                  lyric       genre
2292   \nHá quem amou demais\nHá quem chorou demais\...      gospel
2214   \nQuão grande graça após uma consagração\nFei...      gospel
3082   \nEu preciso daquele sorriso\nQue me envolve ...   sertanejo
1874   \nRenova-me, Senhor Jesus\nJá não quero ser i...      gospel
304    \nOn my way, (on my way)\nI laugh with you, (...  bossa_nova
804    \nÉ a flauta envolvente que mexe com a mente\...        funk
1637   \nEm poucas palavras\nVou dizer quem você é\n...      gospel
744    \nNem que algum dia eu venha a chorar\nE vive...  bossa_nova
1401   \nAcabou o "caô"\nO guerrero chegou\nO guerre...        funk
1534   \nMas é que eu sou safado\nBandido, tarado\nE...        funk


### Additional resources

Load stopwords (pt-BR), preparing tokenizer, vectorizers and so on.

In [8]:
with open('stopwords.txt', 'r') as infile:
    stop_words = infile.read().splitlines()
print('stop words %s ...' %stop_words[:5])

stop words ['de ', 'a ', 'o ', 'que ', 'e '] ...


In [22]:
# Porter Stemmer
import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    A Porter-Stemmer-Tokenizer hybrid to splits sentences into words (tokens) 
    and applies the porter stemming algorithm to each of the obtained token. 
    Tokens that are only consisting of punctuation characters are removed as well.
    Only tokens that consist of more than one letter are being kept.
    
    Parameters
    ----------
        
    text : `str`. 
      A sentence that is to split into words.
        
    Returns
    ----------
    
    no_punct : `str`. 
      A list of tokens after stemming and removing Sentence punctuation patterns.
    
    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    stems = [porter_stemmer.stem(t) for t in tokens]
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct

# Test tokenizer
porter_tokenizer("Don't !!! --- want swimming. ")

['don', 't', 'want', 'swim']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(1,1)
    )

vec2 = CountVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer,
            ngram_range=(2,2)
    )

#X_train = df['lyric'].values
#vec = vec.fit(X_train.ravel())
#print('Vocabulary size: %s' %len(vec.get_feature_names()))

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
            encoding='utf-8',
            decode_error='replace',
            strip_accents='unicode',
            analyzer='word',
            binary=False,
            stop_words=stop_words,
            tokenizer=porter_tokenizer
    )

### Split training and test data

In [17]:
# Split data (validation accuracy)
from sklearn.model_selection import train_test_split
X = df['lyric'].values
y = df['genre'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2880,) (2880,)
(320,) (320,)


In [18]:
# Data Prep

# Count Vec
#from sklearn.feature_extraction.text import CountVectorizer
#count_vectorizer = CountVectorizer(ngram_range=(1, 4))
#X_train_counts = count_vectorizer.fit_transform(df['lyric'].values)
#counts.shape

X_train_counts = vec.fit_transform(X_train.ravel())
print(X_train_counts.shape)

#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#X_train_tfidf.shape

(2880, 18404)


In [20]:
# Classifier #1: NaiveBayes (Multinomial)
from sklearn.naive_bayes import MultinomialNB

clf1 = MultinomialNB().fit(X_train_counts, y_train)

# Avaliacao do modelo
y_pred = clf1.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

ValueError: could not convert string to float: ' \nE ela vem toda indisciplinada\nEla gosta de dar uma quicada\nEla quica, ela para, ela fica\nEsfola a cabeça da pica\nEla quica, ela para, ela fica\nEsfola a cabeça da pica\nCaralho, caralho!\nSentou na piroca cheia de violência\nE quase me machucou!\nE ela só tem 16 e te acaba com uma sentada\nUi! Caraca, moleque! Que pepeca malcriada!\nEla só tem 16 e te acaba numa sentada\nCaraca, moleque! Que pepeca malcriada!\nSenta, garota. Vem sentar com força!\nE ela vem toda indisciplinada\nEla gosta de dar uma quicada\nEla quica, ela para, ela fica\nEsfola a cabeça da pica\nE ela só tem 16 e te acaba com uma sentada...\nUi! Caraca, moleque! Que pepeca malcriada!\nEla só tem 16 e te acaba numa sentada\nCaraca, moleque! Que pepeca malcriada!\nCaralho, caralho! Sentou na piroca cheia de violência\nE quase me machucou! '

In [149]:
# Classifier #1: NaiveBayes
from sklearn.naive_bayes import GaussianNB

#clf = GaussianNB().fit(X_train, y_train)

# Avaliacao do modelo
#y_pred = clf.predict(X_test)
#metrics.accuracy_score(y_test, y_pred)

In [150]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='hinge', penalty='l2', 
                     alpha=1e-4, n_iter=500, random_state=42)
clf2 = clf.fit(X_train, y_train)

# Avaliacao do modelo
y_pred = clf2.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred))

#print(metrics.classification_report(y_test, y_pred, target_names=df['genre'].values))

#print( metrics.confusion_matrix(y_test, y_pred) )

0.8359375


In [121]:
from sklearn.linear_model import LogisticRegression

#for i in [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]:
clf = LogisticRegression(C = 10).fit(X_train, y_train)

# Avaliacao do modelo
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.8625


In [None]:
from sklearn.naive_bayes import MultinomialNB
#from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

pipeline_3 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

parameters_3 = dict(
    vect__binary=[False],
    vect__stop_words=[stop_words, None],
    vect__tokenizer=[porter_tokenizer, None],
    vect__ngram_range=[(1,1), (2,2), (3,3)],
)

grid_search_3 = GridSearchCV(pipeline_3, parameters_3, n_jobs=1)


print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline_3.steps])
print("parameters:")
grid_search_3.fit(X_train, y_train)
print("Best score: %0.3f" % grid_search_3.best_score_)
#print("Best parameters set:")
#best_parameters_3 = grid_search_3.best_estimator_.get_params()
#for param_name in sorted(parameters_3.keys()):
#    print("\t%s: %r" % (param_name, best_parameters_3[param_name]))