In [1]:
import pandas as pd
import getpass
import plotly.graph_objs as go

import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Load data from database

In [3]:
DATABASE_URL = getpass.getpass('Enter the DATABASE_URL variable:')
engine = create_engine(DATABASE_URL)

Enter the DATABASE_URL variable:········


In [4]:
selectQuery = "select * from lyrics_table"
lyrics_df = pd.read_sql(selectQuery, engine)
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,lyrics-length
0,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it s like you seem...,967
1,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn t hard to ...,644
2,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I Verse 1 If I wrote a...,1757
3,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it s po...,1147
4,all-i-could-do-was-cry,2009,beyonce-knowles,Pop,I heard Church bells ringing I heard A choir s...,487


In [5]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
        
    return clean_tokens

In [6]:
genre_names = lyrics_df.groupby('genre').size().index.tolist()

In [7]:
X = lyrics_df['lyrics']
Y = lyrics_df['genre']

In [8]:
#tokenize(X[0])
#lyrics_df['lyrics'].apply(tokenize).head(5)

In [9]:
#X = lyrics_df['lyrics'].apply(tokenize)
X[:5]

0    playin  everything so easy  it s like you seem...
1    If you search For tenderness It isn t hard to ...
2    Oh oh oh I  oh oh oh I  Verse 1   If I wrote a...
3    Party the people  the people the party it s po...
4    I heard Church bells ringing I heard A choir s...
Name: lyrics, dtype: object

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [11]:
X_train[:5]

851     I ll stay I ll stay for she ll be coming back ...
9306    You re just too good to be true Can t take my ...
7484    I was a man with a world in his hand but I dro...
5248    Don t wanna be a martyr in this war Don t wann...
7926    Here in the depth of You I m not afraid You re...
Name: lyrics, dtype: object

In [12]:
X_test[:5]

3445    So you got dumped on your rump Not surprised y...
7602    He put his arms around her shoulder And with a...
676     Everybody stand up  get down Move when I tell ...
7939    Do you feel sorry for yourself  when you make ...
3689    Ill dance with you when its ready and dont tak...
Name: lyrics, dtype: object

In [13]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=None, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])

In [14]:
parameters = {
    'vect__max_df': (0.5, 1.0),
    'tfidf__use_idf': (True, False),
    'clf__n_estimators': [50, 100]
}

In [15]:
cv = GridSearchCV(pipeline, param_grid=parameters, n_jobs=4, cv=3, verbose=5)

In [16]:
cv.fit(X_train, Y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   52.3s
[Parallel(n_jobs=4)]: Done  22 out of  24 | elapsed:  2.4min remaining:   12.9s
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:  2.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [17]:
#cv_new = CountVectorizer(stop_words='english')
#count_vector=cv_new.fit_transform(X_train)

In [18]:
#cv_new.vocabulary_

{'ll': 26102,
 'stay': 42766,
 'coming': 8739,
 'know': 24519,
 'head': 20134,
 'went': 49500,
 'play': 34018,
 'reward': 37496,
 'returns': 37420,
 'woo': 50057,
 'keeps': 24046,
 'hanging': 19842,
 'friends': 17342,
 'ridicule': 37618,
 'substitute': 43447,
 'talk': 44344,
 'endure': 14393,
 'alright': 1594,
 'mother': 29158,
 'says': 39022,
 'son': 41861,
 'whoo': 49682,
 'come': 8701,
 'home': 20918,
 'woah': 49986,
 'ba': 3346,
 'doo': 12941,
 'right': 37657,
 'girl': 18438,
 'lord': 26418,
 'gonna': 18785,
 'greater': 19060,
 'night': 30460,
 'replace': 37114,
 'decides': 11010,
 'wants': 49170,
 'wait': 49060,
 'ain': 1237,
 'goin': 18740,
 'stand': 42650,
 'ground': 19235,
 'hey': 20508,
 'just': 23665,
 'good': 18789,
 'true': 46480,
 'eyes': 15643,
 'like': 25890,
 'heaven': 20219,
 'touch': 45843,
 'wanna': 49154,
 'hold': 20867,
 'long': 26338,
 'love': 26497,
 'arrived': 2606,
 'thank': 45039,
 'god': 18699,
 'alive': 1472,
 'pardon': 32562,
 'way': 49325,
 'stare': 42682,

In [19]:
#count_vector.shape

(8000, 51155)

In [20]:
#cv_new.stop_words

'english'

In [21]:
Y_pred = cv.predict(X_test)

In [22]:
print(classification_report(Y_test, Y_pred))

               precision    recall  f1-score   support

      Country       0.00      0.00      0.00        40
   Electronic       1.00      0.09      0.17        53
         Folk       1.00      0.10      0.18        30
      Hip-Hop       0.91      0.65      0.76       169
        Indie       0.88      0.16      0.27        44
         Jazz       0.75      0.12      0.21        74
        Metal       0.89      0.45      0.60       183
Not Available       0.81      0.13      0.22       170
        Other       1.00      0.22      0.36        60
          Pop       0.63      0.24      0.35       376
          R&B       0.00      0.00      0.00        10
         Rock       0.49      0.97      0.65       791

     accuracy                           0.56      2000
    macro avg       0.70      0.26      0.31      2000
 weighted avg       0.66      0.56      0.49      2000




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [23]:
import pickle
pickle_file = open('data/classifier.pkl', 'wb')
pickle.dump(cv, pickle_file)

In [24]:
print(confusion_matrix(Y_test, Y_pred))

[[  0   0   0   0   0   0   0   0   0   0   0  40]
 [  0   5   0   0   0   0   0   2   0   1   0  45]
 [  0   0   3   0   0   0   0   0   0   0   0  27]
 [  0   0   0 110   0   0   0   2   0   3   0  54]
 [  0   0   0   0   7   0   0   0   0   3   0  34]
 [  0   0   0   0   0   9   0   1   0   5   0  59]
 [  0   0   0   0   0   0  83   0   0   2   0  98]
 [  0   0   0   5   0   0   3  22   0  22   0 118]
 [  0   0   0   0   1   0   0   0  13   1   0  45]
 [  0   0   0   3   0   3   0   0   0  92   0 278]
 [  0   0   0   0   0   0   0   0   0   2   0   8]
 [  0   0   0   3   0   0   7   0   0  14   0 767]]


In [30]:
#genre_names
cnf_matrix = confusion_matrix(Y_test, Y_pred)
cnf_matrix_df = pd.DataFrame(data = cnf_matrix,
                             index = genre_names,
                             columns = genre_names)
cnf_matrix_df

Unnamed: 0,Country,Electronic,Folk,Hip-Hop,Indie,Jazz,Metal,Not Available,Other,Pop,R&B,Rock
Country,0,0,0,0,0,0,0,0,0,0,0,40
Electronic,0,5,0,0,0,0,0,2,0,1,0,45
Folk,0,0,3,0,0,0,0,0,0,0,0,27
Hip-Hop,0,0,0,110,0,0,0,2,0,3,0,54
Indie,0,0,0,0,7,0,0,0,0,3,0,34
Jazz,0,0,0,0,0,9,0,1,0,5,0,59
Metal,0,0,0,0,0,0,83,0,0,2,0,98
Not Available,0,0,0,5,0,0,3,22,0,22,0,118
Other,0,0,0,0,1,0,0,0,13,1,0,45
Pop,0,0,0,3,0,3,0,0,0,92,0,278


In [25]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
cnf_matrix_pkl = open('data/matrix.pkl', 'wb')
pickle.dump(cnf_matrix, cnf_matrix_pkl)
cnf_matrix_pkl.close()

In [31]:
cnf_df_pkl = open('data/cnf_df.pkl', 'wb')
pickle.dump(cnf_matrix_df, cnf_df_pkl)
cnf_df_pkl.close()

In [27]:
cls_report = classification_report(Y_test, Y_pred)
cls_report_pkl = open('data/cls_report.pkl', 'wb')
pickle.dump(cls_report, cls_report_pkl)
cls_report_pkl.close()