In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from gensim.models import Doc2Vec



Lyrics classifier. Trying to get the mood out of the song. 

Import the MasterSongList.json file first...

In [2]:
import pandas as pd
import numpy as np

In [3]:
songs_df = pd.read_json('MasterSongList.json')

In [4]:
# Keep only the two columns used below: the lyric tokens and the mood tags.
songs_lf_m = ['lyrics_features', 'moods']
# Copy first so that nothing done to lyrics_df can touch songs_df.
lyrics_df = songs_df.copy().loc[:, songs_lf_m]
lyrics_df.head()

Unnamed: 0,lyrics_features,moods
0,"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]"
1,"[lately, i, ve, been, i, ve, been, losing, sle...",[happy]
2,"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]"
3,"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]"
4,"[j, lo, the, other, side, out, my, mine, it, s...",[energetic]


In [5]:
# Collapse each list cell into a single string: lyric tokens are joined with
# spaces, mood tags with ", ".
for column, separator in (('lyrics_features', ' '), ('moods', ', ')):
    lyrics_df[column] = lyrics_df[column].map(separator.join)

In [6]:
lyrics_df.head()

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"energetic, motivational"
1,lately i ve been i ve been losing sleep dreami...,happy
2,party rock yeah woo let s go party rock is in ...,"happy, celebratory, rowdy"
3,alagamun lan weh wakun heya hanun gon alagamun...,"happy, energetic, celebratory"
4,j lo the other side out my mine it s a new gen...,energetic


In [7]:
# Mark songs whose lyrics string is empty as NaN so they can be dropped with dropna().
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained in-place mutation, which pandas warns about
# and which leaves lyrics_df unchanged when Copy-on-Write is enabled.
lyrics_df['lyrics_features'] = lyrics_df['lyrics_features'].replace('', np.nan)

In [8]:
lyrics_df.shape

(36733, 2)

In [9]:
moodsframe =songs_df['moods']

In [10]:
lyrics_df.dropna(subset=['lyrics_features'], inplace=True)

In [11]:
lyrics_df.shape

(20931, 2)

In [12]:
# Remove rows whose lyrics render as the text '[]' and any rows with missing lyrics.
# Only the lyrics column is cast to str (the whole frame does not need converting),
# and dropna() is chained on the filtered result instead of being applied in place
# to a slice, which avoids SettingWithCopyWarning.
# NOTE(review): the shape printed below is unchanged (20931 rows), so this filter
# appears to remove nothing once the token lists have been joined into strings.
is_empty_list_text = lyrics_df['lyrics_features'].astype(str) == '[]'
lyrics_df = lyrics_df[~is_empty_list_text].dropna(subset=['lyrics_features'])
lyrics_df.shape

(20931, 2)

In [13]:
lyrics_df

Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"energetic, motivational"
1,lately i ve been i ve been losing sleep dreami...,happy
2,party rock yeah woo let s go party rock is in ...,"happy, celebratory, rowdy"
3,alagamun lan weh wakun heya hanun gon alagamun...,"happy, energetic, celebratory"
4,j lo the other side out my mine it s a new gen...,energetic
5,today i don t feel like doing anything i just ...,"happy, sprightly"
6,there s a fire starting in my heart reaching a...,warm
7,i threw a wish in the well don t ask me i ll n...,"energetic, motivational"
8,now and then i think of when we were together ...,"seductive, nocturnal"
9,don t know what for you re turning heads when ...,"happy, celebratory"


In [14]:
#moods

In [15]:
mlb = MultiLabelBinarizer()

In [16]:
# Binarise the mood tags of only those songs that are still present in lyrics_df,
# so that y_labels has exactly one row per feature row. moodsframe was taken from
# the unfiltered songs_df and would otherwise yield labels for songs whose lyrics
# were dropped.
# NOTE(review): this relies on lyrics_df still carrying songs_df's original index
# labels here; run it before the cell that resets the index.
y_labels = mlb.fit_transform(moodsframe.loc[lyrics_df.index])

In [17]:
print(y_labels)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [18]:
print(mlb.classes_)

print(y_labels[0])

print(moodsframe.iloc[0])

['aggressive' 'angsty' 'atmospheric' 'campy' 'celebratory' 'classy'
 'cocky' 'cold' 'earthy' 'energetic' 'funky' 'gloomy' 'happy' 'hypnotic'
 'introspective' 'lush' 'mellow' 'motivational' 'nocturnal' 'raw' 'rowdy'
 'sad' 'seductive' 'sexual' 'soothing' 'spacey' 'sprightly' 'sweet'
 'trashy' 'trippy' 'visceral' 'warm']
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['energetic', 'motivational']


In [19]:
type(moodsframe)

pandas.core.series.Series

#end moods?wtf

In [20]:
#Reindex dataframe
lyrics_df.reset_index(drop=True, inplace=True)

Lyrics classifier. Trying to get the mood out of the song. 
First I must clean the data...

In [21]:
from string import punctuation
# ENGLISH_STOP_WORDS is publicly exported by sklearn.feature_extraction.text.
# The former sklearn.feature_extraction.stop_words module is no longer importable
# in current scikit-learn releases, while this path also works on older ones.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

1. Converting everything to lower case
2. Removing punctuation
3. Removing common words (stop words)
4. Stemming

In [22]:
def clean_text(raw_text, stemmer=None, stop_words=None):
    """Normalise a lyrics string for bag-of-words style models.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, split on whitespace, drop stop words, and stem the
    remaining words.

    Parameters
    ----------
    raw_text : str
        The text to clean.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept word. Defaults to a new
        ``SnowballStemmer('english')``. Pass one shared instance when cleaning
        many documents to avoid rebuilding the stemmer on every call.
    stop_words : container of str, optional
        Words to discard (checked before stemming, against the lower-cased,
        punctuation-free word). Defaults to scikit-learn's ``ENGLISH_STOP_WORDS``.

    Returns
    -------
    str
        The kept, stemmed words joined by single spaces ('' if nothing is left).
    """
    # Resolve defaults lazily so callers can plug in another language or word list.
    if stemmer is None:
        stemmer = SnowballStemmer('english')
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # 1. Convert to lower case
    lowered = raw_text.lower()

    # 2. Remove punctuation (characters are deleted, not replaced by spaces,
    #    so "don't" becomes "dont")
    without_punctuation = lowered.translate(str.maketrans('', '', punctuation))

    # 3 & 4. Remove common words and stem the rest
    return ' '.join(
        stemmer.stem(word)
        for word in without_punctuation.split()
        if word not in stop_words
    )

In [57]:
# Sanity check: run the English Snowball stemmer on a non-English, accented word
# to see what it returns for lyrics that are not in English.
stemmer = SnowballStemmer('english')
stemmed_word = stemmer.stem('você')
print(stemmed_word)

você


In [23]:
# Join each song's token list into one space-separated string.
# Values that are already strings are left alone, so re-running this cell does
# not split previously joined lyrics into space-separated characters.
songs_df['lyrics_features'] = [
    tokens if isinstance(tokens, str) else " ".join(tokens)
    for tokens in songs_df['lyrics_features'].values
]

In [24]:
# Collect every distinct mood tag that appears on at least one song.
moods = songs_df['moods'].tolist()
moods_set = {tag for song_tags in moods for tag in song_tags}
moods_set

{'aggressive',
 'angsty',
 'atmospheric',
 'campy',
 'celebratory',
 'classy',
 'cocky',
 'cold',
 'earthy',
 'energetic',
 'funky',
 'gloomy',
 'happy',
 'hypnotic',
 'introspective',
 'lush',
 'mellow',
 'motivational',
 'nocturnal',
 'raw',
 'rowdy',
 'sad',
 'seductive',
 'sexual',
 'soothing',
 'spacey',
 'sprightly',
 'sweet',
 'trashy',
 'trippy',
 'visceral',
 'warm'}

# Random forest classifier on bag-of-words features

In [25]:
count_vect = CountVectorizer()

In [26]:
lyrics_df['lyrics_features']

0        oppa gangnam style gangnam style najeneun ttas...
1        lately i ve been i ve been losing sleep dreami...
2        party rock yeah woo let s go party rock is in ...
3        alagamun lan weh wakun heya hanun gon alagamun...
4        j lo the other side out my mine it s a new gen...
5        today i don t feel like doing anything i just ...
6        there s a fire starting in my heart reaching a...
7        i threw a wish in the well don t ask me i ll n...
8        now and then i think of when we were together ...
9        don t know what for you re turning heads when ...
10       nossa nossa assim você me mata ai se eu te peg...
11       shine bright like a diamond shine bright like ...
12       do you ever feel like a plastic bag drifting t...
13       oh oh woah oh oh oh oh oh oh oh oh caught in a...
14       girl my body don t lie red one i m out of my m...
15       ohh ohh ohh ohh ohh oh her eyes her eyes make ...
16       it s our party we can do what we want no drama.

Now that I have my cleaned songs I want to export to txt for my doc2vec stuff.

In [27]:
from gensim.models import Doc2Vec

put lyrics features into a list

In [28]:
myList = []

In [65]:
%run Doc2VecHelperFunctions.ipynb

In [38]:
myList = lyrics_df['lyrics_features'].tolist()

In [61]:
#myList

In [68]:
convert_lyrics_to_d2v(myList)

UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-2: character maps to <undefined>

# Count Vectorizer

In [40]:
bag_of_words = count_vect.fit_transform(lyrics_df['lyrics_features'])

In [41]:
moods = lyrics_df['moods']

In [42]:
X = bag_of_words
y = moods

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# NOTE(review): y is the comma-joined mood string of each song, so every distinct
# combination of moods is treated as its own class. That is probably why the
# accuracies recorded below are so low; a multi-label target may suit the data better.

# (criterion, max_features) settings to compare, in the order they were tried.
# 'sqrt' replaces the former max_features='auto', which meant sqrt(n_features) for
# classifiers and is rejected by current scikit-learn releases.
# The recorded accuracies come from an earlier, unseeded run and will vary.
rf_configs = [
    ('gini', 'log2'),      # recorded accuracy: 0.03390639923591213
    ('entropy', 'sqrt'),   # recorded accuracy: 0.050620821394460364
    ('gini', 'sqrt'),      # recorded accuracy: 0.059694364851957976
    ('gini', None),        # None: consider all features at every split
    ('gini', 'log2'),
    ('entropy', 'log2'),   # recorded accuracy: 0.04345749761222541
]

# For reference, the estimator defaults pasted while tuning:
# RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
#     min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
#     max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
#     min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1,
#     random_state=None, verbose=0, warm_start=False, class_weight=None)

for criterion, max_features in rf_configs:
    rfc = RandomForestClassifier(
        n_estimators=10,
        criterion=criterion,
        min_samples_split=2,
        max_features=max_features,
    )
    rfc.fit(X_train, y_train)
    rfc_predictions = rfc.predict(X_test)
    print(criterion, max_features, accuracy_score(y_test, rfc_predictions))

# TfidfVectorizer

In [43]:
tfidf = TfidfVectorizer()

In [45]:
bagofwordstfidf = tfidf.fit_transform(lyrics_df['lyrics_features'])

In [48]:
X_2 = bagofwordstfidf
y = moods
X_train, X_test, y_train, y_test = train_test_split(X_2, y, test_size=0.1, random_state=42)

# Baseline: logistic regression on the TF-IDF features, scored on the 10% hold-out.
lr = LogisticRegression()
lr.fit(X_train, y_train)
predictions2 = lr.predict(X_test)
# Recorded result of an earlier run: 0.10649474689589303
print(accuracy_score(y_test, predictions2))

0.10649474689589303


In [29]:
# Export the lyrics column once, one song per line, without index or header row.
# The previous loop iterated over every column name of songs_df and wrote this
# same lyrics column to a separate "<column>.txt" file each time.
# UTF-8 is requested explicitly because the lyrics contain non-ASCII characters.
songs_df['lyrics_features'].to_csv(
    'lyrics_features.txt', index=False, header=False, encoding='utf-8'
)

This just saved all the lyrics into a .txt file with no index and no organisation. I must compare this with the exercise.

# GridSearchCV

In [49]:
param_grid={}

In [50]:
# Candidate C and gamma values for the grid search; only the RBF kernel is tried.
param_grid.update({
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
})
param_grid

{'C': [0.1, 1, 10, 100, 1000],
 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
 'kernel': ['rbf']}

In [52]:
from sklearn.svm import SVC
model = SVC()

In [58]:
grid = GridSearchCV(model, param_grid, verbose=3)

In [53]:
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [54]:
jeanpierre = model.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix(y_test, jeanpierre)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [55]:
print(confusion_matrix(y_test, jeanpierre))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [56]:
print(classification_report(y_test, jeanpierre))

                                                                    precision    recall  f1-score   support

                                                        aggressive       0.00      0.00      0.00        59
                                                   aggressive, raw       0.00      0.00      0.00         2
                                                 aggressive, rowdy       0.00      0.00      0.00         5
                                              aggressive, visceral       0.00      0.00      0.00         5
                                                            angsty       0.00      0.00      0.00        31
                                                angsty, aggressive       0.00      0.00      0.00        31
                                         angsty, aggressive, rowdy       0.00      0.00      0.00         6
                                    angsty, aggressive, rowdy, raw       0.00      0.00      0.00         5
                           

  'precision', 'predicted', average, warn_for)


In [None]:
# Left commented out, presumably because of the run time: the log below reports
# 75 fits, each taking several minutes.
# NOTE(review): the later cells that read grid.best_params_ / grid.best_estimator_
# need this fit (and the cell that creates `grid`) to have run in the current kernel.
#grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits




[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.0531015037593985, total=10.2min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 13.2min remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=rbf, score=0.05414874980092371, total=10.2min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 26.3min remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=rbf, score=0.056365403304178816, total=10.3min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.045269423558897244, total= 7.4min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.04586717630195891, total= 7.4min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.04664723032069971, total= 7.6min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.045269423558897244, total=72.8min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.04586717630195891, total= 5.7min
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV]  C=0.1, gamma=0.01, kernel=rbf, score=0.04664723032069971, total= 7.1min
[CV] C=0.1, gamma=0.001, kernel

In [1]:
grid.best_params_

NameError: name 'grid' is not defined

In [None]:
grid.best_estimator_

In [None]:
new_prediction = grid.best_estimator_.predict(X_test)

In [None]:
print(confusion_matrix(y_test,new_prediction))

In [None]:
print(classification_report(y_test, new_prediction))