In [100]:
# Imports

import nltk
nltk.download('stopwords')


import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text
from keras import utils



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:
# import files
from google.colab import files
files.upload()

{}

In [0]:
# constants
TRAIN_DATA_FILE_PATH = "Lyrics-Genre-Train.csv"
TEST_DATA_FILE_PATH = "Lyrics-Genre-Test-GroundTruth.csv"
LYRICS_COLUMN = "Lyrics"
GENRE_COLUMN = "Genre"

# useful variables
output_mappings = {}

In [0]:
# citeste datele din fisier
def read_data(file_path):
    return pd.read_csv(file_path)[[LYRICS_COLUMN, GENRE_COLUMN]]


# clean la input si mapare la int pentru output
def create_new_dataset(dataset):
    global output_mappings
    dataset[GENRE_COLUMN] = dataset[GENRE_COLUMN].map(output_mappings)
    dataset[LYRICS_COLUMN] = dataset[LYRICS_COLUMN].apply(clean_text)
    return dataset


# aplicarea de create_new_dataset
def label_encoding(train_data, test_data):
    possible_labels = train_data[GENRE_COLUMN].unique()
    global output_mappings
    output_mappings = {genre: index for index, genre in enumerate(possible_labels)}
    new_train_data = create_new_dataset(train_data)
    new_test_data = create_new_dataset(test_data)
    return new_train_data, new_test_data
  

# cleaning text - o recomandare de la google
def clean_text(text):
  REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]!.')
  BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
  STOPWORDS = set(stopwords.words('english'))
  
  text = BeautifulSoup(text, "lxml").text
  text = text.lower() 
  text = REPLACE_BY_SPACE_RE.sub(' ', text)
  text = BAD_SYMBOLS_RE.sub('', text) 
  text = ' '.join(word for word in text.split() if word not in STOPWORDS)
  return text
    
  
# preia doar un procentaj din date - folosit pentru cazurile in care antrenarea dureaza foarte mult
def split_data(data, percentage):
  if (percentage == 1.0):
    return data
  data_unique = data[GENRE_COLUMN].unique()
  new_data = pd.DataFrame(columns=[LYRICS_COLUMN, GENRE_COLUMN])
  for i in data_unique:
    aux = data.loc[data[GENRE_COLUMN] == i]
    aux = aux.head(int(len(aux) * percentage))
    new_data = new_data.append(aux)
  
  return new_data
    

# split intre input si output (versuri si gen)
def split_input_label(data):
  x = data[LYRICS_COLUMN]
  y = data[GENRE_COLUMN]
  y.astype('int')
  return x, y


# TFIDF vectorizer
def tf_idf(train_data, test_data):
    vectorizer = TfidfVectorizer(strip_accents='ascii', stop_words='english', token_pattern=r'(?u)\b[A-Za-z]+\b')

    all_data = train_data.append(test_data)
    all_data = vectorizer.fit_transform(all_data[LYRICS_COLUMN])
    x_train = all_data[0:train_data.shape[0]]
    x_test = all_data[train_data.shape[0]:]
    
    y_train = train_data[GENRE_COLUMN]
    y_train.astype('int')
    y_test = test_data[GENRE_COLUMN]
    y_test.astype('int')
    return x_train, y_train, x_test, y_test
  
# SVC model
def SVC_model(x_train, y_train, x_test, y_test):
  svc_model = SVC(C=1.0, gamma=1.0, kernel='linear')
  scores = cross_val_score(svc_model, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  svc_model = SVC(C=1.0, gamma=1.0, kernel='linear')
  svc_model.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, svc_model.predict(x_test))
  precision = precision_score(y_test, svc_model.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, svc_model.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  

# MultinomialNB model
def MultinomialNB_model(x_train, y_train, x_test, y_test):
  multinominalnb_model = MultinomialNB()
  scores = cross_val_score(multinominalnb_model, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  multinominalnb_model = MultinomialNB()
  multinominalnb_model.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, multinominalnb_model.predict(x_test))
  precision = precision_score(y_test, multinominalnb_model.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, multinominalnb_model.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  

# SGD Classifier model
def SGDClassifier_model(x_train, y_train, x_test, y_test):
  sgdclassifier_model = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
  scores = cross_val_score(sgdclassifier_model, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  sgdclassifier_model =  SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
  sgdclassifier_model.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, sgdclassifier_model.predict(x_test))
  precision = precision_score(y_test, sgdclassifier_model.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, sgdclassifier_model.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  
  
# Logistic Regression model
def LogisticRegression_model(x_train, y_train, x_test, y_test):
  logisticregression_model = LogisticRegression(n_jobs=1, C=1e5)
  scores = cross_val_score(logisticregression_model, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  logisticregression_model = LogisticRegression(n_jobs=1, C=1e5)
  logisticregression_model.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, logisticregression_model.predict(x_test))
  precision = precision_score(y_test, logisticregression_model.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, logisticregression_model.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  
  
# MultinomialNB model using count vectorizer and TFIDF transformer
def count_vectorizer_tfidf_multinominalnb(x_train, y_train, x_test, y_test):
  pipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

  scores = cross_val_score(pipe, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  
  pipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
  
  pipe.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, pipe.predict(x_test))
  precision = precision_score(y_test, pipe.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, pipe.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  
  
# SGD Classifier model using count vectorizer and TFIDF transformer
def count_vectorizer_tfidf_sgd_classifier(x_train, y_train, x_test, y_test):
  pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

  scores = cross_val_score(pipe, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  
  pipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
  
  pipe.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, pipe.predict(x_test))
  precision = precision_score(y_test, pipe.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, pipe.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  
  
# Logistic Regression model using count vectorizer and TFIDF transformer
def count_vectorizer_tfidf_logistic_regression(x_train, y_train, x_test, y_test):
  pipe = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])

  scores = cross_val_score(pipe, x_train, y_train, cv=5)
  print(scores)
  print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
  print("--------------------------------------------------------------------------------------------------------")
  print()
  print("Training on all data and calculating accuracy on test data")
  
  pipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
  
  pipe.fit(x_train, y_train)
  accuracy = accuracy_score(y_test, pipe.predict(x_test))
  precision = precision_score(y_test, pipe.predict(x_test), average="weighted")
  f1_score_ = f1_score(y_test, pipe.predict(x_test), average="weighted")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("f1 score:", f1_score_)
  

  
# Keras model using Bag of Words from keras
def keras_model_bow(all_data, train_data, test_data, loss_function):
  max_words = 1024
  tokenize = text.Tokenizer(num_words=max_words, char_level=False)
  tokenize.fit_on_texts(all_data[LYRICS_COLUMN])

  x_train = tokenize.texts_to_matrix(train_data[LYRICS_COLUMN])
  x_test = tokenize.texts_to_matrix(test_data[LYRICS_COLUMN])

  encoder = LabelEncoder()
  encoder.fit(all_data[GENRE_COLUMN])
  y_train = encoder.transform(train_data[GENRE_COLUMN])
  y_test = encoder.transform(test_data[GENRE_COLUMN])

  num_classes = np.max(y_train) + 1
  y_train = utils.to_categorical(y_train, num_classes)
  y_test = utils.to_categorical(y_test, num_classes)

  batch_size = 32
  epochs = 5

  model = Sequential()
  model.add(Dense(512, input_shape=(max_words,)))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes))
  model.add(Activation('softmax'))

  model.compile(loss=loss_function,
                optimizer='adam',
                metrics=['accuracy'])

  history = model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=epochs,
                      verbose=1,
                      validation_split=0.1)
    
  score = model.evaluate(x_test, y_test,
                     batch_size=batch_size, verbose=1)
  print('Test accuracy:', score[1])

In [0]:
train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)

# Aleg doar un procentaj mic din date in cazul modelelor care necesita mult prea mult timp de antrenare (SVC)
# Astfel, rezultatele nu vor fi "reale" intrucat nu se folosesc toate datele.
train_data = split_data(train_data, 0.3)
test_data = split_data(test_data, 0.3)

In [0]:
# Encodez label-urile cu int-uri de la 0 la 9 in toat cazurile
train_data, test_data = label_encoding(train_data, test_data)

Voi extrage feature-urile in diferite moduri la care voi aplica diferiti algoritmi de invatare. 

Pentru fiecare model ales voi face cross validation.

Dupa rezultatele din cross validation voi calcula acuratetea pe datele de test.

Voi incepe prin extragerea feature-urilor folosind TFIDF si incercarea de diferiti algoritmi de invatare

Mentionez ca in fiecare code zone exista un comment care spune ce algoritm de invatare am folosit si ce feature extraction am folosit.

Rezultatele sunt afisate dupa fiecare rulare de algoritm.

In [0]:
x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf = tf_idf(train_data, test_data)

In [107]:
# SVC + tfidf
SVC_model(x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf)

[0.35483871 0.33154122 0.34146341 0.34688347 0.31526649]
Accuracy: 0.34 (+/- 0.03)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data
Accuracy: 0.35714285714285715
Precision: 0.4424175596973597
f1 score: 0.3185853823831478


In [108]:
# MultinomialNB_model + tfidf
train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)
x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf = tf_idf(train_data, test_data)

MultinomialNB_model(x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf)

[0.28220362 0.28058331 0.28274372 0.28227985 0.28119935]
Accuracy: 0.28 (+/- 0.00)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data
Accuracy: 0.28733459357277885
Precision: 0.4694960572492441
f1 score: 0.18698440442165717


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [109]:
# SGDClassifier model + tfidf
train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)
x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf = tf_idf(train_data, test_data)

SGDClassifier_model(x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf)



[0.37186065 0.36402917 0.37456117 0.37385197 0.37736359]
Accuracy: 0.37 (+/- 0.01)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data
Accuracy: 0.3784499054820416
Precision: 0.34919864283879115
f1 score: 0.3448792972018557


In [110]:
# LogisticRegression model + tfidf
train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)
x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf = tf_idf(train_data, test_data)

LogisticRegression_model(x_train_tfidf, y_train_tfidf, x_test_tfidf, y_test_tfidf)



[0.36834999 0.36132865 0.38968404 0.37520259 0.3719611 ]
Accuracy: 0.37 (+/- 0.02)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data
Accuracy: 0.3696282293635791
Precision: 0.3631043741302422
f1 score: 0.361945715979274


Rezultatele de mai sus nu sunt foarte promitatoare intrucat au o acuratete foarte mica

Urmatorul aproach va fi Count Vectorizer + TFIDF Transformer

In [111]:
# MultinomialNB_model + count vectorizer + tfidf

train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)

x_train, y_train = split_input_label(train_data)
x_test, y_test = split_input_label(test_data)

count_vectorizer_tfidf_multinominalnb(x_train, y_train, x_test, y_test)

[0.28193357 0.28436403 0.28679449 0.28417072 0.28444084]
Accuracy: 0.28 (+/- 0.00)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data


  'precision', 'predicted', average, warn_for)


Accuracy: 0.2882167611846251
Precision: 0.37774713374138685
f1 score: 0.18772251264119305


  'precision', 'predicted', average, warn_for)


In [112]:
# sgd classifier + count vectorizer + tfidf

train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)

x_train, y_train = split_input_label(train_data)
x_test, y_test = split_input_label(test_data)

count_vectorizer_tfidf_sgd_classifier(x_train, y_train, x_test, y_test)



[0.36591952 0.35862814 0.36943019 0.36088601 0.36763911]
Accuracy: 0.36 (+/- 0.01)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data


  'precision', 'predicted', average, warn_for)


Accuracy: 0.2882167611846251
Precision: 0.37774713374138685
f1 score: 0.18772251264119305


  'precision', 'predicted', average, warn_for)


In [113]:
# logistic regression + count vectorizer + tfidf

train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)

x_train, y_train = split_input_label(train_data)
x_test, y_test = split_input_label(test_data)

count_vectorizer_tfidf_logistic_regression(x_train, y_train, x_test, y_test)



[0.36402917 0.36753983 0.38536322 0.37250135 0.36925986]
Accuracy: 0.37 (+/- 0.01)
--------------------------------------------------------------------------------------------------------

Training on all data and calculating accuracy on test data


  'precision', 'predicted', average, warn_for)


Accuracy: 0.2882167611846251
Precision: 0.37774713374138685
f1 score: 0.18772251264119305


  'precision', 'predicted', average, warn_for)


Ultimul aproach va fi folosirea unui neural network din keras si varianta de bag of words tot din keras folosind diferite functii de loss.

In [0]:
# Prepare data for keras model

train_data = read_data(TRAIN_DATA_FILE_PATH)
test_data = read_data(TEST_DATA_FILE_PATH)
train_data_all = split_data(train_data, 1.0)
test_data_all = split_data(test_data, 1.0)
train_data, test_data = label_encoding(train_data, test_data)
all_data = train_data.append(test_data)


In [115]:
# Keras + Bow + categorical_crossentropy
keras_model_bow(all_data, train_data, test_data, "categorical_crossentropy")

Train on 16661 samples, validate on 1852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.37328292369542065


In [116]:
# Keras + Bow + kullback_leibler_divergence
keras_model_bow(all_data, train_data, test_data, "kullback_leibler_divergence")

Train on 16661 samples, validate on 1852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.36584751098202545


In [117]:
# Keras + Bow + poisson
keras_model_bow(all_data, train_data, test_data, "poisson")

Train on 16661 samples, validate on 1852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.3692501574773494


In [118]:
# Keras + Bow + cosine_proximity
keras_model_bow(all_data, train_data, test_data, "cosine_proximity")

Train on 16661 samples, validate on 1852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.36899810958449464


In [119]:
# Keras + Bow + binary_crossentropy
keras_model_bow(all_data, train_data, test_data, "binary_crossentropy")

Train on 16661 samples, validate on 1852 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test accuracy: 0.9051291559654433



Avand in vedere rezultatele combinatiilor de mai sus putem concluziona ca cele mai bune rezultate au fost obtinute de modelul Keras folosind ca loss function Binary Crossentropy