In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genius-song-lyrics-with-language-information/song_lyrics.csv


In [None]:
# Read the song-lyrics CSV into memory; the trailing head() call previews
# the first five rows as the cell output.
LYRICS_CSV = "/kaggle/input/genius-song-lyrics-with-language-information/song_lyrics.csv"
lyric_database = pd.read_csv(LYRICS_CSV)
lyric_database.head(5)

In [None]:
# Work on a reproducible 1% sample: the full database has ~3 million entries.
db = lyric_database.sample(frac=0.01, random_state=1)

# Keep English songs that have a usable genre tag and real lyrics.
# Missing values are NaN in pandas and `series != None` evaluates to True for
# every row, so missing tags/lyrics must be dropped with notna() instead.
keep = (
    (db["language"] == "en")
    & db["tag"].notna()
    & ~db["tag"].isin(["misc", ""])
    & db["lyrics"].notna()
    & ~db["lyrics"].isin(["", "[Instrumental]"])
)

columns_of_interest = ['id', 'title','artist', 'lyrics', 'tag']
db = db.loc[keep, columns_of_interest].sort_values(by="id")
db.head(10)

In [None]:
# Show the distinct genre tags, then the share of songs carrying each one.
tag_column = db["tag"]
print(tag_column.unique())
print(tag_column.value_counts(normalize=True))

In [None]:
import re
import nltk

def fix_lyrics(lyrics):
    """Normalise one lyrics string for tokenisation.

    Steps, in order:
      1. Put spaces around every newline, square bracket and parenthesis so
         each becomes its own whitespace-separated token.
      2. Remove the punctuation characters ? . , ! : ;
      3. Lower-case the text.
      4. Expand a dropped-g ending ("walkin'") to "ing" ("walking").

    Args:
        lyrics: raw lyrics text (str).

    Returns:
        The normalised, lower-cased lyrics string.
    """
    parts = re.split(r"([\n\[\]\(\)])", lyrics)
    output = " ".join(filter(None, parts))
    output = re.sub(r"[?.,!:;]", "", output).lower()
    # Step 1 pads newlines/brackets with a space and step 2 has already removed
    # '.' and ',', so the only thing that can follow a word-final "in'" is
    # whitespace or the end of the text. Requiring a word character after the
    # space would skip every "in'" that ends a lyric line.
    return re.sub(r"in'(?=\s|$)", "ing", output)


# Normalise every lyric with fix_lyrics, then keep the frame ordered by song id.
db = db.assign(lyrics=db["lyrics"].map(fix_lyrics)).sort_values(by="id")

In [None]:
from nltk.tokenize import word_tokenize
def _tokenize(text):
    """Lower-case `text` and split it into word tokens with NLTK."""
    return word_tokenize(text.lower())


# One list of tokens per song, stored next to the lyrics it came from.
db['tokens_raw'] = db['lyrics'].map(_tokenize)
db.head(10)

In [None]:
from nltk.corpus import stopwords
# Hold the English stop words in a frozenset: membership is tested once per
# token, and a set lookup is constant-time where a list scan is not.
# A distinct name also keeps the imported `stopwords` corpus module from being
# overwritten by its own word list.
english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))

# Drop stop words from each token list, then rebuild a cleaned lyrics string.
db['tokens_raw'] = db['tokens_raw'].apply(
    lambda tokens: [w for w in tokens if w not in english_stopwords]
)
db['lyrics_clean'] = db['tokens_raw'].apply(lambda tokens: ' '.join(tokens))

db.head(10)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned lyrics; the target is the genre tag.
X, y = db['lyrics_clean'].copy(), db['tag'].copy()

# Hold out 25% of the songs for validation (fixed seed for repeatability).
splits = train_test_split(X, y, test_size=0.25, random_state=60)
X_train_raw, X_val_raw, y_train, y_val = splits

print(X_train_raw)
print(y_train)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#import seaborn as sns

#tfidf_vectorizer = TfidfVectorizer()
#X_train = tfidf_vectorizer.fit_transform(X_train_raw).toarray()

# an alternative is to use binary word-presence (one-hot) features instead of TF-IDF weights:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training lyrics only and encode them as a
# binary bag of words (binary=True records presence, not counts).
one_hot_vectorizer = CountVectorizer(binary=True)
X_train = one_hot_vectorizer.fit_transform(X_train_raw)

train_shape = X_train.shape
print(f"X_train.shape = {train_shape}")
type(X_train)

In [None]:
# Encode the validation lyrics with the vocabulary learned on the training set.
# The result is kept sparse, like X_train: a dense songs x vocabulary array is
# almost entirely zeros and can exhaust memory on a large vocabulary.
X_val = one_hot_vectorizer.transform(X_val_raw)
print(f"X_val.shape = {X_val.shape}")
type(X_val)

In [None]:
# Report how many songs carry each genre tag.
for genre in ('pop', 'rap', 'rock', 'rb', 'country'):
    songs_in_genre = db.loc[db['tag'] == genre]
    print(f"{genre} songs = {len(songs_in_genre)}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score


# Fit one k-nearest-neighbours classifier for each k in 1..9 and report how
# well it reproduces the training labels.
genre_labels = ['pop', 'rap', 'rock', 'rb', 'country']

models = []
for i in range(1,10):
    model = KNeighborsClassifier(n_neighbors=i)
    model = model.fit(X_train, y_train)

    predictions_train = model.predict(X_train)

    # Without `labels`, confusion_matrix orders rows/columns by sorted label
    # (country, pop, rap, rb, rock), which does not match the display order.
    # Passing the same list to both keeps the axis names on the right cells.
    disp = ConfusionMatrixDisplay(
        confusion_matrix(y_train, predictions_train, labels=genre_labels),
        display_labels=genre_labels,
    )
    disp.plot()
    print(f"accuracy ({i}): {accuracy_score(y_train, predictions_train):.4f}")
    models.append(model)

In [None]:
from sklearn.metrics import log_loss

def tuneModel(hyperparam_value):
    """Fit a k-nearest-neighbours classifier and score it on train and validation data.

    Uses the notebook-level X_train, y_train, X_val and y_val. As a side
    effect it plots the confusion matrix of the fitted model on the training
    set and prints the validation accuracy.

    Args:
        hyperparam_value: value passed to KNeighborsClassifier as n_neighbors.

    Returns:
        Tuple (train_loss, val_loss, train_acc, val_acc), where the losses are
        log-losses and the accuracies are plain classification accuracies.
    """
    genre_labels = ['pop', 'rap', 'rock', 'rb', 'country']

    knn_model = KNeighborsClassifier(n_neighbors=hyperparam_value)
    knn_model.fit(X_train, y_train)

    y_train_pred_prob = knn_model.predict_proba(X_train)
    y_train_pred = knn_model.predict(X_train)
    y_val_pred_prob = knn_model.predict_proba(X_val)
    y_val_pred = knn_model.predict(X_val)

    train_loss = log_loss(y_train, y_train_pred_prob, labels=genre_labels)
    train_acc = accuracy_score(y_train, y_train_pred)
    val_loss = log_loss(y_val, y_val_pred_prob, labels=genre_labels)
    val_acc = accuracy_score(y_val, y_val_pred)

    # Build the matrix from THIS model's predictions, and pass `labels` so the
    # row/column order matches display_labels (the default is sorted order).
    disp = ConfusionMatrixDisplay(
        confusion_matrix(y_train, y_train_pred, labels=genre_labels),
        display_labels=genre_labels,
    )
    disp.plot()
    disp.ax_.set_title(f"train confusion matrix (n_neighbors={hyperparam_value})")
    print(f"accuracy ({hyperparam_value}): {val_acc}")
    return (train_loss, val_loss, train_acc, val_acc)

# Candidate hyper-parameter values to evaluate: the integers 1 through 9.
hyp_param_vals = range(1,10)

# One (train_loss, val_loss, train_acc, val_acc) tuple per candidate value,
# in the same order as hyp_param_vals.
metrics = [tuneModel(hp) for hp in hyp_param_vals]

In [None]:
import matplotlib.pyplot as plt
# Loss versus number of neighbours. plt.subplots keeps the default margins so
# tick and axis labels stay inside the figure (axes spanning [0, 0, 1, 1]
# leave no room for them).
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(hyp_param_vals, [metric[1] for metric in metrics], '--ro', label="Validation Loss")
ax.plot(hyp_param_vals, [metric[0] for metric in metrics], '--bo', label="Train Loss")
ax.legend(loc=1)
ax.set_xticks(hyp_param_vals)
# The swept hyper-parameter is KNN's n_neighbors, not a forest's n_estimators.
ax.set(xlabel="n_neighbors", ylabel="loss (lower is better)")
plt.show()

In [None]:
# Accuracy versus number of neighbours. plt.subplots keeps the default margins
# so tick and axis labels stay inside the figure (axes spanning [0, 0, 1, 1]
# leave no room for them).
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(hyp_param_vals, [metric[3] for metric in metrics], '--ro', label="Validation Accuracy")
ax.plot(hyp_param_vals, [metric[2] for metric in metrics], '--bo', label="Train Accuracy")
ax.legend(loc=4)
ax.set_xticks(hyp_param_vals)
# The swept hyper-parameter is KNN's n_neighbors, not a forest's n_estimators.
ax.set(xlabel="n_neighbors", ylabel="accuracy (higher is better)")
plt.show()