In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
import mlflow
from mlflow import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings

# Load the dataset from a CSV file in the working directory. Later cells read
# columns such as 'lyrics', 'genre', 'interpreter', 'year' and 'language' from it.
df = pd.read_csv("preprocessed_filtered.csv")

In [2]:
# Read the stop-word list: one entry per line of stopwords.txt.
with open("stopwords.txt", "r") as f:
    stopwords = f.read().splitlines()

# Vectorize the lyrics: keep the 100 highest-ranked TF-IDF terms, ignoring stop words.
# NOTE(review): the vectorizer and the scaler below are fitted on every row of df.
# If a hold-out split is taken from X afterwards, the held-out rows have already
# influenced these transforms - confirm this is acceptable for the evaluation.
print("Vectorizing lyrics...")
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords)

tfidf = vectorizer.fit_transform(df['lyrics'])
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Min-max scale the numeric lyric statistics.
scaler = MinMaxScaler()
df_features = df[['word_count', 'unique_word_count', 'average_word_length']]
df_scaled = pd.DataFrame(scaler.fit_transform(df_features), columns=df_features.columns)

# One-hot encode the language column.
language_dummies = pd.get_dummies(df['language'], prefix='language')

# pd.concat(axis=1) aligns rows by index label, so every piece is given the same
# fresh 0..n-1 index before concatenation. The label columns taken straight from
# df are reset as well: otherwise a non-default index on df would misalign them
# against the other pieces and introduce NaN rows that dropna() silently removes.
labels_reset = df[['genre', 'interpreter', 'year']].reset_index(drop=True)
df_reset = df_scaled.reset_index(drop=True)
tfidf_df_reset = tfidf_df.reset_index(drop=True)
language_dummies_reset = language_dummies.reset_index(drop=True)

# Targets (genre, interpreter) and features live side by side in X; rows with
# any missing value are discarded.
X = pd.concat([labels_reset, df_reset, tfidf_df_reset, language_dummies_reset], axis=1)
X = X.dropna()

Vectorizing lyrics...


In [3]:
# Show the assembled frame of targets and features as the cell output
# (pandas abbreviates the rendering of large frames with '...').
X

Unnamed: 0,genre,interpreter,year,word_count,unique_word_count,average_word_length,alone,always,another,around,...,language_nl,language_no,language_pl,language_pt,language_ro,language_sl,language_so,language_sv,language_sw,language_tl
0,Rock,Fleetwood Mac,1968,0.033066,0.023276,0.122870,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False
1,Rock,David Cook,1970,0.098888,0.161207,0.152334,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False
2,Rock,David Cook,1970,0.099815,0.107759,0.147059,0.000000,0.0,0.0,0.107338,...,False,False,False,False,False,False,False,False,False,False
3,Rock,Beach Boys,1970,0.068912,0.066379,0.200800,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False
4,Rock,Beach Boys,1970,0.083436,0.080172,0.108128,0.041295,0.0,0.0,0.034969,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25340,Rock,Beach Boys,2016,0.009889,0.022414,0.191434,0.000000,0.0,0.0,0.571577,...,False,False,False,False,False,False,False,False,False,False
25341,Rock,Chicago,2016,0.010507,0.026724,0.167829,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False
25342,Rock,Electric Light Orchestra,2016,0.006799,0.015517,0.220431,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False
25343,Rock,Chicago,2016,0.006799,0.018103,0.179274,0.000000,0.0,0.0,0.000000,...,False,False,False,False,False,False,False,False,False,False


In [4]:
def _top_k_accuracy(y_true, y_probs, classes, k):
    """Return the fraction of rows whose true label is among the k most probable classes.

    Args:
        y_true: Iterable of true labels, one per row of ``y_probs``.
        y_probs: 2-D array-like of class probabilities (rows = samples,
            columns ordered like ``classes``).
        classes: Sequence of class labels giving the column order of ``y_probs``.
        k: Number of highest-probability classes to consider per row.

    Returns:
        The top-k accuracy as a Python float.
    """
    # Dict lookup instead of list.index(): O(1) per label, and a label that is
    # absent from `classes` maps to -1, which can never equal a column index,
    # so such rows count as misses instead of raising ValueError.
    column_of = {label: column for column, label in enumerate(classes)}
    true_columns = np.array([column_of.get(label, -1) for label in np.asarray(y_true)])
    # argsort is ascending, so the last k columns are the k most probable classes.
    top_k_columns = np.argsort(np.asarray(y_probs), axis=1)[:, -k:]
    hits = np.any(top_k_columns == true_columns[:, None], axis=1)
    return float(np.mean(hits))


def train_and_log(model, model_name, X_train, y_train, X_test, y_test, target):
    """Cross-validate, fit and evaluate a classifier, logging everything to one MLflow run.

    Inside a run named ``"{model_name} Model"`` this logs the five
    cross-validation scores and their mean (computed on the training data),
    fits the model on the full training data, logs the fitted model as an
    artifact, and logs accuracy plus weighted precision/recall/F1 on the test
    data. If the model exposes ``predict_proba``, top-3 and top-5 accuracy are
    printed and logged as well.

    Args:
        model: Estimator providing ``fit`` and ``predict`` (and optionally
            ``predict_proba`` together with ``classes_``).
        model_name: Name used for the run and the logged model artifact.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        target: Name of the predicted target; stored as a run parameter.

    Returns:
        None. Results are printed and sent to the active MLflow tracking server.
    """
    with mlflow.start_run(run_name=f"{model_name} Model"):
        # 5-fold cross-validation on the training split only.
        scores = cross_val_score(model, X_train, y_train, cv=5)
        for i, score in enumerate(scores, start=1):
            mlflow.log_metric(f"cross_val_score_{i}", score)
        mlflow.log_metric("average_cross_val_score", scores.mean())

        # Final fit on the whole training split, then evaluate on the test split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc}")
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("recall", recall_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("f1", f1_score(y_test, y_pred, average='weighted'))
        mlflow.log_param("model_class", type(model).__name__)
        mlflow.log_param("target", target)

        # Top-k metrics need per-class probabilities.
        if not hasattr(model, 'predict_proba'):
            print(f"The {model_name} model does not support predict_proba.")
            return

        # Predict probabilities once and reuse them for both values of k.
        y_probs = model.predict_proba(X_test)

        top3_accuracy = _top_k_accuracy(y_test, y_probs, model.classes_, 3)
        print(f"Top-3 Accuracy: {top3_accuracy}")
        mlflow.log_metric("top3_accuracy", top3_accuracy)

        top5_accuracy = _top_k_accuracy(y_test, y_probs, model.classes_, 5)
        print(f"Top-5 Accuracy: {top5_accuracy}")
        mlflow.log_metric("top5_accuracy", top5_accuracy)

In [5]:
# Silence all warnings for the remainder of the session so the training output stays readable.
warnings.filterwarnings('ignore')

# Candidate classifiers. SVC is built with probability=True so that it exposes
# predict_proba, which train_and_log uses for the top-k metrics.
mnb = MultinomialNB()
svc = SVC(kernel='linear', probability=True)
logreg = LogisticRegression(max_iter=2000)

models = [mnb, svc, logreg]
model_names = ["Multinomial Naive Bayes", "Linear SVC", "Logistic Regression"]
mlflow.set_tracking_uri('http://localhost:5000/')

# Split the features and both targets in a single call so all of them share
# exactly the same train/test partition (80/20, fixed seed).
features = X.drop(['genre', 'interpreter'], axis=1)
(
    X_train, X_test,
    y_train_genre, y_test_genre,
    y_train_interpreter, y_test_interpreter,
) = train_test_split(features, X['genre'], X['interpreter'], test_size=0.2, random_state=42)

# Persist one hold-out file per target. The interpreter set gets its own file
# name so it does not overwrite the genre file.
pd.concat([X_test, y_test_genre], axis=1).to_csv("test_data_genre.csv")
pd.concat([X_test, y_test_interpreter], axis=1).to_csv("test_data_interpreter.csv")

# Train and log every model once per target; each call is a separate MLflow run.
for model, model_name in zip(models, model_names):
    train_and_log(model, f"Genre {model_name}", X_train, y_train_genre, X_test, y_test_genre, "genre")
    train_and_log(model, f"Interpreter {model_name}", X_train, y_train_interpreter, X_test, y_test_interpreter, "interpreter")

Accuracy: 0.5570244672454617
Top-3 Accuracy: 0.7569060773480663
Top-5 Accuracy: 0.9102209944751382
Accuracy: 0.10951065509076559
Top-3 Accuracy: 0.22296764009471193
Top-5 Accuracy: 0.30603788476716653
Accuracy: 0.5224940805051302
Top-3 Accuracy: 0.888318863456985
Top-5 Accuracy: 0.9852012628255722
Accuracy: 0.21566692975532756
Top-3 Accuracy: 0.3855564325177585
Top-5 Accuracy: 0.48855564325177586
Accuracy: 0.563733228097869
Top-3 Accuracy: 0.8676006314127861
Top-5 Accuracy: 0.9682320441988951
Accuracy: 0.19514601420678768
Top-3 Accuracy: 0.3382004735595896
Top-5 Accuracy: 0.43429360694554064
