In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score
import mlflow
from mlflow import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings

# Read the preprocessed, filtered dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="preprocessed_filtered.csv")

In [None]:
# Read the custom stop-word list: one entry per line of the text file.
with open("stopwords.txt", "r") as stopword_file:
    stopwords = stopword_file.read().splitlines()

# Vectorize the lyrics into a dense TF-IDF frame capped at 100 vocabulary terms.
print("Vectorizing lyrics...")
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords)
tfidf = vectorizer.fit_transform(df['lyrics'])
tfidf_df = pd.DataFrame(
    tfidf.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df_reset = tfidf_df.reset_index(drop=True)

# Rescale the numeric text statistics to the [0, 1] range.
scaler = MinMaxScaler()
df_features = df[['word_count', 'unique_word_count', 'average_word_length']]
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_features),
    columns=df_features.columns,
)
df_reset = df_scaled.reset_index(drop=True)

# One-hot encode the language column.
language_dummies = pd.get_dummies(df['language'], prefix='language')
language_dummies_reset = language_dummies.reset_index(drop=True)

# Put targets (genre, interpreter), year and every engineered feature side by
# side, then discard any row that still has a missing value.
X = pd.concat(
    [
        df[['genre', 'interpreter', 'year']],
        df_reset,
        tfidf_df_reset,
        language_dummies_reset,
    ],
    axis=1,
).dropna()

Vectorizing lyrics...


In [None]:
# Display the assembled frame (targets, year and all engineered features) for a visual check.
X

In [None]:
def _top_k_accuracy(ranked_classes, true_indices, k):
    """Return the share of rows whose true class index is among the k best-ranked classes.

    Parameters
    ----------
    ranked_classes : 2-D integer array with one row per sample, holding class
        column indices sorted by ascending predicted probability.
    true_indices : 1-D integer array with the class column index of each true
        label; -1 marks a label the model does not know, which never matches.
    k : number of highest-probability classes to consider.
    """
    top_k = ranked_classes[:, -k:]
    return np.mean(np.any(top_k == true_indices[:, None], axis=1))


def train_and_log(model, model_name, X_train, y_train, X_test, y_test, target):
    """Cross-validate, fit and evaluate ``model`` inside a single MLflow run.

    Within the run this function:
      * logs each of the 5 cross-validation scores on the training data and
        their mean,
      * fits the model on the training data and logs it as an MLflow model,
      * writes the test split to ``test_data_{model_name}.csv`` in the working
        directory and logs that file as an artifact,
      * logs accuracy and weighted precision / recall / F1 on the test split,
      * logs the model class, the target name and, when the model exposes
        ``best_params_`` (e.g. a fitted grid search), the selected parameters,
      * logs top-3 and top-5 accuracy when the model offers ``predict_proba``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    model_name : str used for the run name, model artifact path and CSV name.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels; both must be pandas objects
        because they are concatenated column-wise for the CSV artifact.
    target : str describing the predicted column; logged as a parameter.

    Returns
    -------
    None
    """
    with mlflow.start_run(run_name=f"{model_name} Model") as run:
        scores = cross_val_score(model, X_train, y_train, cv=5)
        for i, score in enumerate(scores, start=1):
            mlflow.log_metric(f"cross_val_score_{i}", score)
        mlflow.log_metric("average_cross_val_score", scores.mean())

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        acc = accuracy_score(y_test, y_pred)
        csv_path = f"test_data_{model_name}.csv"
        pd.concat([X_test, y_test], axis=1).to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path)

        print(f"Accuracy: {acc}")
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("recall", recall_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("f1", f1_score(y_test, y_pred, average='weighted'))
        mlflow.log_param("model_class", type(model).__name__)

        # Only search wrappers carry best_params_; a plain estimator must not
        # crash the run after all of its metrics have already been logged.
        best_params = getattr(model, "best_params_", None)
        if best_params is not None:
            mlflow.log_params(best_params)
        mlflow.log_param("target", target)

        # Check if the model supports predict_proba
        if hasattr(model, 'predict_proba'):
            y_probs = model.predict_proba(X_test)
            # Rank the classes once per row; every top-k slice reuses this.
            ranked_classes = np.argsort(y_probs, axis=1)
            # Map each label to its probability column. A test label the model
            # never saw cannot be among its predictions, so it is mapped to -1
            # and counted as a miss instead of raising.
            column_of = {label: idx for idx, label in enumerate(model.classes_)}
            true_indices = np.array(
                [column_of.get(label, -1) for label in np.asarray(y_test)],
                dtype=int,
            )
            for k in (3, 5):
                top_k_accuracy = _top_k_accuracy(ranked_classes, true_indices, k)
                print(f"Top-{k} Accuracy: {top_k_accuracy}")
                mlflow.log_metric(f"top{k}_accuracy", top_k_accuracy)
        else:
            print(f"The {model_name} model does not support predict_proba.")

In [None]:
warnings.filterwarnings('ignore')

# Hyper-parameter grids explored by the grid searches below.
svc_params = {'C': [0.1, 1, 10]}
mnb_params = {'alpha': [0.01, 0.1, 1]}
logreg_params = {'C': [0.1, 1, 10]}

# Create the models
svc = GridSearchCV(SVC(kernel='linear', probability=True), svc_params, cv=5)
mnb = GridSearchCV(MultinomialNB(), mnb_params, cv=5)
logreg = GridSearchCV(LogisticRegression(max_iter=2000), logreg_params, cv=5)

models = [mnb, svc, logreg]
# Exactly one display name per entry of `models`, in the same order. zip()
# silently truncates to the shorter list, so a longer name list would attach
# the wrong name to a model (the logistic regression was being logged as "MLP").
model_names = ["Multinomial Naive Bayes", "Linear SVC", "Logistic Regression"]
assert len(models) == len(model_names), "every model needs exactly one name"

mlflow.set_tracking_uri('http://localhost:5000/')

# Split features and both targets in ONE call so the genre and interpreter
# labels are guaranteed to refer to the same train/test rows.
(
    X_train,
    X_test,
    y_train_genre,
    y_test_genre,
    y_train_interpreter,
    y_test_interpreter,
) = train_test_split(
    X.drop(['genre', 'interpreter'], axis=1),
    X['genre'],
    X['interpreter'],
    test_size=0.2,
    random_state=42,
)

for model, model_name in zip(models, model_names):
    train_and_log(model, f"Genre {model_name}", X_train, y_train_genre, X_test, y_test_genre, "genre")
    train_and_log(model, f"Interpreter {model_name}", X_train, y_train_interpreter, X_test, y_test_interpreter, "interpreter")

Accuracy: 0.5840458924125987
Top-3 Accuracy: 0.8774759173070679
Top-5 Accuracy: 0.9607641519644983
Accuracy: 0.06082909405779846
Top-3 Accuracy: 0.13004654183353176
Top-5 Accuracy: 0.17209654724537288
