In [18]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


In [1]:
import pandas as pd
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

In [None]:
# Fetch the NLTK corpora/models used by the imported helpers
# (word_tokenize, stopwords, WordNetLemmatizer). `quiet=True` suppresses the
# download log; each call returns True on success.
nltk.download('punkt', quiet=True)
# NLTK >= 3.8.2 (including the 3.9.1 installed above) loads the tokenizer from
# the 'punkt_tab' resource; without it word_tokenize raises LookupError.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [2]:
# Sanity check: is a saved label-to-genre mapping present in the Model folder?
# The boolean is the cell's last expression, so the notebook displays it.
label_map_path = '../Model/label_to_genre.joblib'
os.path.exists(label_map_path)

True

In [21]:
# Path of the lyrics dataset, relative to the notebook's working directory.
csv_path = '../Data/Data.csv'

# Read the CSV into `lyrics_analyze`; if the file is absent, stop right away
# with a message that names the path that was tried.
if os.path.exists(csv_path):
    lyrics_analyze = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"File not found: {csv_path}. Please check the path and filename.")

In [24]:
def clean_and_prepare_data(filepath=None):
    """Load the lyrics CSV and return a cleaned frame ready for training.

    Parameters
    ----------
    filepath : str or path-like, optional
        CSV file to read. When omitted, the notebook-level ``csv_path`` is
        looked up at call time (so a later change to ``csv_path`` is honoured
        without re-running this definition).

    Returns
    -------
    pandas.DataFrame
        Columns ``lyrics`` (cleaned text), ``genre`` (normalised name) and
        ``genre_label`` (integer category code, always >= 0). The original
        row index of the surviving rows is kept.

    Raises
    ------
    KeyError
        If the file has no ``lyrics`` column, or neither a ``type`` nor a
        ``genre`` column.
    """
    if filepath is None:
        filepath = csv_path

    def clean_lyrics_text(text):
        """Strip URLs and [bracketed] tags, then collapse whitespace."""
        if not isinstance(text, str):
            # Missing / non-text cells become empty strings; the minimum
            # word-count filter below removes them.
            return ""
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    genre_mapping = {'rap & hip hop': 'hiphop', 'R&B': 'rnb'}

    raw = pd.read_csv(filepath).rename(columns={'type': 'genre'})
    # Explicit copy: the subset is modified below, and writing into a slice
    # of `raw` would trigger pandas' SettingWithCopy behaviour.
    df = raw[['lyrics', 'genre']].copy()
    df['genre'] = df['genre'].replace(genre_mapping)
    df['lyrics'] = df['lyrics'].apply(clean_lyrics_text)

    # A missing genre would be encoded as category code -1 and silently
    # trained as its own class, so those rows are discarded.
    df = df.dropna(subset=['genre'])
    df = df.drop_duplicates(subset=['lyrics'])
    # Keep only lyrics with at least five whitespace-separated tokens.
    df = df[df['lyrics'].str.split().str.len() >= 5].copy()

    # Codes follow the sorted order of the genre names present in `df`.
    df['genre_label'] = df['genre'].astype('category').cat.codes

    return df


In [None]:
def train_and_save_model(df, model_dir='../Model'):
    """Fit a TF-IDF + logistic-regression pipeline on every row and persist it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lyrics`` (text used as features), ``genre_label``
        (integer target) and ``genre`` (name belonging to each label).
    model_dir : str, default '../Model'
        Directory that receives both artefacts; created when missing.

    Side effects
    ------------
    Writes ``logistic_regression_pipeline.joblib`` (the fitted pipeline) and
    ``label_to_genre.joblib`` (dict mapping label -> genre name) into
    ``model_dir`` and prints progress messages. Returns ``None``.

    Notes
    -----
    The model is fitted on the full frame; no hold-out evaluation happens
    here.
    """
    X = df['lyrics']
    y = df['genre_label']
    final_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            stop_words='english',
            max_features=7500,
            ngram_range=(1, 1),
            norm='l2'
        )),
        ('clf', LogisticRegression(
            C=1,
            max_iter=2000,
            random_state=42,
            solver='liblinear'
        ))
    ])

    print("Training the final model pipeline...")
    final_pipeline.fit(X, y)
    print("Training complete.")

    # joblib.dump does not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)

    model_filename = f'{model_dir}/logistic_regression_pipeline.joblib'
    print(f"Saving model to {model_filename}...")
    joblib.dump(final_pipeline, model_filename)
    print("Model saved successfully!")

    # Build the mapping from the (label, genre) pairs actually present in the
    # training frame. Re-deriving category codes from `genre` could disagree
    # with `genre_label` if rows were filtered after the labels were assigned.
    label_pairs = (
        df[['genre_label', 'genre']]
        .dropna(subset=['genre'])
        .drop_duplicates()
        .sort_values('genre_label')
    )
    label_to_genre = {
        int(code): genre
        for code, genre in zip(label_pairs['genre_label'], label_pairs['genre'])
    }

    # Stored next to the model so that both artefacts are found together
    # (previously this file landed in the current working directory).
    mapping_filename = f'{model_dir}/label_to_genre.joblib'
    joblib.dump(label_to_genre, mapping_filename)
    print(f"Label mapping saved to {mapping_filename}")

In [None]:
# End-to-end run: build the cleaned lyrics frame from the default CSV, then
# fit the pipeline on it and write the artefacts to disk.
lyrics_df = clean_and_prepare_data()
train_and_save_model(df=lyrics_df)

Training the final model pipeline...
Training complete.
Saving model to ../Model/logistic_regression_pipeline.joblib...
Model saved successfully!
Label mapping saved to label_to_genre.joblib
