In [24]:
!pip install sklearn transformers

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

In [15]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [16]:
# Text cleaning function
def clean_text(text):
    """Normalise a piece of lyric text before vectorisation.

    Steps, in order: lowercase, drop digit runs, drop every character that is
    neither a word character nor whitespace, collapse whitespace runs to a
    single space and trim both ends.

    Args:
        text: The raw text. Non-string values (e.g. None or a NaN produced by a
            missing CSV cell) are treated as empty text instead of raising.

    Returns:
        The cleaned string; '' for empty or non-string input.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); without this guard one bad row
        # would abort a whole Series.apply(clean_text).
        return ''
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove punctuation. Note: '_' is a word character for \w, so it is kept.
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

In [17]:
# Load the dataset
# Raw string literal: the Windows path contains backslash pairs such as "\H" and "\d"
# that are invalid escape sequences in a normal string (warning today, error in future Python).
data = pd.read_csv(r"D:\HNDSE\Enterpunership Final Project\GitHub\music_ai_api-main\music_ai_api-main\Model\data_for_model_training.csv")

In [18]:
# Run every lyric through clean_text and keep the result in a new column
data['cleaned_lyrics'] = data['lyrics'].map(clean_text)

In [19]:
# Preview the first five rows, including the new cleaned column
data.head(n=5)

Unnamed: 0,genre,lyrics,cleaned_lyrics
0,pop,hold time feel break feel untrue convince spea...,hold time feel break feel untrue convince spea...
1,pop,believe drop rain fall grow believe darkest ni...,believe drop rain fall grow believe darkest ni...
2,pop,sweetheart send letter goodbye secret feel bet...,sweetheart send letter goodbye secret feel bet...
3,pop,kiss lips want stroll charm mambo chacha merin...,kiss lips want stroll charm mambo chacha merin...
4,pop,till darling till matter know till dream live ...,till darling till matter know till dream live ...


In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_lyrics'],
    data['genre'],
    random_state=42,
    test_size=0.2,
)


In [21]:
# Two-stage text classifier: TF-IDF features (unigrams + bigrams) feeding a random forest
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.75),
    ),
    (
        'classifier',
        RandomForestClassifier(random_state=42),
    ),
])

In [22]:
# Deliberately small search space: two candidate values per hyperparameter.
# Keys follow the "<pipeline step>__<parameter>" naming used by the pipeline steps.
param_distributions = dict(
    # Vectoriser settings
    tfidf__max_df=[0.75, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    # Random-forest settings
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

In [23]:
# Cheap randomized search: 5 sampled candidates x 3 CV folds, at most 2 worker processes
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    cv=3,
    n_iter=5,
    n_jobs=2,
    random_state=42,
    verbose=1,
)

In [24]:
# Keep the search fast: draw a seeded 50% sample of the training texts,
# then pick the labels that share the sampled rows' index
X_train_sample = X_train.sample(
    random_state=42,
    frac=0.5,
)
y_train_sample = y_train.loc[X_train_sample.index]

In [25]:
# Run the hyperparameter search on the down-sampled training data
random_search.fit(X=X_train_sample, y=y_train_sample)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [26]:
# Print the best parameters found by the randomized search
# (one value per key of param_distributions)
print(f"Best parameters: {random_search.best_params_}")

Best parameters: {'tfidf__ngram_range': (1, 2), 'tfidf__max_df': 0.75, 'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': None}


In [27]:
# Use the best model: keep a handle on the search's best estimator;
# the evaluation, saving and classify_lyrics cells below all use `best_model`
best_model = random_search.best_estimator_

In [28]:
# Test the best model on the full test data
# (the held-out X_test / y_test from the train/test split, not the 50% training sample)
best_predictions = best_model.predict(X_test)
print(f"Best model accuracy: {accuracy_score(y_test, best_predictions)}")

Best model accuracy: 0.37832599118942734


In [30]:
# Save the trained model to a .pkl file
import joblib  # Import joblib to save the model
# Raw string literal: the Windows path contains backslash pairs such as "\H" and "\l"
# that are invalid escape sequences in a normal string (warning today, error in future Python).
joblib.dump(best_model, r'D:\HNDSE\Enterpunership Final Project\GitHub\music_ai_api-main\music_ai_api-main\Model\lyrics_genre_classifier1.pkl')
print("Model saved as lyrics_genre_classifier1.pkl")

Model saved as lyrics_genre_classifier1.pkl


In [31]:
# Function to classify new lyrics
def classify_lyrics(lyrics):
    """Predict the genre label for one piece of lyric text.

    The text is passed through clean_text (the same cleaning applied to the
    training column) and then handed to the module-level `best_model`.

    Args:
        lyrics: Raw lyric text.

    Returns:
        The first (and only) element of the model's prediction for this text.
    """
    # predict() expects a collection of documents, so wrap the single text in a list
    batch = [clean_text(lyrics)]
    predicted = best_model.predict(batch)
    return predicted[0]

In [38]:
# Example usage
# NOTE(review): the sample input below is the single word "pop", not a passage of lyrics
lyrics_input = "pop"
genre_pred = classify_lyrics(lyrics_input)
print(f"Predicted genre: {genre_pred}")

Predicted genre: jazz
