<a href="https://colab.research.google.com/github/cmurray1716/ml_group_project_0425/blob/main/Random_Forrest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Loading in dataset
# Colab-only step: google.colab is available only inside a Colab runtime.
from google.colab import files
# Prompts for a local file to upload; in the recorded run this saved combinedDF.csv
# into the working directory. The returned value is not used by the cells shown below.
uploaded = files.upload()

Saving combinedDF.csv to combinedDF.csv


This analysis was run on Google Colab to take advantage of its cloud-based computational resources. Running these models locally on my laptop was not feasible because of hardware limitations (e.g., insufficient RAM/GPU for large datasets or for expensive steps such as `RandomForestClassifier` training with hyperparameter tuning). In addition, some cells (e.g., model training and `RandomizedSearchCV`) take a long time to run, so they were not re-executed because of time constraints. The final model outputs (e.g., evaluation metrics and confusion matrices) are nevertheless preserved and documented below, so the results can be reviewed without re-running the notebook.

In [6]:
# List the working directory to confirm the uploaded CSV is present.
!ls

combinedDF.csv	sample_data


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, FeatureHasher
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks, CondensedNearestNeighbour
from imblearn.pipeline import Pipeline

# Read the uploaded CSV and normalise the lyric text to lowercase in one chained step.
df = pd.read_csv('combinedDF.csv').assign(
    Lyrics=lambda frame: frame["Lyrics"].str.lower()
)

# Quick sanity check of the first few rows.
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'combinedDF.csv'

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.30, random_state=1000)

# Lyric text is the model input and the genre column is the label.
trainx, trainy = train_df["Lyrics"].values, train_df["Genre"].values
testx, testy = val_df["Lyrics"].values, val_df["Genre"].values

# TF-IDF features with manually chosen settings (example values).
vectorizer = TfidfVectorizer(min_df=0.2, max_features=1000)
# Fit the vocabulary on the training text only, then reuse it for the held-out text.
trainx_v = vectorizer.fit_transform(trainx)
testx_v = vectorizer.transform(testx)


In [9]:
# Baseline random forest on the TF-IDF features, using hand-picked hyperparameters.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    # Build the trees on all available cores; the serial fit was slow enough to be
    # interrupted in the recorded run. random_state still fixes the seed.
    n_jobs=-1,
)

# Train model
rf.fit(trainx_v, trainy)

# Predict on the held-out split
predictions = rf.predict(testx_v)

# Evaluate performance. The weighted averages account for the uneven genre sizes.
# zero_division=0 scores a genre that is never predicted as 0 (the value the default
# falls back to) without emitting an UndefinedMetricWarning for each call.
print("Confusion Matrix:")
print(confusion_matrix(testy, predictions))
print("Accuracy Score:")
print(accuracy_score(testy, predictions))
print("Precision Score:")
print(precision_score(testy, predictions, average="weighted", zero_division=0))
print("Recall Score:")
print(recall_score(testy, predictions, average="weighted", zero_division=0))
print("F1 Score:")
print(f1_score(testy, predictions, average="weighted", zero_division=0))

KeyboardInterrupt: 

In [10]:


# Text-to-prediction pipeline: TF-IDF features feeding a random forest.
pipeline_rf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("rf", RandomForestClassifier(random_state=42)),
])

# Candidate values for the search, keyed as <step name>__<parameter>.
param_dist_rf = {
    "rf__n_estimators": [100, 150, 200],
    "rf__max_depth": np.arange(5, 16),
    "rf__min_samples_leaf": [5, 10, 20, 50, 100],
    "rf__criterion": ["gini", "entropy"],
    "tfidf__min_df": np.linspace(0.01, 0.5, num=10),
    "tfidf__max_features": np.arange(200, 1100, 200),
}

# Randomised search: 3 sampled candidates, each scored with 3-fold cross-validation,
# run in parallel on all available cores.
random_search_rf = RandomizedSearchCV(
    pipeline_rf,
    param_dist_rf,
    n_iter=3,
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)

# The pipeline vectorises inside each fold, so it is fitted on the raw lyric text.
random_search_rf.fit(trainx, trainy)

# Report the winning candidate and its mean cross-validated score.
print("Best Parameters:", random_search_rf.best_params_)
print("Best Score:", random_search_rf.best_score_)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters: {'tfidf__min_df': np.float64(0.2822222222222222), 'tfidf__max_features': np.int64(1000), 'rf__n_estimators': 100, 'rf__min_samples_leaf': 5, 'rf__max_depth': np.int64(15), 'rf__criterion': 'entropy'}
Best Score: 0.5717203054480021


In [11]:

# Best parameters from RandomizedSearchCV, copied from the recorded search output so
# this cell does not require re-running the search.
best_params = {
    'tfidf__min_df': 0.2822222222222222,
    'tfidf__max_features': 1000,
    'rf__n_estimators': 100,
    'rf__min_samples_leaf': 5,
    'rf__max_depth': 15,
    'rf__criterion': 'entropy'
}

# Define the final model pipeline. The keys in best_params already use the
# <step name>__<parameter> form, so set_params applies them directly instead of
# repeating each lookup by hand.
final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42)),
]).set_params(**best_params)

# Fit on training data (raw lyric text; the pipeline handles vectorisation)
final_pipeline.fit(trainx, trainy)

# Predict on test data
predictions = final_pipeline.predict(testx)

# Print evaluation metrics.
# Several genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0 (the value the default falls back to) without the
# UndefinedMetricWarning lines that cluttered the recorded output.
print("Confusion Matrix:")
print(confusion_matrix(testy, predictions))
print("\nClassification Report:")
print(classification_report(testy, predictions, zero_division=0))
print("Accuracy Score:", accuracy_score(testy, predictions))
print("Precision Score:", precision_score(testy, predictions, average='weighted', zero_division=0))
print("Recall Score:", recall_score(testy, predictions, average='weighted', zero_division=0))
print("F1 Score:", f1_score(testy, predictions, average='weighted', zero_division=0))


Confusion Matrix:
[[    0     0     0     0     0     2     0    85     0   731]
 [    0     0     0     0     0     1     4   156     0   630]
 [    0     0     3     0     0    29    10   193     0  2328]
 [    0     0     0     0     0     0     0   783     0   177]
 [    0     0     1     0    40    10     2   339     0  1971]
 [    0     0     2     0     0  1337     6   353     0  2471]
 [    0     0     0     0     0     4   362   671     0  4916]
 [    0     0     0     0     0   106    22 13974     1 12091]
 [    0     0     0     0     0     5     1   320     0   657]
 [    0     0     0     0     0   109    62  3139     0 29335]]

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

     Country       0.00      0.00      0.00       818
  Electronic       0.00      0.00      0.00       791
        Folk       0.50      0.00      0.00      2563
     Hip-Hop       0.00      0.00      0.00       960
       Indie       1.00      0.02      0.03      2363
        Jazz       0.83      0.32      0.46      4169
       Metal       0.77      0.06      0.11      5953
         Pop       0.70      0.53      0.60     26194
         R&B       0.00      0.00      0.00       983
        Rock       0.53      0.90      0.67     32645

    accuracy                           0.58     77439
   macro avg       0.43      0.18      0.19     77439
weighted avg       0.61      0.58      0.52     77439

Accuracy Score: 0.5817611281137411


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision Score: 0.6110799446422811
Recall Score: 0.5817611281137411
F1 Score: 0.5204983746868485
