## Combined XGB

In [2]:
import pandas as pd
import xgboost as xgb

data = pd.read_csv('extracted_features_combined.csv')

In [3]:
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Labels (the last column)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)


In [None]:
from sklearn.metrics import roc_auc_score
model = xgb.XGBClassifier(subsample = 1.0, n_estimators = 400, max_depth = 7, learning_rate = 0.05, gamma = 0, colsample_bytree = 0.5, n_jobs = -1)

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Make probability predictions on the testing data
y_prob = model.predict_proba(X_test)

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("AUC Score:", auc_score)

In [5]:
from sklearn.metrics import roc_auc_score
model = xgb.XGBClassifier()

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Make probability predictions on the testing data
y_prob = model.predict_proba(X_test)

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("AUC Score:", auc_score)

Accuracy: 0.5927469779074614
AUC Score: 0.8995339662168482


Basic XGBoost of extracted_features_combined

Accuracy: 0.6053783614759225

AUC Score: 0.902076825823791

In [None]:
import matplotlib.pyplot as plt

# Define the number of top features to display
top_n = 50  # Adjust this number as needed

# Get the top N feature names and their importance scores
top_features = features[:top_n]
top_importance_scores = importance_scores[:top_n]

# Plot the top N feature importance scores
plt.figure(figsize=(12, 8))
plt.barh(top_features, top_importance_scores, color='skyblue')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.title('Top {} Feature Importance Scores'.format(top_n))
plt.gca().invert_yaxis()  # Invert y-axis to display highest importance at the top
plt.show()


In [None]:
# Find features with 0 importance
zero_importance_features = [feature for feature, importance in sorted_feature_importance if importance == 0]

# Print features with 0 importance
print("Features with 0 importance:")
for feature in zero_importance_features:
    print(feature)


## TESTING

In [None]:
from sklearn.metrics import roc_auc_score
model = xgb.XGBClassifier()

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Get the best estimator
best_estimator = random_search.best_estimator_

# Make predictions on the testing data using the best estimator
y_prob_best = best_estimator.predict_proba(X_test)

# Calculate AUC score using the best estimator
auc_score_best = roc_auc_score(y_test, y_prob_best, multi_class='ovr')
print("Best AUC Score:", auc_score_best)




Best Parameters: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01}
Best AUC Score: 0.868186778204652


## Wav XGB

In [None]:
import pandas as pd
import xgboost as xgb

data = pd.read_csv('extracted_features_wav2vec.csv')

In [None]:
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Labels (the last column)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Define the XGBoost model
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Make probability predictions on the testing data
y_prob = model.predict_proba(X_test)

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("AUC Score:", auc_score)

Accuracy: 0.4675
AUC Score: 0.8180124506258732


Basic XGBoost of wav2vec

Accuracy: 0.4675

AUC Score: 0.8180124506258732

## dft_features

In [None]:
import pandas as pd
import xgboost as xgb

data = pd.read_csv('dft_features.csv')
X = data.iloc[:, 2:]  # Features (all columns except the first two)
y = data.iloc[:, 1]   # Labels (the second column)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

from sklearn.metrics import roc_auc_score
model = xgb.XGBClassifier()

from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=10, cv=3, scoring='roc_auc', n_jobs=-1)

# Fit the randomized search to the data
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# Get the best estimator
best_estimator = random_search.best_estimator_

# Make predictions on the testing data using the best estimator
y_prob_best = best_estimator.predict_proba(X_test)

# Calculate AUC score using the best estimator
auc_score_best = roc_auc_score(y_test, y_prob_best, multi_class='ovr')
print("Best AUC Score:", auc_score_best)




Best Parameters: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1}
Best AUC Score: 0.8510608477570046


In [None]:
import pandas as pd
import xgboost as xgb

data = pd.read_csv('dft_features.csv')
X = data.iloc[:, 2:]  # Features (all columns except the first two)
y = data.iloc[:, 1]   # Labels (the second column)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
model = xgb.XGBClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Make probability predictions on the testing data
y_prob = model.predict_proba(X_test)

# Calculate AUC score
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("AUC Score:", auc_score)

Accuracy: 0.4978111319574734
AUC Score: 0.8456550929359332


Basic XGBoost of dft_features

Accuracy: 0.4978111319574734

AUC Score: 0.8456550929359332

## KNN

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier

# Initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors = 8)

# Train the KNN classifier
knn.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_knn_top_8 = knn.predict(X_test)

# Calculate accuracy
accuracy_knn_top_8 = accuracy_score(y_test, y_pred_knn_top_8)
print("Accuracy of KNN with top 8 features:", accuracy_knn_top_8)


## Feature Boosting

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
data_df =pd.read_csv("extracted_features_combined.csv")

X = data_df.drop(['track_id','genre'], axis=1)  # Drop the 'Id' column and target variable 'Y' to create feature matrix X
y = data_df['genre']  # Target variable

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Encoding the target variable if it's categorical
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Ensure 'Genre' is the target and it's not part of X_train
if 'genre' in X_train.columns:
    X_train = X_train.drop('genre', axis=1)
if 'genre' in X_val.columns:
    X_test = X_val.drop('genre', axis=1)

# Check data types
print(X_train.dtypes)

0       float64
1       float64
2       float64
3       float64
4       float64
         ...   
1180    float64
1181    float64
1182    float64
1183    float64
1184    float64
Length: 1185, dtype: object


In [None]:
!pip install CatBoost
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier  # Import CatBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint, uniform

# Define final estimator
final_estimator = LogisticRegression(random_state=42)

# Initialize the classifiers
catboost_clf = CatBoostClassifier(verbose=0, random_state=42)  # Initialize CatBoost, turn off verbose output
adaboost_clf = AdaBoostClassifier(n_estimators=100)

# Define base models
estimators = [
    ('catboost', catboost_clf),
    ('adaboost', adaboost_clf)
]

# Define stacking model
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)

param_grid = {
    'catboost__depth': randint(4, 10),
    'final_estimator__C': uniform(0.1, 10),
    'adaboost__n_estimators': randint(50, 500),
    'adaboost__learning_rate': uniform(0.01, 1)
}

# Initialize RandomizedSearchCV
rand_search = RandomizedSearchCV(stacking_classifier, param_grid, cv=2, n_iter=5, scoring='roc_auc', n_jobs=-1, random_state=42, verbose=2)

# Assuming X_train and y_train are your training dataset
rand_search.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
best_model = rand_search.best_estimator_
y_pred = best_model.predict(X_train)
print("Best parameters:", rand_search.best_params_)
print("ROC_AUC Score: ", rand_search.best_score_)
print(accuracy_score(y_test, y_pred))

Collecting CatBoost
  Using cached catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
Installing collected packages: CatBoost
Successfully installed CatBoost-1.2.5
Fitting 2 folds for each of 5 candidates, totalling 10 fits
