# Imbalanced Learning
- Define one simple unbalanced classification tasks and solve it with Decision Tree or KNN.
- If the dataset is already unbalanced leave it as it is, otherwise turns the dataset into an imbalanced version (e.g., 96% - 4%, for binary classification).
- Then solve the classification task using the Decision Tree or KNN by adopting at least 2 techniques of imbalanced learning (Undersampling, Oversampling).

STEPS: 
- quindi sbilancio + classifico con dt e knn
- applico under e over + class
- e poi in caso applico dim red + under e over + class (tra quelle viste in cap prima)


TIPI DI BILANCIAMENTO:
- undersampling: Cluster Centroids - Edited Nearest Neighbors - Tomek Links - CondensedNearestNeighbour - RandomUnderSampler
- oversampling: RandomOverSampler - ADASYN - SMOTE
- balancing at the algo level: adjust the class weight- e meta cost sensitive classifier (NO)



# Import Libraries

In [2]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders.count import CountEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
#from scikitplot.metrics import plot_roc

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours

from scipy.stats import randint as sp_randint

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

# Load & Prep DF

In [3]:
df = pd.read_csv('Data/Processed/df_wout_outliers.csv')

In [4]:
df.shape

(88484, 24)

In [5]:
df = df.drop(columns=['energy', 'valence', 'Unnamed: 0']) #correlated with mood

In [6]:
pd.set_option('display.max_columns', None)


In [7]:
df.head()

Unnamed: 0,duration_ms,popularity,danceability,loudness,speechiness,acousticness,instrumentalness,liveness,tempo,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,popularity_artist,followers_artist,explicit,key,mode,time_signature,genre,mood
0,-0.680781,0.162759,1.978201,-0.275068,0.999975,-0.740877,-0.522897,-0.321113,-0.618882,0.433548,0.564559,0.786251,0.398323,-0.188934,-0.304401,1,8,0,4,german,Sad
1,0.940114,0.812539,-1.691805,0.208828,-0.452776,0.240034,-0.532243,-0.511836,1.878636,-1.384628,-1.058807,0.895193,1.1342,1.049266,-0.103532,0,0,1,4,club,Angry
2,2.862979,-1.461689,0.695703,-0.94157,-0.366899,-0.962302,0.029763,-0.542764,-0.081356,1.276279,0.545517,-1.767842,-1.03499,0.092476,-0.222466,0,5,0,4,minimal-techno,Angry
3,-0.386216,1.415905,0.655625,0.954063,0.919465,-0.792551,-0.532243,-0.689672,1.324332,-1.142426,0.564559,-0.609825,-0.205755,0.458307,-0.119712,0,0,1,4,hip-hop,Happy
4,1.023124,-0.533433,-0.260445,-1.00822,-0.519867,-0.962825,1.703361,-0.642765,0.25467,0.579533,0.564559,-1.933273,-1.924633,0.767857,-0.14978,0,6,0,4,minimal-techno,Sad


In [8]:
df['time_signature'].value_counts()

time_signature
4    79329
3     7579
5     1576
Name: count, dtype: int64

In [9]:
df_class = df.copy()

In [10]:
label_encoder = LabelEncoder()

enc = CountEncoder(cols=['genre'], normalize=True).fit(df_class, df_class['mood'])
df_class = enc.transform(df_class)



In [11]:
df_class = df_class.drop(columns=['mood'])

In [12]:
df_class.columns

Index(['duration_ms', 'popularity', 'danceability', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'tempo',
       'tempo_confidence', 'time_signature_confidence', 'key_confidence',
       'mode_confidence', 'popularity_artist', 'followers_artist', 'explicit',
       'key', 'mode', 'time_signature', 'genre'],
      dtype='object')

In [13]:
df_class['explicit'].value_counts()

explicit
0    80786
1     7698
Name: count, dtype: int64

# Classification

In [18]:
target = 'explicit'

attributes = [col for col in df_class.columns if col != target]

X = df_class[attributes].values
y = np.array(df_class[target])  # oppure y = df['emotion].values

X.shape, y.shape

((88484, 19), (88484,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Usando l'opzione stratify=y, garantisci che la suddivisione mantenga la stessa proporzione di classi tra set di addestramento e di test, utile quando hai un set di dati sbilanciato

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((61938, 19), (26546, 19), (61938,), (26546,))

In [20]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_train, y_train)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))


Migliori parametri trovati: {'min_samples_split': 30, 'min_samples_leaf': 40, 'max_depth': 30, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     24237
           1       0.55      0.27      0.36      2309

    accuracy                           0.92     26546
   macro avg       0.74      0.62      0.66     26546
weighted avg       0.90      0.92      0.90     26546



In [23]:

knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_train, y_train) 

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

Migliori parametri trovati: {'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     24237
           1       0.70      0.21      0.32      2309

    accuracy                           0.92     26546
   macro avg       0.82      0.60      0.64     26546
weighted avg       0.91      0.92      0.90     26546



## Undersampling

### Random

In [24]:
from collections import Counter


rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(0): 5389, np.int64(1): 5389})


In [25]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_randomunder = grid_search

Migliori parametri trovati: {'min_samples_split': 30, 'min_samples_leaf': 40, 'max_depth': 10, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.97      0.74      0.84     24237
           1       0.21      0.72      0.33      2309

    accuracy                           0.74     26546
   macro avg       0.59      0.73      0.58     26546
weighted avg       0.90      0.74      0.79     26546



In [26]:


knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_randomunder = random_search

Migliori parametri trovati: {'n_neighbors': 24, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.97      0.73      0.84     24237
           1       0.22      0.76      0.34      2309

    accuracy                           0.74     26546
   macro avg       0.59      0.75      0.59     26546
weighted avg       0.90      0.74      0.79     26546



### Tomek

In [27]:
from collections import Counter


tl = TomekLinks(sampling_strategy= 'majority')
X_res, y_res = tl.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(0): 55335, np.int64(1): 5389})


In [32]:
X_res.shape

(60724, 19)

In [28]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_tomek = grid_search

Migliori parametri trovati: {'min_samples_split': 2, 'min_samples_leaf': 30, 'max_depth': None, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     24237
           1       0.50      0.29      0.37      2309

    accuracy                           0.91     26546
   macro avg       0.72      0.63      0.66     26546
weighted avg       0.90      0.91      0.90     26546



In [33]:
knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_tomek = random_search

Migliori parametri trovati: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     24237
           1       0.71      0.22      0.33      2309

    accuracy                           0.92     26546
   macro avg       0.82      0.60      0.65     26546
weighted avg       0.91      0.92      0.91     26546



### Edited Nearest Neighbors

In [34]:
enn = EditedNearestNeighbours()
X_res, y_res = enn.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(0): 47790, np.int64(1): 5389})


In [35]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_enn = grid_search

Migliori parametri trovati: {'min_samples_split': 2, 'min_samples_leaf': 10, 'max_depth': 10, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     24237
           1       0.42      0.34      0.38      2309

    accuracy                           0.90     26546
   macro avg       0.68      0.65      0.66     26546
weighted avg       0.89      0.90      0.90     26546



In [36]:
knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_enn = random_search

Migliori parametri trovati: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.94      0.90      0.92     24237
           1       0.30      0.44      0.35      2309

    accuracy                           0.86     26546
   macro avg       0.62      0.67      0.64     26546
weighted avg       0.89      0.86      0.87     26546



## Oversampling

### Random 

In [37]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(0): 56549, np.int64(1): 56549})


In [38]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_over = grid_search

Migliori parametri trovati: {'min_samples_split': 40, 'min_samples_leaf': 5, 'max_depth': None, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.95      0.90      0.92     24237
           1       0.32      0.51      0.39      2309

    accuracy                           0.86     26546
   macro avg       0.63      0.70      0.66     26546
weighted avg       0.90      0.86      0.88     26546



In [39]:

knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_over = random_search

Migliori parametri trovati: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     24237
           1       0.34      0.32      0.33      2309

    accuracy                           0.89     26546
   macro avg       0.64      0.63      0.63     26546
weighted avg       0.88      0.89      0.89     26546



### Smote

In [40]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(0): 56549, np.int64(1): 56549})


In [41]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_smote = grid_search

Migliori parametri trovati: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'gini'}
              precision    recall  f1-score   support

           0       0.95      0.86      0.90     24237
           1       0.26      0.52      0.35      2309

    accuracy                           0.83     26546
   macro avg       0.61      0.69      0.63     26546
weighted avg       0.89      0.83      0.85     26546



In [42]:

knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_smote = random_search

Migliori parametri trovati: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.95      0.87      0.91     24237
           1       0.26      0.47      0.33      2309

    accuracy                           0.84     26546
   macro avg       0.60      0.67      0.62     26546
weighted avg       0.89      0.84      0.86     26546



### Adasyn

In [43]:
ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({np.int64(1): 57036, np.int64(0): 56549})


In [44]:
# DT con grid

dtree = DecisionTreeClassifier(random_state=42)

# Definizione dei parametri da ottimizzare
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30, 40]
}

# Configurazione della Grid Search
grid_search = RandomizedSearchCV(estimator=dtree, param_distributions=param_grid, n_iter=10,  cv=5, n_jobs=-1)

# Addestramento del modello
grid_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {grid_search.best_params_}")

# Predizioni sul set di test
y_pred = grid_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

dt_ada = grid_search

Migliori parametri trovati: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20, 'criterion': 'entropy'}
              precision    recall  f1-score   support

           0       0.95      0.83      0.89     24237
           1       0.24      0.56      0.34      2309

    accuracy                           0.81     26546
   macro avg       0.60      0.70      0.61     26546
weighted avg       0.89      0.81      0.84     26546



In [45]:

knn = KNeighborsClassifier()

# Definizione dei parametri da ottimizzare
param_dist = {
    'n_neighbors': sp_randint(1, 31),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Configurazione della Random Search
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Addestramento del modello
random_search.fit(X_res, y_res)

# Migliori parametri trovati
print(f"Migliori parametri trovati: {random_search.best_params_}")

# Predizioni sul set di test
y_pred = random_search.predict(X_test)

# Report di classificazione
print(classification_report(y_test, y_pred))

knn_ada = random_search

Migliori parametri trovati: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.95      0.87      0.90     24237
           1       0.25      0.48      0.33      2309

    accuracy                           0.83     26546
   macro avg       0.60      0.67      0.62     26546
weighted avg       0.89      0.83      0.85     26546

