In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay
from scikitplot.metrics import plot_roc
from sklearn.preprocessing import LabelEncoder



In [4]:
# Load the train and test sets (these CSV files are the ALREADY ONE HOT ENCODED versions)
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('df_train_mod2.csv', 'df_test_mod2.csv')
)

In [12]:
#se vi dovessero servire NON ONE-HOT-ENCODED ho i file già pronti, basta che me lo dite

In [5]:
# Pull out the target variable (here: emotion) for both splits
y_train, y_test = df_train['emotion'], df_test['emotion']

In [6]:
# Remove emotion from the feature frames, since it has already been saved in y - IMPORTANT!
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df_train = df_train.drop(columns=['emotion'], errors='ignore')
df_test = df_test.drop(columns=['emotion'], errors='ignore')

In [7]:
# Display df_train - at this point it no longer contains the target variable emotion
df_train

Unnamed: 0,frame_count,sum,std,q25,q50,q75,kur,skew,lag1_sum,lag1_std,...,stft_min_w4,stft_q75_w4,stft_q95_w4,stft_kur_w4,stft_skew_w4,vocal_channel,emotional_intensity,statement,repetition,sex
0,158558,0.145081,0.004001,-0.000031,0.0,0.000000,15.028520,0.904033,0.000000,0.000511,...,0.000000,0.896606,1.0,6.742190,-1.706215,1,0,1,0,1
1,160160,0.114319,0.004283,-0.000031,0.0,0.000000,16.488415,1.001578,0.000000,0.000540,...,0.270133,0.874713,1.0,-0.700420,-0.201495,1,0,1,1,1
2,156956,0.149963,0.005084,0.000000,0.0,0.000000,17.035218,1.269509,-0.000061,0.000873,...,0.000000,0.899156,1.0,1.688986,-1.024773,1,0,0,0,1
3,152152,0.139618,0.004886,-0.000031,0.0,0.000000,21.824521,1.799676,-0.000031,0.000907,...,0.205616,0.886474,1.0,-0.594111,-0.412871,1,0,0,1,1
4,169769,0.137665,0.002956,-0.000031,0.0,0.000000,13.236022,0.619367,-0.000031,0.000397,...,0.177847,0.878014,1.0,0.126535,-0.620782,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1797,168168,-0.154480,0.005399,0.000000,0.0,0.000031,6.504521,-0.437772,0.000000,0.001333,...,0.000000,0.893268,1.0,0.262900,-1.051814,1,0,0,1,0
1798,174575,-0.139679,0.008788,-0.000031,0.0,0.000031,9.706586,-0.079604,0.000000,0.002307,...,0.000000,0.900089,1.0,-0.857759,-0.608583,1,1,1,0,0
1799,169770,-0.210968,0.009933,0.000000,0.0,0.000031,5.874582,-0.026582,0.000000,0.001970,...,0.000000,0.891486,1.0,-0.656632,-0.586954,1,1,1,1,0
1800,185786,-0.085876,0.012221,-0.000031,0.0,0.000092,4.482247,-0.106998,0.000000,0.004435,...,0.000000,0.869563,1.0,-0.499022,-0.849045,1,1,0,0,0


In [9]:
# Standardize the features with StandardScaler.
# Only the numeric columns are scaled; the encoded categorical columns (which are
# also binary) are left untouched. Those categorical columns are the last
# N_CATEGORICAL_COLS columns of both dataframes.
N_CATEGORICAL_COLS = 5

# Extract the columns to be scaled (everything except the trailing categorical columns)
columns_to_scale_train = df_train.iloc[:, :-N_CATEGORICAL_COLS]
columns_to_scale_test = df_test.iloc[:, :-N_CATEGORICAL_COLS]

# Extract the columns to be kept as they are
columns_to_keep_train = df_train.iloc[:, -N_CATEGORICAL_COLS:]
columns_to_keep_test = df_test.iloc[:, -N_CATEGORICAL_COLS:]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit on the train columns only, then apply the same transformation to both splits
scaler.fit(columns_to_scale_train)
train_scaled = scaler.transform(columns_to_scale_train)
test_scaled = scaler.transform(columns_to_scale_test)

# Concatenate the scaled columns and the columns to be kept.
# The scaled arrays are wrapped with the ORIGINAL index: pd.concat(axis=1) aligns rows
# on the index, so a fresh default index would misalign rows (and introduce NaNs)
# whenever the source frames do not carry a plain 0..n-1 index.
df_train = pd.concat(
    [
        pd.DataFrame(
            train_scaled,
            columns=columns_to_scale_train.columns,
            index=columns_to_scale_train.index,
        ),
        columns_to_keep_train,
    ],
    axis=1,
)
df_test = pd.concat(
    [
        pd.DataFrame(
            test_scaled,
            columns=columns_to_scale_test.columns,
            index=columns_to_scale_test.index,
        ),
        columns_to_keep_test,
    ],
    axis=1,
)

In [11]:
# Keep the raw value arrays of both feature frames in x_train and x_test
x_train, x_test = (frame.values for frame in (df_train, df_test))

<h5>Abbiamo così: <br>
- x_train con i valori del train normalizzati e one-hot-encoded <br>
- x_test con i valori del test normalizzati e one-hot-encoded <br>
- y_train con la variabile target emotion del train (codificata come interi 0–7, label encoding) <br>
- y_test con la variabile target emotion del test (codificata come interi 0–7, label encoding) <br>
</h5>



Le mappature di codifica (label encoding) che stiamo usando, se dovessero servire, sono: 
    
    EMOTION:
    0: 'angry', 1: 'calm', 2: 'disgust', 3: 'fearful', 4: 'happy', 5: 'neutral', 6: 'sad', 7: 'surprised'

    VOCAL CHANNEL:
    song:0, speech:1
    SEX:
    Female:0, Male:1
    EMOTIONAL INTENSITY:
    normal:0, strong:1
    STATEMENT:
    Dogs are sitting by the door:0, Kids are talking by the door:1
    REPETITION:
    1st:0, 2nd:1


