# 1. Set up the environment

Load libraries for data analysis

In [None]:
import pandas as pd
from functions import *
import warnings
warnings.filterwarnings("ignore")

Load data

In [None]:
# Path of the Excel workbook holding the real dataset (relative to the working directory).
posizione_dati_reali = "data/mds_real_data_nan.xlsx"

# Load the workbook into a DataFrame (pd.read_excel reads the first sheet by default).
real_data = pd.read_excel(posizione_dati_reali)

In [None]:
# Preview the first rows of the loaded data (DataFrame.head defaults to 5 rows).
real_data.head()

# 2. Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the raw dataset.
real_data.shape

In [None]:
# Total number of cells in the raw dataset (rows * columns).
real_data.shape[0]*real_data.shape[1]

In [None]:
# Total number of missing (null) cells across the whole dataset.
real_data.isnull().values.sum()

In [None]:
# Helper not defined in this file (presumably provided by `functions`);
# by its name it charts the missing values of the raw data — confirm.
plot_missing_data_histogram(real_data)

In [None]:
# Keep only the rows that have no missing value in any column
# (DataFrame.dropna defaults: axis=0, how="any").
real_data_cleaned = real_data.dropna()

In [None]:
# Same helper applied to the cleaned data, which should contain no missing values.
plot_missing_data_histogram(real_data_cleaned)

In [None]:
# (rows, columns) left after dropping incomplete rows.
real_data_cleaned.shape

In [None]:
# Helper not defined in this file; presumably compares the row counts of the
# raw and cleaned DataFrames — confirm against its definition.
plot_dataframe_rows_histogram(real_data, real_data_cleaned)

# 3. Build the classifier

### Data preparation

Import libraries

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Normalise the risk-group labels to their display form.
# Series.replace only touches the listed values, so labels that are already
# normalised (e.g. when this cell is re-run) or unexpected are left as they
# are; Series.map would silently turn every unlisted value into NaN.
real_data_cleaned.loc[:, "IPSSR Risk Group"] = real_data_cleaned.loc[:, "IPSSR Risk Group"].replace({
    "Very-High": "Very High",
    "Int": "Intermediate",
    "Very-Low": "Very Low",
})

In [None]:
# Work on the cleaned data under a shorter name (an alias, not a copy).
df_real_data = real_data_cleaned
# Column names as a plain Python list.
df_real_data_col = [*df_real_data.columns]

# Number of columns in the cleaned dataset.
print(len(df_real_data_col))

In [None]:
# Name of the column used as the classification target.
label_column = 'IPSSR Risk Group'
# Remove the first column name from the list of columns.
# NOTE(review): the popped value is discarded and re-running this cell removes
# one more entry each time — confirm this is intended.
df_real_data_col.pop(0)

# Number of distinct labels, then the labels themselves.
print(df_real_data[label_column].unique().shape)
print(df_real_data[label_column].unique())

In [None]:
# Columns used as model inputs (selected from the DataFrame to build the feature matrix).
# Presumably cytogenetic-abnormality flags followed by gene-mutation flags — confirm with the data dictionary.
cols_to_leave = ['del5q', 'Lossofchr5ordel5qPLUSother',
       'Gainofchr8', 'Lossofchr9ordel9q', 'Lossofchr11ordel11q',
       'Lossofchr12ordel12port12p', 'Lossofchr13ordel13q', 'Isochr17qort17p',
       'Lossofchr20ordel20q', 'LossofchrY', 'idicXq13', 'ASXL1', 'ATRX',
       'BCOR', 'BCORL1', 'BRAF', 'CBL', 'CEBPA', 'DNMT3A', 'ETV6',
       'EZH2', 'FLT3', 'GATA2', 'GNAS', 'GNB1', 'IDH1', 'IDH2',
       'JAK2', 'KIT', 'KRAS', 'MPL', 'NF1', 'NOTCH1', 'NPM1', 'NRAS', 'PHF6',
       'PTPN11', 'RAD21', 'RUNX1', 'SF3B1', 'SMC1A', 'SMC3',
       'SRSF2', 'STAG2', 'TET2', 'TP53', 'U2AF1', 'WT1', 'ZRSR2',
       'CSF3R', 'SETBP1', 'PPM1D']

## One Hot Encoding


Il One Hot Encoding è una tecnica utilizzata nell'ambito del machine learning e dell'analisi dei dati per convertire variabili categoriche in una forma che possa essere fornita agli algoritmi di machine learning.

### Come funziona

Supponiamo di avere una variabile categorica che può assumere uno qualsiasi di diversi valori. Ad esempio, consideriamo una variabile `Colore` con tre possibili valori: Rosso, Blu e Verde. Utilizzando il One Hot Encoding, trasformiamo questa variabile in tre nuove variabili binarie:

- `Colore_Rosso`: 1 se il colore è Rosso, altrimenti 0
- `Colore_Blu`: 1 se il colore è Blu, altrimenti 0
- `Colore_Verde`: 1 se il colore è Verde, altrimenti 0

Queste nuove variabili binarie sono chiamate "dummy variables" o "indicator variables".

### Esempio

Supponiamo di avere un DataFrame con una colonna `Colore`:

| Colore |
|--------|
| Rosso  |
| Blu    |
| Verde  |
| Rosso  |

Applicando il One Hot Encoding, il DataFrame sarà trasformato in:

| Colore_Rosso | Colore_Blu | Colore_Verde |
|--------------|------------|--------------|
| 1            | 0          | 0            |
| 0            | 1          | 0            |
| 0            | 0          | 1            |
| 1            | 0          | 0            |

Ogni riga rappresenta una singola osservazione e le colonne rappresentano i diversi valori che può assumere la variabile `Colore`.

In [None]:
# Import the encoder explicitly instead of relying on `sklearn.preprocessing`
# having been loaded as a side effect of another scikit-learn import.
from sklearn.preprocessing import OneHotEncoder

# Feature matrix: the selected input columns as a plain NumPy array.
X_train_data = df_real_data[cols_to_leave].values
# Keep the label as a one-column DataFrame: the encoder expects 2-D input.
Y_train_data = df_real_data[[label_column]]

ohe = OneHotEncoder()
# fit_transform returns a sparse matrix by default; convert it to a dense array.
Y_train_data = ohe.fit_transform(Y_train_data).toarray()

In [None]:
# Display the input (feature) columns.
df_real_data[cols_to_leave]

In [None]:
# Display the label column as a one-column DataFrame.
df_real_data[[label_column]]

In [None]:
# Names of the one-hot output columns, in the order they appear in the encoded array.
list(ohe.get_feature_names_out())

In [None]:
# First labels in their original (string) form, for comparison with the encoded rows.
df_real_data[label_column].head()

In [None]:
# First rows of the one-hot encoded labels.
pd.DataFrame(Y_train_data).head()

## Split into training and test datasets

In [None]:
# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train_data,
    Y_train_data,
    test_size=0.30,
    random_state=21,
)

In [None]:
# Helper not defined in this file; presumably visualises the sizes of the four splits — confirm.
plot_dataset_shapes(X_train, X_test, Y_train, Y_test)

## Create and train the model

In [None]:
import tensorflow as tf

In [None]:
# Network dimensions taken from the data: one input per feature column and
# one output per one-hot label column.
vector_length, num_classes = X_train.shape[1], Y_train.shape[1]
input_shape = (vector_length,)

In [None]:
def build_model():
    """Return an uncompiled feed-forward classifier.

    The network has three ReLU hidden layers (100, 50 and 10 units), each
    followed by 50% dropout, and a softmax output layer with ``num_classes``
    units. ``input_shape`` and ``num_classes`` are read from the notebook's
    global variables when the function is called.
    """
    layers = tf.keras.layers
    return tf.keras.models.Sequential([
        layers.Dense(100, input_shape=input_shape, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(50, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(10, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ])

### Training

In [None]:
# Build a fresh network and configure it for multi-class classification on one-hot labels.
model = build_model()
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train silently, holding out 20% of the training samples for validation.
# PlotAccuracy is not defined in this file (presumably a plotting callback from `functions`).
history = model.fit(
    X_train,
    Y_train,
    epochs=500,
    batch_size=50,
    verbose=0,
    validation_split=0.2,
    callbacks=[PlotAccuracy()],
)

### Testing

In [None]:
# Evaluate on the held-out test set; with the compile settings used here the
# result is [loss, accuracy].
test_results = model.evaluate(X_test, Y_test, verbose=1)
# Keras reports accuracy as a fraction in [0, 1]; scale it so the "%" suffix is correct.
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1] * 100:.2f}%')

In [None]:
# Run predictions on the test set and compute the accuracy of each class.
# numpy is imported here because the visible notebook never imports it
# explicitly (it was only reachable through `from functions import *`).
import numpy as np
from sklearn.metrics import confusion_matrix

predictions = model.predict(X_test)
# Turn class probabilities and one-hot rows into class indices.
predicted_classes = np.argmax(predictions, axis=1)
y_test_single = np.argmax(Y_test, axis=1)
conf_matrix = confusion_matrix(y_test_single, predicted_classes)
# Rows are true classes, so diagonal / row sum is the share of each class predicted correctly.
class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) * 100

# NOTE(review): name_list is not defined in this file (presumably from `functions`);
# only the class at index 4 is reported — confirm that is intended.
print(f'{name_list[4]}: {class_accuracy[4]:.2f}%')

# 4. Add synthetic data to enhance model performance

In [None]:
# Path of the CSV file holding the synthetic dataset (relative to the working directory).
posizione_dati_sintetici = "data/synth_data__.csv"
# Load the synthetic data into a DataFrame.
synth_data = pd.read_csv(posizione_dati_sintetici)

In [None]:
# (rows, columns) of the synthetic dataset.
synth_data.shape

In [None]:
# Helper not defined in this file; by its name it charts missing values in the synthetic data — confirm.
plot_missing_data_histogram(synth_data)

In [None]:
# NOTE(review): despite the name, df_hybrid_data is just an alias of synth_data;
# no real rows are combined here — confirm whether the file already contains them.
df_hybrid_data = synth_data
# Same feature columns and label column as for the real data.
X_train_data = df_hybrid_data[cols_to_leave].values
Y_train_data = df_hybrid_data[[label_column]]

# The encoder is re-fitted on these labels, so the output columns follow the
# label values present in this dataset.
Y_train_data = ohe.fit_transform(Y_train_data).toarray()

In [None]:
# Same 70/30 split and seed as for the real data.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train_data,
    Y_train_data,
    test_size=0.30,
    random_state=21,
)

In [None]:
# Helper not defined in this file; presumably visualises the sizes of the four splits — confirm.
plot_dataset_shapes(X_train, X_test, Y_train, Y_test)

In [None]:
# Recompute the network dimensions from the new training arrays.
vector_length, num_classes = X_train.shape[1], Y_train.shape[1]
input_shape = (vector_length,)

In [None]:
def build_model():
    """Return an uncompiled feed-forward classifier.

    The architecture is three ReLU hidden layers (100, 50 and 10 units), each
    followed by 50% dropout, then a softmax output layer with ``num_classes``
    units. ``input_shape`` and ``num_classes`` are read from the notebook's
    global variables when the function is called.
    """
    layers = tf.keras.layers
    return tf.keras.models.Sequential([
        layers.Dense(100, input_shape=input_shape, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(50, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(10, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ])

In [None]:
# Build a fresh network and configure it for multi-class classification on one-hot labels.
model = build_model()
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train silently, holding out 20% of the training samples for validation.
# PlotAccuracy is not defined in this file (presumably a plotting callback from `functions`).
history = model.fit(
    X_train,
    Y_train,
    epochs=400,
    batch_size=100,
    verbose=0,
    validation_split=0.2,
    callbacks=[PlotAccuracy()],
)

In [None]:
# Evaluate on the held-out test set; with the compile settings used here the
# result is [loss, accuracy].
test_results = model.evaluate(X_test, Y_test, verbose=1)
# Keras reports accuracy as a fraction in [0, 1]; scale it so the "%" suffix is correct.
print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1] * 100:.2f}%')

In [None]:
# Run predictions on the test set and compute the accuracy of each class.
# np and confusion_matrix are not imported in this cell; it relies on names
# already present in the notebook session.
predictions = model.predict(X_test)
# Turn class probabilities and one-hot rows into class indices.
predicted_classes = np.argmax(predictions, axis=1)
y_test_single = np.argmax(Y_test, axis=1)
conf_matrix = confusion_matrix(y_test_single, predicted_classes)
# Diagonal / row sum, as a percentage, for each row of the confusion matrix.
class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) * 100

# NOTE(review): name_list is not defined in this file; only index 4 is reported — confirm intent.
print(f'{name_list[4]}: {class_accuracy[4]:.2f}%')