# Progetto Analisi Dati (2021/22)
Giuseppe Bruno (579265)

In [None]:
# Download link for the test dataset: paste the Google Drive share URL here.
# The cells below read this variable to fetch the test CSV.
link_test = ""

Nel seguito vengono riportati i due migliori modelli ottenuti dall'analisi. Il primo è un SVC, il secondo un "ibrido" FCN+SVC.

(I link presenti nel resto del codice sono diretti ai file dei modelli già allenati, così da non dover ripetere la procedura, che richiederebbe almeno 4 ore. Per completezza è comunque riportato in fondo il codice per effettuare nuovamente il training.)


# Modello 1 (SVC)

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
import joblib
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score, balanced_accuracy_score

# Model 1: evaluate the pre-trained SVC on the test set.
#
# The helpers are defined inside this cell so that, like the rest of the
# notebook, the cell can be run on its own.


def _drive_file_id(link):
    """Extract the file id from a Drive share link (``.../file/d/<id>/view...``).

    Raises:
        ValueError: if `link` has no usable second-to-last path segment,
            e.g. when `link_test` was left empty.
    """
    segments = link.split("/")
    if len(segments) < 2 or not segments[-2]:
        raise ValueError(
            "Invalid Google Drive link %r: expected "
            "'https://drive.google.com/file/d/<id>/view?usp=sharing' "
            "(did you set link_test?)" % (link,)
        )
    return segments[-2]


def _download_from_drive(drive, link, filename):
    """Save the Drive file referenced by `link` to the local path `filename`."""
    drive.CreateFile({'id': _drive_file_id(link)}).GetContentFile(filename)


def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # come from a source you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Authentication: authorize the Colab user and build a Drive client
print("1) Autenticazione")
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the test data
print("2) Download dei dati")
_download_from_drive(drive, link_test, 'data.csv')
df = pd.read_csv('data.csv')

# Download the pre-trained classifier and the scaler
print("3) Download del modello")
_download_from_drive(
    drive,
    "https://drive.google.com/file/d/11h8P8L4tpm9ztEiOMbYzJWvAoXofIhWx/view?usp=sharing",
    'svc.sav',
)
clf = _load_pickle("svc.sav")

_download_from_drive(
    drive,
    'https://drive.google.com/file/d/11fRpMfM9bcxvVIFB9hql35OYGutuH_tW/view?usp=sharing',
    'scaler.sav',
)
scaler = _load_pickle("scaler.sav")

# Split into target (column "V1") and features (every other column)
Y = df["V1"].values
X = df.drop(["V1"], axis=1).values

# Standardize X: the scaler is applied to all values flattened into a single
# column, then the original (instances, timesteps) shape is restored.
n_instances, n_timesteps = X.shape
X = scaler.transform(X.reshape(-1, 1)).reshape(n_instances, n_timesteps)

print("4) Predizione (in fase di sperimentazione ha richiesto circa 10 minuti)")
Y_pred = clf.predict(X)

print("5) Valutazione:")
print(" - Accuracy: ", accuracy_score(Y, Y_pred))
print(" - K: ", cohen_kappa_score(Y, Y_pred))
print(" - Balanced Accuracy: ", balanced_accuracy_score(Y, Y_pred))



# Modello 2 (FCN+SVC)

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
import joblib
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score, balanced_accuracy_score

from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam


# Model 2: evaluate the pre-trained FCN + SVC ensemble on the test set.
#
# The helpers are defined inside this cell so that, like the rest of the
# notebook, the cell can be run on its own.


def _drive_file_id(link):
    """Extract the file id from a Drive share link (``.../file/d/<id>/view...``).

    Raises:
        ValueError: if `link` has no usable second-to-last path segment,
            e.g. when `link_test` was left empty.
    """
    segments = link.split("/")
    if len(segments) < 2 or not segments[-2]:
        raise ValueError(
            "Invalid Google Drive link %r: expected "
            "'https://drive.google.com/file/d/<id>/view?usp=sharing' "
            "(did you set link_test?)" % (link,)
        )
    return segments[-2]


def _download_from_drive(drive, link, filename):
    """Save the Drive file referenced by `link` to the local path `filename`."""
    drive.CreateFile({'id': _drive_file_id(link)}).GetContentFile(filename)


def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # come from a source you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Weight applied to the FCN probabilities when they are summed with the SVC
# probabilities (the SVC side has weight 1).
FCN_WEIGHT = 0.9

# Authentication: authorize the Colab user and build a Drive client
print("1) Autenticazione")
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the test data
print("2) Download dei dati")
_download_from_drive(drive, link_test, 'data.csv')
df = pd.read_csv('data.csv')

# Download the pre-trained models and preprocessing objects
print("3) Importazione dei modelli")

# SVC
_download_from_drive(
    drive,
    "https://drive.google.com/file/d/14SVESXejr8_wad3qVtEV_8agawrAvbtz/view?usp=sharing",
    'svc_prob_all.sav',
)
model_svc = _load_pickle("svc_prob_all.sav")

# Scaler
_download_from_drive(
    drive,
    'https://drive.google.com/file/d/11fRpMfM9bcxvVIFB9hql35OYGutuH_tW/view?usp=sharing',
    'scaler.sav',
)
scaler = _load_pickle("scaler.sav")

# Label encoder
_download_from_drive(
    drive,
    "https://drive.google.com/file/d/1ESp3Tdgb_SmBbvJkKp7Mc5W__N4TWuWr/view?usp=sharing",
    'encoder.sav',
)
encoder = _load_pickle("encoder.sav")

# FCN weights
# Alternative weights link left by the author (currently unused):
# https://drive.google.com/file/d/11iDyvmqKYw7t2ZCecHuEFlfRJKV5i4RE/view?usp=sharing
_download_from_drive(
    drive,
    'https://drive.google.com/file/d/1GXMDuU13cPiHht5rf1_tTEQhTiCEYKaW/view?usp=sharing',
    'mdl_wts_8542.hdf5',
)

# Rebuild the network with the same layer stack used in the training cell so
# that the downloaded weights can be loaded into it.
model_fcn = Sequential()
model_fcn.add(Conv1D(filters=256, kernel_size=24, activation='relu', input_shape=(46, 1)))
model_fcn.add(Conv1D(filters=512, kernel_size=12, activation='relu'))
model_fcn.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model_fcn.add(Dropout(0.5))
model_fcn.add(MaxPooling1D(1))
model_fcn.add(Flatten())
model_fcn.add(Dense(1024, activation='relu'))
model_fcn.add(Dense(18, activation='softmax'))
# `decay=0.0` (no decay, the legacy default) is omitted: it is a no-op and
# newer Keras optimizers reject the argument.
optimizer = Adam(learning_rate=0.001)
model_fcn.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model_fcn.load_weights("mdl_wts_8542.hdf5")


print("4) Preprocessing")
# Split into target (column "V1") and features (every other column)
Y = df["V1"].values
X = df.drop(["V1"], axis=1).values

# Standardize X: the scaler is applied to all values flattened into a single
# column, then the original (instances, timesteps) shape is restored.
n_instances, n_timesteps = X.shape
X = scaler.transform(X.reshape(-1, 1)).reshape(n_instances, n_timesteps)

# The one-hot encoding of Y that used to be computed here was never used:
# the metrics below compare the raw labels with the decoded predictions.


print("5) Predizione (in fase di sperimentazione ha richiesto circa 12 minuti)")
Y_svc = model_svc.predict_proba(X)
Y_fcn = model_fcn.predict(X)
# Weighted sum of the two probability matrices, argmax per row, then decode
# the class index back to the original label.
# NOTE(review): assumes both models order their class columns like
# encoder.classes_ -- confirm.
Y_pred = encoder.inverse_transform(np.argmax(Y_svc + FCN_WEIGHT * Y_fcn, axis=1))


print("6) Valutazione:")
print(" - Accuracy: ", accuracy_score(Y, Y_pred))
print(" - K: ", cohen_kappa_score(Y, Y_pred))
print(" - Balanced Accuracy: ", balanced_accuracy_score(Y, Y_pred))

# Training

Primo modello (circa 40 minuti)

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score, balanced_accuracy_score

# Training of model 1 (SVC) followed by evaluation on the test set.
#
# The helpers are defined inside this cell so that, like the rest of the
# notebook, the cell can be run on its own.


def _drive_file_id(link):
    """Extract the file id from a Drive share link (``.../file/d/<id>/view...``).

    Raises:
        ValueError: if `link` has no usable second-to-last path segment,
            e.g. when `link_test` was left empty.
    """
    segments = link.split("/")
    if len(segments) < 2 or not segments[-2]:
        raise ValueError(
            "Invalid Google Drive link %r: expected "
            "'https://drive.google.com/file/d/<id>/view?usp=sharing' "
            "(did you set link_test?)" % (link,)
        )
    return segments[-2]


def _download_csv(drive, link, filename):
    """Download the Drive file behind `link` to `filename` and read it as CSV."""
    drive.CreateFile({'id': _drive_file_id(link)}).GetContentFile(filename)
    return pd.read_csv(filename)


def _split_xy(frame):
    """Return (features, target): target is column "V1", features are the rest."""
    return frame.drop(["V1"], axis=1).values, frame["V1"].values


def _scale_flat(values, transform):
    """Apply `transform` to `values` flattened to one column; keep the 2-D shape."""
    n_instances, n_timesteps = values.shape
    return transform(values.reshape(-1, 1)).reshape(n_instances, n_timesteps)


# Authentication: authorize the Colab user and build a Drive client
print("Autenticazione")
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

print("Download del training set")
df_train = _download_csv(
    drive,
    'https://drive.google.com/file/d/1iwVcXgCJMUARL0pmdASFRAYTC8vJTHZF/view?usp=sharing',
    'train.csv',
)

print("Download del test set")
df_test = _download_csv(drive, link_test, 'test.csv')

# Split both sets into features and target
X_train, Y_train = _split_xy(df_train)
X_test, Y_test = _split_xy(df_test)

# Standardization: the scaler is fitted on the flattened training values only
# and then reused, unchanged, on the test values.
scaler = StandardScaler()
X_train = _scale_flat(X_train, scaler.fit_transform)
X_test = _scale_flat(X_test, scaler.transform)

# Training (SVC and the metric functions come from the cell's import block)
print("Training")
clf_svc = SVC(C=3, gamma=0.2, kernel='rbf')
clf_svc.fit(X_train, Y_train)
print("Training concluso")

# Prediction on the test set
print("Predizione sul test set")
Y_pred = clf_svc.predict(X_test)


print("Valutazione:")
print(" - Accuracy: ", accuracy_score(Y_test, Y_pred))
print(" - K: ", cohen_kappa_score(Y_test, Y_pred))
print(" - Balanced Accuracy: ", balanced_accuracy_score(Y_test, Y_pred))

Secondo modello (circa 4 ore). Per il training della FCN è necessario attivare la GPU.

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, cohen_kappa_score, balanced_accuracy_score

# Training of model 2 (SVC with probabilities + FCN) followed by evaluation
# of the summed-probability ensemble on the test set.
#
# Keras-related imports needed by this cell only (the cell's import block
# above does not provide them).
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout, Dense
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint


# The helpers are defined inside this cell so that, like the rest of the
# notebook, the cell can be run on its own.

def _drive_file_id(link):
    """Extract the file id from a Drive share link (``.../file/d/<id>/view...``).

    Raises:
        ValueError: if `link` has no usable second-to-last path segment,
            e.g. when `link_test` was left empty.
    """
    segments = link.split("/")
    if len(segments) < 2 or not segments[-2]:
        raise ValueError(
            "Invalid Google Drive link %r: expected "
            "'https://drive.google.com/file/d/<id>/view?usp=sharing' "
            "(did you set link_test?)" % (link,)
        )
    return segments[-2]


def _download_csv(drive, link, filename):
    """Download the Drive file behind `link` to `filename` and read it as CSV."""
    drive.CreateFile({'id': _drive_file_id(link)}).GetContentFile(filename)
    return pd.read_csv(filename)


def _split_xy(frame):
    """Return (features, target): target is column "V1", features are the rest."""
    return frame.drop(["V1"], axis=1).values, frame["V1"].values


def _scale_flat(values, transform):
    """Apply `transform` to `values` flattened to one column; keep the 2-D shape."""
    n_instances, n_timesteps = values.shape
    return transform(values.reshape(-1, 1)).reshape(n_instances, n_timesteps)


# File where the checkpoint callback stores the best FCN seen during training
FCN_CHECKPOINT = 'mdl_wts_full.hdf5'

# Authentication: authorize the Colab user and build a Drive client
print("Autenticazione")
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

print("Download del training set")
df_train = _download_csv(
    drive,
    'https://drive.google.com/file/d/1iwVcXgCJMUARL0pmdASFRAYTC8vJTHZF/view?usp=sharing',
    'train.csv',
)

print("Download del test set")
df_test = _download_csv(drive, link_test, 'test.csv')

# Split both sets into features and target
X_train, Y_train = _split_xy(df_train)
X_test, Y_test = _split_xy(df_test)

# Standardization: the scaler is fitted on the flattened training values only
# and then reused, unchanged, on the test values.
scaler = StandardScaler()
X_train = _scale_flat(X_train, scaler.fit_transform)
X_test = _scale_flat(X_test, scaler.transform)

# Training of the SVC; probability=True is required for predict_proba below
print("Training del SVC")
clf_svc = SVC(C=3, gamma=0.2, kernel='rbf', probability=True)
clf_svc.fit(X_train, Y_train)
print("Training del SVC concluso")


# Training of the FCN
print("Training della FCN")

model_fcn = Sequential()
model_fcn.add(Conv1D(filters=256, kernel_size=24, activation='relu', input_shape=(46, 1)))
model_fcn.add(Conv1D(filters=512, kernel_size=12, activation='relu'))
model_fcn.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model_fcn.add(Dropout(0.5))
model_fcn.add(MaxPooling1D(1))
model_fcn.add(Flatten())
model_fcn.add(Dense(1024, activation='relu'))
model_fcn.add(Dense(18, activation='softmax'))
# `decay=0.0` (no decay, the legacy default) is omitted: it is a no-op and
# newer Keras optimizers reject the argument.
optimizer = Adam(learning_rate=0.001)
model_fcn.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Encode the training labels as integers, then one-hot vectors for the
# categorical cross-entropy loss. The test labels are not encoded: the
# metrics below compare raw labels with decoded predictions.
encoder = LabelEncoder()
encoder.fit(Y_train)
Y_train_enc = np_utils.to_categorical(encoder.transform(Y_train))

# Hold out 10% of the training data for validation, stratified on the label
X_train2, X_val, Y_train2, Y_val = train_test_split(
    X_train, Y_train_enc, test_size=0.1, stratify=Y_train, random_state=0
)

earlyStopping = EarlyStopping(monitor='val_accuracy', patience=15, verbose=0, mode='max')
mcp_save = ModelCheckpoint(FCN_CHECKPOINT, save_best_only=True, monitor='val_accuracy', mode='max')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=8, verbose=1, min_delta=1e-4, mode='max')
# The checkpoint callback must be passed to fit(); it was previously created
# but never used, so the best epoch was never saved.
model_fcn.fit(
    X_train2,
    Y_train2,
    batch_size=1024,
    epochs=150,
    validation_data=(X_val, Y_val),
    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
)

# Early stopping leaves the model at its last epoch, not its best one:
# reload the weights of the epoch with the highest val_accuracy.
model_fcn.load_weights(FCN_CHECKPOINT)


# Prediction on the test set
print("Predizione sul test set")
# Use the SVC trained in this cell. The previous code referenced `model_svc`,
# a name this cell never defines.
Y_svc = clf_svc.predict_proba(X_test)
Y_fcn = model_fcn.predict(X_test)
# NOTE(review): the two probability matrices are summed with equal weight
# here, while the evaluation cell for model 2 weights the FCN by 0.9 --
# confirm which weighting is intended.
Y_pred = encoder.inverse_transform(np.argmax(Y_svc + Y_fcn, axis=1))


print("Valutazione:")
print(" - Accuracy: ", accuracy_score(Y_test, Y_pred))
print(" - K: ", cohen_kappa_score(Y_test, Y_pred))
print(" - Balanced Accuracy: ", balanced_accuracy_score(Y_test, Y_pred))
