In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import pickle
import os 



from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import time



U PPGBP datasetu nema kolona sa null vrednostima, osim kolona ['ppg_fft_peaks_0', 'ppg_fft_peaks_heights_0', 'ppg_fft_peaks_neighbor_avgs_0'], u kojima su sve vrednosti null. Ovaj set podataka sadrži oznaku pacijenta i oznaku merenja; pretpostavljam da SP i DP označavaju sistolni (gornji) i dijastolni (donji) krvni pritisak.

In [None]:
def read_data(file_name):
    '''Load the feature folds and the feature-importance rankings of one dataset.

    Parameters
    ----------
    file_name : str
        Directory that holds the fold CSV files (every entry whose name starts
        with ``'feat'`` is read). The same string selects the sub-directory
        ``bp-benchmark-main/feat_importance/<file_name>`` that holds the
        pickled rankings.

    Returns
    -------
    loaded_files : list of pandas.DataFrame
        One frame per fold, in sorted file-name order.
    feature_importances : list
        ``[dp_feature_importance, sp_feature_importance]`` as unpickled from
        ``featImportance-DP.pkl`` and ``featImportance-SP.pkl``.
    '''
    # os.listdir returns entries in arbitrary order; sorting keeps the fold
    # order (and therefore the CV split numbering) stable between runs.
    fold_names = sorted(name for name in os.listdir(file_name) if name.startswith('feat'))
    loaded_files = [pd.read_csv(os.path.join(file_name, name)) for name in fold_names]

    # Dataset-specific NaN handling: drop incomplete columns for PPG-BP and
    # incomplete rows for UCI2. Done without inplace=True so each fold is
    # replaced by a fresh frame instead of being mutated.
    if file_name == 'ppgbp_dataset':
        loaded_files = [fold.dropna(axis=1) for fold in loaded_files]
    elif file_name == 'uci2_dataset':
        loaded_files = [fold.dropna(axis=0) for fold in loaded_files]

    # Build the paths with os.path.join so they work on every OS (the previous
    # hard-coded backslashes only resolved on Windows).
    importance_dir = os.path.join('bp-benchmark-main', 'feat_importance', file_name)

    feature_importances = []
    for target in ('DP', 'SP'):
        pickle_path = os.path.join(importance_dir, f'featImportance-{target}.pkl')
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(pickle_path, 'rb') as handle:
            feature_importances.append(pickle.load(handle))

    return loaded_files, feature_importances

In [None]:
def custom_folds(loaded_files):
    '''Build (train_indices, test_indices) pairs that reproduce the given folds.

    The folds are assumed to be stacked one after another in the order given,
    so fold ``k`` owns a contiguous run of row positions. Each returned pair
    holds out one fold as the test set and trains on all the others; the list
    can be passed as the ``cv`` argument of GridSearchCV.
    '''
    # Contiguous row positions owned by each fold.
    fold_positions = []
    start = 0
    for fold in loaded_files:
        stop = start + len(fold)
        fold_positions.append(list(range(start, stop)))
        start = stop

    # Leave-one-fold-out: fold k is the test set, the rest form the train set.
    splits = []
    for held_out, test_positions in enumerate(fold_positions):
        other_folds = fold_positions[:held_out] + fold_positions[held_out + 1:]
        splits.append((np.concatenate(other_folds), test_positions))
    return splits

In [None]:
def fit_models(num_features=7):
    '''Grid-search every model for diastolic (DP) and systolic (SP) pressure.

    For each target and each requested feature count N, the first N entries of
    that target's feature ranking are used as inputs and one GridSearchCV is
    fitted per model.

    The function reads these notebook-level globals, which must be defined
    before it is called:

    * ``models`` and ``param_grids`` -- parallel lists of estimators and grids.
    * ``CVs`` -- list of ``(train_indices, test_indices)`` pairs used as ``cv``.
    * ``data`` -- DataFrame with the feature columns plus ``'DP'`` and ``'SP'``.
    * ``feature_importances`` -- ``[dp_ranking, sp_ranking]``; each element must
      expose a sliceable ``.features`` attribute (presumably ordered from most
      to least important -- verify against the pickled rankings).

    Parameters
    ----------
    num_features : int or iterable of int, default 7
        Feature count(s) to evaluate. A single integer is treated as a
        one-element list.

    Returns
    -------
    list of pandas.DataFrame
        ``[dp_results, sp_results]``. Each frame has one row per
        (feature count, model) with the columns ``Model``, ``Parameters``,
        ``Number of features``, ``Overall score`` (cross-validated MAE of the
        best parameter set, positive, lower is better), ``ME``, ``SD`` and
        ``MASE`` (``Overall score`` divided by the MAE of always predicting
        the mean of ``y``).

    Raises
    ------
    ValueError
        If ``num_features`` is an empty collection.
    '''
    # Normalise the argument so a scalar and a collection share one code path.
    if isinstance(num_features, (int, np.integer)):
        feature_counts = [num_features]
    else:
        feature_counts = list(num_features)
    if not feature_counts:
        raise ValueError('num_features must contain at least one feature count')

    def load_grid_searches():
        """Create one fresh GridSearchCV per (model, param_grid) pair."""
        return [
            GridSearchCV(model, param_grid, cv=CVs, scoring='neg_mean_absolute_error', n_jobs=-1)
            for model, param_grid in zip(models, param_grids)
        ]

    def train_grid_searches(grid_searches, X, y, num_feature):
        """Fit every grid search on (X, y), display its CV table and collect a summary row."""
        results = []
        for grid_search in grid_searches:

            print(f'Traning {grid_search.estimator} model')
            grid_search.fit(X, y)
            # Residuals of the refitted best estimator on the data it was
            # trained on, i.e. ME/SD below are in-sample figures.
            y_pred = grid_search.best_estimator_.predict(X)
            error = y_pred - y

            print('\n')

            # With scoring='neg_mean_absolute_error' best_score_ is the NEGATED
            # MAE; flip the sign so the score and the MASE ratio are positive.
            grid_score = -grid_search.best_score_
            print('Overall score', grid_score)
            print(f'ME +- SD:{np.mean(error)} +- {np.std(error)}')
            naive_error = mean_absolute_error(y, np.full(y.shape, np.mean(y)))
            mase = grid_score / naive_error
            print('MASE score: ', mase)

            # Showing results of cross validation (raw sklearn scores, so the
            # values in this table keep sklearn's negated sign). The number of
            # split columns follows the actual number of CV splits.
            columns_of_interest = ['params', 'mean_test_score', 'std_test_score'] + [
                f'split{i}_test_score' for i in range(grid_search.n_splits_)
            ]
            display(pd.DataFrame({col: grid_search.cv_results_[col] for col in columns_of_interest}))

            print('Best params:', grid_search.best_params_)
            print('\n\n')
            results.append([grid_search.estimator, grid_search.best_params_, num_feature,
                            grid_score, np.mean(error), np.std(error), mase])
        return pd.DataFrame(results, columns=['Model', 'Parameters', 'Number of features',
                                              'Overall score', 'ME', 'SD', 'MASE'])

    def train(target='DP'):
        """Run all grid searches for one target column ('DP' or 'SP') over every feature count."""
        # Row 0 of feature_importances is the DP ranking, row 1 the SP ranking.
        feature_row = 0 if target == 'DP' else 1
        results = []
        for num_feature in feature_counts:
            print('Number of features is:', num_feature, '\n')
            features = feature_importances[feature_row].features[:num_feature]
            X = data[features]
            y = data[target]

            grid_searches = load_grid_searches()
            results.append(train_grid_searches(grid_searches, X, y, num_feature))
        return pd.concat(results)

    print('Training DP')
    dp_results = train(target='DP')

    print('Training SP')
    sp_results = train(target='SP')

    return [dp_results, sp_results]





In [None]:
# Fixed seed for the stochastic estimators (MLP weight initialisation, random
# forest bootstrapping) so a Restart & Run All reproduces the same results.
RANDOM_STATE = 42

# Estimators to compare; param_grids below must stay in the same order.
models = [SVR(),
          MLPRegressor(random_state=RANDOM_STATE),
          RandomForestRegressor(random_state=RANDOM_STATE),
          ]

svr_param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.3]
}

mlp_param_grid = {
    'hidden_layer_sizes': [(25,), (50,), (25, 25), (50, 25)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.01, 0.1],
    # NOTE(review): per the scikit-learn docs 'learning_rate' is only used when
    # solver='sgd', so it has no effect with 'adam' -- kept so that the
    # reported best_params_ stay unchanged.
    'learning_rate': ['adaptive'],
    'max_iter': [400]
}


forest_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}


# Parallel to `models`: param_grids[i] is searched for models[i].
param_grids = [svr_param_grid, mlp_param_grid, forest_param_grid]



In [None]:
# Feature-subset sizes to evaluate: for each N, fit_models trains on the first
# N entries of the feature ranking.
num_features = [4, 8, 12, 16]

In [None]:
# PPG-BP run. fit_models reads `data`, `CVs` and `feature_importances` from the
# notebook namespace, so they are (re)assigned here before the call.
loaded_files, feature_importances = read_data('ppgbp_dataset')
data = pd.concat(loaded_files, ignore_index=True)
CVs = custom_folds(loaded_files)

results_dp_ppgbp, results_sp_ppgbp = fit_models(num_features=num_features)
 

In [None]:
# Save the PPG-BP result tables as CSV; `trenutak` ("moment") is the current
# local time formatted as HH_MM and is appended to both file names.
# NOTE(review): the stamp carries no date, so runs started at the same minute
# on different days produce the same file name -- confirm this is acceptable.
trenutak = time.strftime('%H_%M')
results_dp_ppgbp.to_csv(f'results_dp_ppgbp_{trenutak}.csv')
results_sp_ppgbp.to_csv(f'results_sp_ppgbp_{trenutak}.csv')

In [None]:
# Sensors run. fit_models reads `data`, `CVs` and `feature_importances` from
# the notebook namespace, so they are (re)assigned here before the call.
loaded_files, feature_importances = read_data('sensors_dataset')
data = pd.concat(loaded_files, ignore_index=True)
CVs = custom_folds(loaded_files)

results_dp_sensor, results_sp_sensor = fit_models(num_features=num_features)
 

In [None]:
# Save the sensors result tables as CSV; `trenutak` ("moment") is the current
# local time formatted as HH_MM and is appended to both file names.
# NOTE(review): the stamp carries no date, so runs started at the same minute
# on different days produce the same file name -- confirm this is acceptable.
trenutak = time.strftime('%H_%M')
results_dp_sensor.to_csv(f'results_dp_sensor_{trenutak}.csv')
results_sp_sensor.to_csv(f'results_sp_sensor_{trenutak}.csv')

In [None]:
# BCG run. fit_models reads `data`, `CVs` and `feature_importances` from the
# notebook namespace, so they are (re)assigned here before the call.
loaded_files, feature_importances = read_data('bcg_dataset')
data = pd.concat(loaded_files, ignore_index=True)
CVs = custom_folds(loaded_files)

results_dp_bcg, results_sp_bcg = fit_models(num_features=num_features)
 

In [None]:
# Save the BCG result tables as CSV; `trenutak` ("moment") is the current
# local time formatted as HH_MM and is appended to both file names.
# NOTE(review): the stamp carries no date, so runs started at the same minute
# on different days produce the same file name -- confirm this is acceptable.
trenutak = time.strftime('%H_%M')
results_dp_bcg.to_csv(f'results_dp_bcg_{trenutak}.csv')
results_sp_bcg.to_csv(f'results_sp_bcg_{trenutak}.csv')

In [None]:
# sort_values returns a new frame and leaves results_dp_bcg untouched, so the
# sorted view has to be the cell's last expression to be displayed (previously
# the sorted result was discarded and the unsorted frame was shown).
results_dp_bcg.sort_values('Overall score')

In [None]:
# Display the SP result table of the BCG run (row order as returned by fit_models).
results_sp_bcg