In [1]:
# Mount Google Drive at /content/drive/ so that the Drive-relative paths used by the
# following cells (scripts folder, ./csv/, ./NonLinCFA/ ...) can be resolved.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
cd drive/MyDrive/Colab\ Notebooks/scripts

/content/drive/MyDrive/Colab Notebooks/scripts


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import sys
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
import os
from collections import Counter
from imblearn.over_sampling import SMOTE

In [5]:
from feature_selection import forwardFeatureSelection

from NonLinCFA import NonLinCFA
from aux_GenLinCFA import prepare_target_binary

from aux_NonLinCFA import *
import random

In [6]:
cd ..

/content/drive/MyDrive/Colab Notebooks


In [7]:
def compute_stat_mean(values):
  """Summarise a sample by its mean and the half-width 1.96 * std / sqrt(n).

  Args:
    values: non-empty sequence of numbers.

  Returns:
    Tuple ``(mean, half_width)``, both rounded to two decimals. ``std`` is
    ``np.std`` with its default settings (population standard deviation).
  """
  n_obs = len(values)
  spread = np.std(values)
  half_width = 1.96 * spread / np.sqrt(n_obs)
  centre = np.mean(values)
  return round(centre, 2), round(half_width, 2)

In [7]:
# Example: mean and 1.96*std/sqrt(n) half-width of ten scores.
compute_stat_mean([0.1, 0.26, 0.18, 0.19, 0.19, 0.23, 0.21, 0.17, 0.13, 0.13])

(0.18, 0.03)

In [9]:
import random

def compute_random_seeds(num_seeds, limit, seed=12):
  """Return a reproducible list of integer seeds.

  Args:
    num_seeds: how many seeds to draw.
    limit: inclusive upper bound; every seed lies in ``[1, limit]``.
    seed: value used to seed the generator before drawing. The default (12)
      reproduces the list this notebook has always used.

  Returns:
    List of ``num_seeds`` integers (empty when ``num_seeds`` is 0).

  Note:
    This re-seeds Python's global ``random`` module as a side effect, exactly
    as the original implementation did.
  """
  random.seed(seed)
  return [random.randint(1, limit) for _ in range(num_seeds)]

# NonLinCFA aggregations: standardized target, moving average



## Temp + Prec

In [10]:
# Twelve seeds in [1, 100]; each one is used below as `curr_seed` for one shuffled aggregation run.
randomlist = compute_random_seeds(12, 100)
randomlist

[61, 35, 85, 68, 86, 45, 19, 49, 2, 48, 62, 36]

In [15]:
# For every basin and every seed in `randomlist`:
#   1. build the target splits and run the NonLinCFA aggregation on a shuffled feature set,
#   2. run forward feature selection and keep the first five selected aggregated features,
#   3. write the aggregations and the best-5 train/val/test feature tables to CSV,
#   4. report linear-regression R2 and sign-classification accuracy on train+val and test.
basins = ['Emiliani1', 'Emiliani2', 'Garda_Mincio']

path_target = './csv/'
path_features = './features_allvalues/'
destination_folder = './NonLinCFA/temp_prec_shuffle/'
plots_folder = './NonLinCFA/for_plots/'

# Split boundaries, shared by prepare_target and aggregate_unfolded_data.
MAX_TRAIN = '2010-01-01'
MAX_VAL = '2015-01-01'
MAX_TEST = '2020-01-01'

# Rows of aggregate_trainVal with index label <= LAST_TRAIN_LABEL are written to the "train" CSV,
# the remaining ones to the "val" CSV (label-based, inclusive .loc slicing).
# NOTE(review): presumably the last row before MAX_TRAIN -- confirm if the dates above change.
LAST_TRAIN_LABEL = 410

eps = 0.001

# Temperature (tg) and precipitation (rr) columns at every aggregation window, in the order
# tg, tg_1w, ..., tg_24w, rr, rr_1w, ..., rr_24w.
AGGREGATION_SUFFIXES = ['', '_1w', '_4w', '_8w', '_12w', '_16w', '_24w']
feature_columns = ['cyclostationary_mean_' + variable + suffix
                   for variable in ('tg', 'rr')
                   for suffix in AGGREGATION_SUFFIXES]

# to_csv does not create missing folders; make sure the output locations exist.
for folder in (destination_folder, plots_folder):
  os.makedirs(folder, exist_ok=True)

for basin in basins:
  selected_colnames_CMI5 = []
  outputs = []
  for i, random_seed in enumerate(randomlist):
    print('####################' + basin + '####################')
    target_df_train, target_df_val, target_df_test, target_df_trainVal = prepare_target(
        '', max_train=MAX_TRAIN, max_val=MAX_VAL, max_test=MAX_TEST,
        path=path_target+basin+'.csv', window_size = 1)
    actual_path = path_features+basin+'_aggreg.csv'
    # A fresh copy of the column list is passed on every call, as the original list literal
    # did, in case the helper reorders it in place when shuffle=True.
    output, aggregate_trainVal, aggregate_test = aggregate_unfolded_data(
        actual_path, list(feature_columns), target_df_trainVal, eps=eps,
        max_train=MAX_TRAIN, max_val=MAX_VAL, max_test=MAX_TEST,
        curr_seed=random_seed, shuffle=True)

    # Aggregated features of this run, kept for later plotting.
    agg_trainVal_string = plots_folder + basin + "_trainVal_aggreg_" + str(i)
    agg_test_string = plots_folder + basin + "_test_aggreg_" + str(i)
    aggregate_trainVal.to_csv(agg_trainVal_string, index = False)
    aggregate_test.to_csv(agg_test_string, index = False)

    outputs.append(output)

    res = {
              "delta" : [],
              "numSelected" : [],
              "selectedFeatures" : []
          }

    res['selectedFeatures'] = forwardFeatureSelection(10,np.array(aggregate_trainVal),np.array(target_df_trainVal.mean_std),res,10,1)

    print(f"\n{res['selectedFeatures']}\n")
    selected_colnames_CMI = aggregate_trainVal.columns[res['selectedFeatures']]
    best5_colnames_CMI = selected_colnames_CMI[0:5]

    print('\nFull model and selected features with CMI\n')
    compare_methods(aggregate_trainVal, aggregate_test, target_df_trainVal, target_df_test, selected_colnames_CMI)

    print('\nFull model and best 5 selected features with CMI\n')
    compare_methods(aggregate_trainVal, aggregate_test, target_df_trainVal, target_df_test, best5_colnames_CMI)

    selected_colnames_CMI5.append(aggregate_trainVal.loc[:,best5_colnames_CMI].columns.values)

    file_prefix = destination_folder + basin + '_' + str(i) + '_nonLinCFA_best5_CMI_'
    train_string = file_prefix + 'train.csv'
    val_string = file_prefix + 'val.csv'
    test_string = file_prefix + 'test.csv'

    X_train_CMI5 = aggregate_trainVal.loc[:LAST_TRAIN_LABEL, best5_colnames_CMI]
    X_validation_CMI5 = aggregate_trainVal.loc[LAST_TRAIN_LABEL + 1:, best5_colnames_CMI]
    X_train_validation_CMI5 = pd.concat([X_train_CMI5, X_validation_CMI5])
    X_test_CMI5 = aggregate_test.loc[:, best5_colnames_CMI]

    X_train_CMI5.to_csv(train_string, index=False)
    X_validation_CMI5.to_csv(val_string, index=False)
    X_test_CMI5.to_csv(test_string, index=False)


    print('###### Linear Regression ######')

    lin_regr = LinearRegression()

    # CMI best 5
    lin_regr.fit(X_train_validation_CMI5, target_df_trainVal['mean_std'])
    print("Train R2 linear regression CMI best 5: ", round(lin_regr.score(X_train_validation_CMI5, target_df_trainVal['mean_std']),3))
    print("Test R2 linear regression CMI best 5: ", round(lin_regr.score(X_test_CMI5, target_df_test['mean_std']),3), "\n")

    print('###### Binary Classification ######')

    # Classification target = sign of the standardized target. Kept under separate names so
    # the continuous target frames are not replaced part-way through the iteration.
    y_trainVal_sign = np.sign(target_df_trainVal['mean_std'])
    y_test_sign = np.sign(target_df_test['mean_std'])

    log_regr = LogisticRegression(solver='lbfgs', random_state = 42)
    log_regr.fit(X_train_validation_CMI5.values, y_trainVal_sign)
    print("Train accuracy logregr CMI best 5 for shuffle n." + str(i) + ": ", round(log_regr.score(X_train_validation_CMI5.values, y_trainVal_sign),3))
    print("Test accuracy logregr CMI best 5 for shuffle n." + str(i) + ": ", round(log_regr.score(X_test_CMI5.values, y_test_sign),3), "\n")
  # Optional: persist the per-basin aggregations and chosen features for plotting.
  #output_string = plots_folder + basin + '_aggregations.npy'
  #sel_col_string = plots_folder + basin + '_chosen_features.npy'
  #np.save(sel_col_string, selected_colnames_CMI5)
  #np.save(output_string, outputs)

####################Emiliani1####################
Number of features: 172

Number of aggregated features: 10

Number of features: 172

Number of aggregated features: 11

Number of features: 172

Number of aggregated features: 12

Number of features: 172

Number of aggregated features: 10

Number of features: 172

Number of aggregated features: 7

Number of features: 172

Number of aggregated features: 6

Number of features: 172

Number of aggregated features: 6

Number of features: 172

Number of aggregated features: 5

Number of features: 172

Number of aggregated features: 5

Number of features: 172

Number of aggregated features: 5

Number of features: 172

Number of aggregated features: 3

Number of features: 172

Number of aggregated features: 3

Number of features: 172

Number of aggregated features: 4

Number of features: 172

Number of aggregated features: 4

----- MI Scores -----
[(72, 0.17504095111370133), (77, 0.15685898782413568), (67, 0.15663937732758673), (80, 0.147879668

  arr = np.asanyarray(arr)


# Multi-task scores

In [None]:
# for binary classification 

from sklearn.metrics import accuracy_score
def MTL_scores(clust_basins, df_train, df_val, df_test, targets_df_train, targets_df_val, targets_df_test,
               model=None, score_fn=None):
    """Fit one pooled multi-task classifier for a cluster of basins and print per-basin test scores.

    The feature columns whose name starts with any basin in ``clust_basins`` are stacked once
    per basin; each stacked copy gets one indicator column per basin (1 for the basin the copy
    belongs to, 0 otherwise). A single model is fitted on train+val of all basins and then
    scored separately on the test rows of each basin.

    NOTE: this notebook defines ``MTL_scores`` twice (this classification version and a
    regression version in a later cell); whichever cell ran last is the one in effect. Pass
    ``model`` / ``score_fn`` explicitly to be independent of cell execution order.

    Args:
        clust_basins: sequence of basin names forming the cluster.
        df_train, df_val, df_test: feature frames; columns are selected by basin-name prefix.
        targets_df_train, targets_df_val, targets_df_test: frames with one target column per basin,
            row-aligned (by position) with the corresponding feature frame.
        model: estimator with ``fit(X, y)`` / ``predict(X)``. Defaults to
            ``LogisticRegression(max_iter=500)``.
        score_fn: callable ``score_fn(y_true, y_pred)``. Defaults to ``accuracy_score``.

    Returns:
        None. For each basin its name and its test score are printed.

    Raises:
        ValueError: if ``clust_basins`` is empty.
    """
    import warnings

    clust_basins = list(clust_basins)
    if not clust_basins:
        raise ValueError("clust_basins must contain at least one basin")

    colnames = [x for x in df_train.columns if x.startswith(tuple(clust_basins))]

    # Without matching columns the model would be fitted on the basin indicators alone and the
    # printed scores would say nothing about the features; make that visible.
    without_features = [b for b in clust_basins if not any(c.startswith(b) for c in colnames)]
    if without_features:
        warnings.warn(
            "MTL_scores: no feature column in df_train starts with: "
            + ", ".join(without_features)
            + "; the model will not see any feature of these basins.",
            stacklevel=2,
        )

    def stack_with_indicators(features):
        # One copy of the selected features per basin, each tagged with 0/1 indicator columns.
        # Scalar assignment keeps the rows intact whatever index the input frame carries.
        blocks = []
        for owner in clust_basins:
            block = features[colnames].copy()
            for name in clust_basins:
                block[name] = int(name == owner)
            blocks.append(block)
        return pd.concat(blocks, axis=0, ignore_index=True)

    def stack_targets(targets):
        # Targets in the same basin order as the stacked feature blocks.
        return np.concatenate([np.asarray(targets[name]).ravel() for name in clust_basins])

    X_fit = pd.concat((stack_with_indicators(df_train), stack_with_indicators(df_val)), axis=0).values
    y_fit = np.concatenate((stack_targets(targets_df_train), stack_targets(targets_df_val)))
    X_test = stack_with_indicators(df_test)

    # same scores changing the solver, some differences changing penalty, some improve with l1
    model_ohe = LogisticRegression(max_iter = 500) if model is None else model
    scorer = accuracy_score if score_fn is None else score_fn
    model_ohe.fit(X_fit, y_fit)

    for basin in clust_basins:
        print(basin)
        res = model_ohe.predict(X_test.loc[X_test[basin] == 1].values)
        print(scorer(targets_df_test[basin].values.ravel(), res))

In [None]:
### binary targets
# Binary targets: for every basin, call prepare_target_binary (imported from aux_GenLinCFA) and
# collect the `mean_std` column of each returned split into one frame per split, with one
# column per basin. The per-basin `target_df_*` variables are overwritten on every iteration.
basins = ['Emiliani1','Emiliani2','Garda_Mincio']
path_targets = "./csv/"
targets_df_train = pd.DataFrame()
targets_df_val = pd.DataFrame()
targets_df_test = pd.DataFrame()
targets_df_trainVal = pd.DataFrame()

for basin in basins:
    # Same split dates as the other cells of this notebook; the target CSV is ./csv/<basin>.csv.
    target_df_train,target_df_val,target_df_test,target_df_trainVal = prepare_target_binary('',max_train='2010-01-01', max_val='2015-01-01', 
                                                                                            max_test='2020-01-01', path=path_targets+basin+'.csv', 
                                                                                            threshold = None, nopeaks = False, window_size = 2)
    targets_df_train[basin] = target_df_train.mean_std
    targets_df_val[basin] = target_df_val.mean_std
    targets_df_test[basin] = target_df_test.mean_std
    targets_df_trainVal[basin] = target_df_trainVal.mean_std

In [16]:
# for linear regression 

def MTL_scores(clust_basins, df_train, df_val, df_test, targets_df_train, targets_df_val, targets_df_test,
               model=None, score_fn=None):
    """Fit one pooled multi-task regressor for a cluster of basins and print per-basin test scores.

    The feature columns whose name starts with any basin in ``clust_basins`` are stacked once
    per basin; each stacked copy gets one indicator column per basin (1 for the basin the copy
    belongs to, 0 otherwise). A single model is fitted on train+val of all basins and then
    scored separately on the test rows of each basin.

    NOTE: this notebook defines ``MTL_scores`` twice (a classification version in an earlier
    cell and this regression version); whichever cell ran last is the one in effect. Pass
    ``model`` / ``score_fn`` explicitly to be independent of cell execution order.

    Args:
        clust_basins: sequence of basin names forming the cluster.
        df_train, df_val, df_test: feature frames; columns are selected by basin-name prefix.
        targets_df_train, targets_df_val, targets_df_test: frames with one target column per basin,
            row-aligned (by position) with the corresponding feature frame.
        model: estimator with ``fit(X, y)`` / ``predict(X)``. Defaults to ``LinearRegression()``.
        score_fn: callable ``score_fn(y_true, y_pred)``. Defaults to ``r2_score``.

    Returns:
        None. For each basin its name and its test score are printed.

    Raises:
        ValueError: if ``clust_basins`` is empty.
    """
    import warnings

    clust_basins = list(clust_basins)
    if not clust_basins:
        raise ValueError("clust_basins must contain at least one basin")

    colnames = [x for x in df_train.columns if x.startswith(tuple(clust_basins))]

    # Without matching columns the model would be fitted on the basin indicators alone and the
    # printed scores would say nothing about the features; make that visible.
    without_features = [b for b in clust_basins if not any(c.startswith(b) for c in colnames)]
    if without_features:
        warnings.warn(
            "MTL_scores: no feature column in df_train starts with: "
            + ", ".join(without_features)
            + "; the model will not see any feature of these basins.",
            stacklevel=2,
        )

    def stack_with_indicators(features):
        # One copy of the selected features per basin, each tagged with 0/1 indicator columns.
        # Scalar assignment keeps the rows intact whatever index the input frame carries.
        blocks = []
        for owner in clust_basins:
            block = features[colnames].copy()
            for name in clust_basins:
                block[name] = int(name == owner)
            blocks.append(block)
        return pd.concat(blocks, axis=0, ignore_index=True)

    def stack_targets(targets):
        # Targets in the same basin order as the stacked feature blocks.
        return np.concatenate([np.asarray(targets[name]).ravel() for name in clust_basins])

    X_fit = pd.concat((stack_with_indicators(df_train), stack_with_indicators(df_val)), axis=0).values
    y_fit = np.concatenate((stack_targets(targets_df_train), stack_targets(targets_df_val)))
    X_test = stack_with_indicators(df_test)

    model_ohe = LinearRegression() if model is None else model
    scorer = r2_score if score_fn is None else score_fn
    model_ohe.fit(X_fit, y_fit)

    for basin in clust_basins:
        print(basin)
        res = model_ohe.predict(X_test.loc[X_test[basin] == 1].values)
        print(scorer(targets_df_test[basin].values.ravel(), res))

In [17]:
### continuous targets
# Continuous targets for all ten basins: call prepare_target (presumably provided by the
# star import from aux_NonLinCFA -- confirm) and collect the `mean_std` column of each
# returned split into one frame per split, with one column per basin.
# This rebinds `basins` and the `targets_df_*` frames created by the binary-target cell.
basins = ['Adda','Dora','Emiliani1','Emiliani2','Garda_Mincio','Lambro_Olona','Oglio_Iseo','Piemonte_Nord','Piemonte_Sud','Ticino']
path_targets = "./csv/"
targets_df_train = pd.DataFrame()
targets_df_val = pd.DataFrame()
targets_df_test = pd.DataFrame()
targets_df_trainVal = pd.DataFrame()

for basin in basins:
    # The target CSV is ./csv/<basin>.csv; the per-basin `target_df_*` variables are
    # overwritten on every iteration.
    target_df_train,target_df_val,target_df_test,target_df_trainVal = prepare_target('',max_train='2010-01-01', max_val='2015-01-01', 
                                                                                     max_test='2020-01-01', path=path_targets+basin+'.csv', 
                                                                                     window_size = 1)
    targets_df_train[basin] = target_df_train.mean_std
    targets_df_val[basin] = target_df_val.mean_std
    targets_df_test[basin] = target_df_test.mean_std
    targets_df_trainVal[basin] = target_df_trainVal.mean_std

In [20]:
# Cluster whose best-5 CMI features are loaded; the commented lines are the other two clusters.
# NOTE(review): only the active cluster ends up in best5_CMI_fulldf_*. MTL_scores selects its
# feature columns by basin-name prefix, so calling it for another cluster without re-running
# this cell with that cluster's basins finds no feature column at all.
basins = ['Garda_Mincio', 'Emiliani1', 'Emiliani2']
# basins = ['Dora','Piemonte_Sud', 'Piemonte_Nord']
# basins = ['Adda', 'Lambro_Olona', 'Oglio_Iseo', 'Ticino']

### CMI best5 features
path_features = './NonLinCFA/temp_prec_inverted_for/'

best5_CMI_fulldf_train = pd.DataFrame()
best5_CMI_fulldf_val = pd.DataFrame()
best5_CMI_fulldf_test = pd.DataFrame()

for basin in basins:
    # File names carry a double underscore between the basin name and the suffix.
    train_temp = pd.read_csv(path_features+basin+'__nonLinCFA_best5_CMI_train.csv')
    val_temp = pd.read_csv(path_features+basin+'__nonLinCFA_best5_CMI_val.csv')
    test_temp = pd.read_csv(path_features+basin+'__nonLinCFA_best5_CMI_test.csv')
    # Copy every column into the cluster-wide frame under the name '<basin>_<original column>'.
    best5_CMI_fulldf_train[basin+'_'+train_temp.columns.values] = train_temp
    best5_CMI_fulldf_val[basin+'_'+val_temp.columns.values] = val_temp
    best5_CMI_fulldf_test[basin+'_'+test_temp.columns.values] = test_temp

In [21]:
# Single-basin baseline: linear regression on the best-5 CMI features of one basin.
# NOTE(review): `train_temp`, `val_temp` and `basin` are whatever the loading loop in the
# previous cell left behind, i.e. the last basin of its `basins` list -- hidden state.
lin_regr = LinearRegression()
train_val = pd.concat([train_temp,val_temp])
lin_regr.fit(train_val, targets_df_trainVal[basin])
print("Train R2 linear regression CMI best 5: ", round(lin_regr.score(train_val, targets_df_trainVal[basin]),3))
#print("Valid R2 linear regression CMI best 5: ", round(lin_regr.score(X_valid, y_valid),3))

Train R2 linear regression CMI best 5:  0.208


In [23]:
# Multi-task scores for the cluster whose features the loading cell above reads.
MTL_scores(clust_basins=['Emiliani1','Emiliani2','Garda_Mincio'], df_train=best5_CMI_fulldf_train, df_val=best5_CMI_fulldf_val, df_test=best5_CMI_fulldf_test, targets_df_train=targets_df_train, targets_df_val=targets_df_val, targets_df_test=targets_df_test)

Emiliani1
0.4008771698941773
Emiliani2
0.30430115186516615
Garda_Mincio
0.2796271561379128


In [24]:
# NOTE(review): the loading cell as shown only reads features for Garda_Mincio, Emiliani1 and
# Emiliani2. Unless it was re-run with this cluster's basins, no column of best5_CMI_fulldf_*
# matches and the model is fitted on the basin indicators alone -- confirm before using these scores.
MTL_scores(clust_basins=['Dora','Piemonte_Sud', 'Piemonte_Nord'], df_train=best5_CMI_fulldf_train, df_val=best5_CMI_fulldf_val, df_test=best5_CMI_fulldf_test, targets_df_train=targets_df_train, targets_df_val=targets_df_val, targets_df_test=targets_df_test)

Dora
-0.03526230960408849
Piemonte_Sud
-0.006262944919169566
Piemonte_Nord
-0.0027953824162572083


In [25]:
# NOTE(review): the loading cell as shown only reads features for Garda_Mincio, Emiliani1 and
# Emiliani2. Unless it was re-run with this cluster's basins, no column of best5_CMI_fulldf_*
# matches and the model is fitted on the basin indicators alone -- confirm before using these scores.
MTL_scores(clust_basins=['Adda', 'Lambro_Olona', 'Oglio_Iseo', 'Ticino'], df_train=best5_CMI_fulldf_train, df_val=best5_CMI_fulldf_val, df_test=best5_CMI_fulldf_test, targets_df_train=targets_df_train, targets_df_val=targets_df_val, targets_df_test=targets_df_test)

Adda
-0.0029049821434274925
Lambro_Olona
-0.0036278021269215976
Oglio_Iseo
-0.017553155607163307
Ticino
-0.014508782688654298
