# Preliminaries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import sys
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

sys.path.append("/Users/paolo/Documents/methods/CMI_FS")
from feature_selection import forwardFeatureSelection

sys.path.append("/Users/paolo/Documents/methods/LinCFA")
from LinCFA import LinCFA

sys.path.append("/Users/paolo/Documents/Droughts/Paolo/regression_LinCFA")
from aux import standardize,unfold_dataset,compute_r2,prepare_target,prepare_features,aggregate_unfolded_data,FS_with_linearWrapper,compare_methods,standardize


In [2]:
def plot_cells(output,selected_colnames, xmin=9, xmax=11, ymin=44, ymax=45.5):
    x = []
    y = []
    colors = cm.rainbow(np.linspace(0,1,len(output)))
    np.random.shuffle(colors)
    fig, ax = plt.subplots(2)
    ax[0].set_xlim(xmin,xmax)
    ax[1].set_xlim(xmin,xmax)
    ax[0].set_ylim(ymin,ymax)
    ax[1].set_ylim(ymin,ymax)
    for i in range(len(output)): 
        #print(len(output[i]))
        x = []
        y = []
        
        for datum in output[i]:
            x.append(float(datum.split('_')[1]))
            y.append(float(datum.split('_')[2]))
        ax[0].scatter(x,y,color=colors[i])
    
    x = []
    y = []
    col = cm.rainbow(np.linspace(0,1,len(selected_colnames)))
    for i in range(len(selected_colnames)): 
        idx = int(selected_colnames[i].split('_')[-1])
        for datum in output[idx]:
            x.append(float(datum.split('_')[1]))
            y.append(float(datum.split('_')[2]))
        ax[1].scatter(x,y,color=col[i])
    

# Target 

In [3]:
target_df = pd.read_csv('/Users/paolo/Documents/OneDrive - Politecnico di Milano/droughts/csv_VHI/Emiliani2.csv')
target_df = target_df.rename({'Unnamed: 0':'date'},axis=1)
target_df
### va ancora standardizzato

Unnamed: 0,date,mean,median,year,week
0,2001-01-05,0.214281,0.00,2001,1
1,2001-01-13,0.484737,0.52,2001,2
2,2001-01-21,0.466071,0.47,2001,3
3,2001-01-29,0.417470,0.44,2001,5
4,2001-02-06,0.492202,0.53,2001,6
...,...,...,...,...,...
983,2022-07-18,0.137749,0.11,2022,29
984,2022-07-26,0.119336,0.08,2022,30
985,2022-08-03,0.175099,0.15,2022,31
986,2022-08-11,0.227789,0.19,2022,32


# Features of entire basins

## Emiliani2

In [4]:
path='/Users/paolo/Documents/OneDrive - Politecnico di Milano/droughts/features/csv_mean/features_with_aggregations.csv'

df_emil = pd.read_csv(path)
df_emil = df_emil.loc[:,['date','cyclostationary_mean_Emiliani2_rr',
                        'cyclostationary_mean_Emiliani2_rr_1w',
                        'cyclostationary_mean_Emiliani2_rr_4w',
                        'cyclostationary_mean_Emiliani2_rr_8w',
                        'cyclostationary_mean_Emiliani2_rr_12w',
                        'cyclostationary_mean_Emiliani2_rr_16w',
                        'cyclostationary_mean_Emiliani2_rr_24w',
                        'cyclostationary_mean_Emiliani2_tg',
                        'cyclostationary_mean_Emiliani2_tg_1w',
                        'cyclostationary_mean_Emiliani2_tg_4w',
                        'cyclostationary_mean_Emiliani2_tg_8w',
                        'cyclostationary_mean_Emiliani2_tg_12w',
                        'cyclostationary_mean_Emiliani2_tg_16w',
                        'cyclostationary_mean_Emiliani2_tg_24w']]

full_df_emil = df_emil.set_index('date').join(target_df.set_index('date').loc[:,'mean'],on='date')
full_df_emil = full_df_emil
full_df_emil = full_df_emil.sample(frac=1,random_state=42).reset_index().iloc[:,1:]
full_df_emil = (full_df_emil-full_df_emil[0:587].mean())/full_df_emil[0:587].std()
full_df_emil = full_df_emil.rename(columns={'mean':'mean_std'})
full_df_emil

Unnamed: 0,cyclostationary_mean_Emiliani2_rr,cyclostationary_mean_Emiliani2_rr_1w,cyclostationary_mean_Emiliani2_rr_4w,cyclostationary_mean_Emiliani2_rr_8w,cyclostationary_mean_Emiliani2_rr_12w,cyclostationary_mean_Emiliani2_rr_16w,cyclostationary_mean_Emiliani2_rr_24w,cyclostationary_mean_Emiliani2_tg,cyclostationary_mean_Emiliani2_tg_1w,cyclostationary_mean_Emiliani2_tg_4w,cyclostationary_mean_Emiliani2_tg_8w,cyclostationary_mean_Emiliani2_tg_12w,cyclostationary_mean_Emiliani2_tg_16w,cyclostationary_mean_Emiliani2_tg_24w,mean_std
0,0.524302,0.157558,1.093683,1.104316,0.939520,1.369578,0.325633,0.186608,0.149698,-0.535308,-1.505113,-0.479228,-0.358055,-0.508335,0.463787
1,-1.161748,-1.682053,0.126582,-0.657991,-0.787927,-0.678756,-0.225455,-1.206555,-1.211424,-1.469446,-0.488423,0.292782,-0.294743,-0.103686,-0.479136
2,1.215539,0.554803,0.360345,0.283570,1.063649,0.692451,0.032648,0.517479,0.470635,0.110337,0.551733,-0.273130,-0.077799,0.685891,-0.067946
3,0.376869,-0.295749,-0.163961,0.800103,1.439970,2.091421,2.096346,-2.421726,-0.806943,-1.636368,-1.338809,-1.508013,-1.832927,-1.522427,-0.727431
4,-0.417147,-0.257116,0.054241,0.342975,-0.333173,-0.374726,-1.400288,-0.485484,-0.389354,-0.420201,-1.161823,-0.622394,-0.149486,-0.995921,0.248559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,-0.916202,-1.244376,-0.650640,-1.203343,-1.461452,-0.885803,-0.419054,0.521754,0.334061,-0.732691,-0.731812,-1.085905,-0.688233,-0.154488,-1.479312
977,-1.889644,-0.997692,-2.450834,-2.071970,-1.172678,-0.487388,-0.789769,0.646330,0.984838,0.496367,0.647662,0.253260,-0.131624,-0.078438,-0.538063
978,-0.111602,-0.605914,-0.488927,-0.657819,-0.451612,-0.220726,0.840648,0.672917,1.636813,2.167862,2.284608,2.232513,2.487259,1.224023,0.393759
979,-0.175273,-0.285777,1.096019,0.508300,0.749960,0.721446,0.952282,1.080765,1.064468,0.027597,-0.128616,-0.031113,-0.367089,-0.638149,0.671123


In [5]:
### wrapper FS, linear regression

selected_colnames = FS_with_linearWrapper(full_df_emil.iloc[0:784,:-1], full_df_emil.iloc[0:587], full_df_emil.iloc[587:784], max_feat=12 , val_len=197)
compare_methods(full_df_emil.iloc[0:784,:-1], full_df_emil.iloc[784:,:-1], full_df_emil.iloc[0:784], full_df_emil.iloc[784:], selected_colnames)


actual training score: 0.17381104087702315
actual validation score: 0.1598897316945469, number of remaining columns: 12

actual training score: 0.1765820656802578
actual validation score: 0.17863144103703354, number of remaining columns: 11

actual training score: 0.1938121549106997
actual validation score: 0.1902243973995006, number of remaining columns: 10

actual training score: 0.1943787825690343
actual validation score: 0.18954114986997395, number of remaining columns: 9

actual training score: 0.19686083364675722
actual validation score: 0.18780475315272926, number of remaining columns: 8

actual training score: 0.19686366346108541
actual validation score: 0.18761812788593735, number of remaining columns: 7

actual training score: 0.1968676040447226
actual validation score: 0.18724922023813684, number of remaining columns: 6

actual training score: 0.20519532516024674
actual validation score: 0.18110113215657142, number of remaining columns: 5

actual training score: 0.2063022024

In [6]:
### CMI FS, linear regression

res = {
    "delta" : [], # list with all deltas
    "numSelected" : [], #
    "selectedFeatures" : [] 
}

res['selectedFeatures'] = forwardFeatureSelection(10,np.array(full_df_emil.iloc[0:784,:-1]),np.array(full_df_emil.iloc[0:784,-1]),res,10,1)
selectedFeatures='selectedFeatures'
print(f'\n{res[selectedFeatures]}\n')
selected_colnames = full_df_emil.iloc[0:784,:-1].columns[res['selectedFeatures']]
compare_methods(full_df_emil.iloc[0:784,:-1], full_df_emil.iloc[784:,:-1], full_df_emil.iloc[0:784], full_df_emil.iloc[784:], selected_colnames)


----- MI Scores -----
[(8, 0.10677308044457821), (7, 0.07998379304570234), (4, 0.07568326651673606), (2, 0.0707690093560488), (0, 0.06681270116458896), (3, 0.06611643096346552), (1, 0.058977132394013), (5, 0.05272266190975899), (9, 0.05071543878043399), (6, 0.024314421910493933), (11, 0.022846847681457682), (10, 0.02160972710057967), (13, 0.02026627050130758), (12, 0.015021149115334188)]
Best MI score: 0.10677308044457821
Adding first best original feature: 8
CMI: 0.01904473341615727
CMI: 0.02758615011876482
CMI: 0.044616852132317905
CMI: 0.06624167323971823
CMI: 0.04575812555935198
CMI: 0.029118158141289138
CMI: 0.01151758323873553
CMI: 0.004205476190006691
CMI: 0.0029593693038672403
CMI: 0.0163007354102217
CMI: 0.0022666848944239704
Highest CMI score: 0.06624167323971823
Adding original feature: 3
CMI: 0.007889197252995112
CMI: 0.008674231438313945
CMI: 0.01725593486305943
CMI: 0.008223682412345834
CMI: 0.0022632118889105157
CMI: 0.032793339874439636
CMI: 0.03046836772971484
CMI: 0.0

## All

In [4]:
path='/Users/paolo/Documents/OneDrive - Politecnico di Milano/droughts/features/csv_mean/features_with_aggregations.csv'

df = pd.read_csv(path)

full_df = df.set_index('date').join(target_df.set_index('date').loc[:,'mean'],on='date')
full_df = full_df
full_df = full_df.sample(frac=1,random_state=42).reset_index().iloc[:,1:]
full_df = (full_df-full_df[0:587].mean())/full_df[0:587].std()
full_df = full_df.rename(columns={'mean':'mean_std'})
full_df

Unnamed: 0,cyclostationary_mean_Adda_tg,cyclostationary_mean_Dora_tg,cyclostationary_mean_Emiliani1_tg,cyclostationary_mean_Piemonte_Sud_tg,cyclostationary_mean_Piemonte_Nord_tg,cyclostationary_mean_Oglio_Iseo_tg,cyclostationary_mean_Ticino_tg,cyclostationary_mean_Garda_Mincio_tg,cyclostationary_mean_Lambro_Olona_tg,cyclostationary_mean_Emiliani2_tg,...,cyclostationary_mean_Lambro_Olona_rr_12w,cyclostationary_mean_Lambro_Olona_rr_16w,cyclostationary_mean_Lambro_Olona_rr_24w,cyclostationary_mean_Emiliani2_rr_1w,cyclostationary_mean_Emiliani2_rr_4w,cyclostationary_mean_Emiliani2_rr_8w,cyclostationary_mean_Emiliani2_rr_12w,cyclostationary_mean_Emiliani2_rr_16w,cyclostationary_mean_Emiliani2_rr_24w,mean_std
0,0.006824,0.080891,0.596731,-0.160059,0.137428,0.095889,-0.274535,0.526869,0.054171,0.186608,...,0.727145,0.970206,-0.144642,0.157558,1.093683,1.104316,0.939520,1.369578,0.325633,0.463787
1,-0.357125,0.293655,-1.610214,-0.147668,-0.496976,-0.987415,-0.048956,-1.448620,-1.558525,-1.206555,...,-0.415499,-0.141138,0.380466,-1.682053,0.126582,-0.657991,-0.787927,-0.678756,-0.225455,-0.479136
2,-0.085151,-0.383801,1.129743,0.279558,0.274454,0.281178,-0.054993,0.564053,0.673981,0.517479,...,0.830621,0.594429,-0.104474,0.554803,0.360345,0.283570,1.063649,0.692451,0.032648,-0.067946
3,-2.366818,-2.368039,-2.542257,-2.373116,-2.270612,-2.448647,-2.375566,-2.367601,-2.179528,-2.421726,...,1.875842,1.904663,1.314706,-0.295749,-0.163961,0.800103,1.439970,2.091421,2.096346,-0.727431
4,-0.356840,-0.020777,-0.521238,-0.439547,-0.204214,-0.343638,-0.176151,-0.328871,-0.383372,-0.485484,...,0.843995,1.310773,0.209057,-0.257116,0.054241,0.342975,-0.333173,-0.374726,-1.400288,0.248559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,0.332998,0.296616,0.746974,0.382502,0.279021,0.459944,0.301819,0.387113,0.259188,0.521754,...,-1.660833,-1.473781,-0.277854,-1.244376,-0.650640,-1.203343,-1.461452,-0.885803,-0.419054,-1.479312
977,1.080595,1.477705,0.386989,1.326431,1.368809,1.000030,1.295577,0.799559,1.067010,0.646330,...,-0.823433,-0.516489,-1.037792,-0.997692,-2.450834,-2.071970,-1.172678,-0.487388,-0.789769,-0.538063
978,-0.182650,-0.185003,0.111882,0.643581,0.391294,0.029811,-0.129098,0.072135,0.531165,0.672917,...,1.375164,0.998210,0.870613,-0.605914,-0.488927,-0.657819,-0.451612,-0.220726,0.840648,0.393759
979,1.626584,1.546044,0.968828,1.266546,1.420689,1.686314,1.750038,1.248080,1.184139,1.080765,...,0.242012,0.414568,0.494937,-0.285777,1.096019,0.508300,0.749960,0.721446,0.952282,0.671123


In [5]:
### wrapper FS, linear regression

selected_colnames = FS_with_linearWrapper(full_df.iloc[0:784,:-1], full_df.iloc[0:587], full_df.iloc[587:784], max_feat=50 , val_len=197)
compare_methods(full_df.iloc[0:784,:-1], full_df.iloc[784:,:-1], full_df.iloc[0:784], full_df.iloc[784:], selected_colnames)


actual training score: 0.19102362878485557
actual validation score: 0.19350904777114175, number of remaining columns: 138

actual training score: 0.1977056464882181
actual validation score: 0.22179924614012436, number of remaining columns: 137

actual training score: 0.20807516110200608
actual validation score: 0.2396065074085464, number of remaining columns: 136

actual training score: 0.2157560108637726
actual validation score: 0.24849183370483519, number of remaining columns: 135

actual training score: 0.22071948298632627
actual validation score: 0.25344508952474243, number of remaining columns: 134

actual training score: 0.22180723272519887
actual validation score: 0.2576345967559206, number of remaining columns: 133

actual training score: 0.23367638101787724
actual validation score: 0.26621563074742294, number of remaining columns: 132

actual training score: 0.2351339875533136
actual validation score: 0.27097192220775324, number of remaining columns: 131

actual training score

In [6]:
### CMI FS, linear regression

res = {
    "delta" : [], # list with all deltas
    "numSelected" : [], #
    "selectedFeatures" : [] 
}

res['selectedFeatures'] = forwardFeatureSelection(10,np.array(full_df.iloc[0:784,:-1]),np.array(full_df.iloc[0:784,-1]),res,10,1)
selectedFeatures='selectedFeatures'
print(f'\n{res[selectedFeatures]}\n')
selected_colnames = full_df.iloc[0:784,:-1].columns[res['selectedFeatures']]
compare_methods(full_df.iloc[0:784,:-1], full_df.iloc[784:,:-1], full_df.iloc[0:784], full_df.iloc[784:], selected_colnames)


----- MI Scores -----
[(68, 0.12000571547026052), (44, 0.11196962226646494), (38, 0.1072229147971109), (74, 0.10677308044457821), (93, 0.10675943690342135), (20, 0.10355586263873504), (92, 0.10086591322800217), (56, 0.09866145041396812), (6, 0.08773596694503226), (129, 0.08649024765350415), (130, 0.08400687106386533), (8, 0.08390900183096155), (5, 0.0826914728426489), (4, 0.08072865495173999), (124, 0.08069341270139493), (9, 0.07998379304570234), (3, 0.07868858777334835), (26, 0.07604351868366817), (137, 0.07568326651673606), (133, 0.07448582530901979), (62, 0.0735625739887098), (0, 0.07282420220752643), (50, 0.07270322125082382), (135, 0.0707690093560488), (82, 0.07019565957609351), (2, 0.06969696381609347), (1, 0.06927999529385478), (102, 0.06927975430611882), (122, 0.06704982995599282), (19, 0.06681270116458896), (96, 0.06667272272337936), (136, 0.06611643096346552), (33, 0.06543560390346351), (131, 0.06541802682289403), (123, 0.06396214398905382), (99, 0.06330594319379951), (32, 0.

### not considering last years

In [7]:
### emiliani2

In [14]:
path='/Users/paolo/Documents/OneDrive - Politecnico di Milano/droughts/features/csv_mean/features_with_aggregations.csv'

df_emil = pd.read_csv(path)
df_emil = df_emil.loc[:,['date','cyclostationary_mean_Emiliani2_rr',
                        'cyclostationary_mean_Emiliani2_rr_1w',
                        'cyclostationary_mean_Emiliani2_rr_4w',
                        'cyclostationary_mean_Emiliani2_rr_8w',
                        'cyclostationary_mean_Emiliani2_rr_12w',
                        'cyclostationary_mean_Emiliani2_rr_16w',
                        'cyclostationary_mean_Emiliani2_rr_24w',
                        'cyclostationary_mean_Emiliani2_tg',
                        'cyclostationary_mean_Emiliani2_tg_1w',
                        'cyclostationary_mean_Emiliani2_tg_4w',
                        'cyclostationary_mean_Emiliani2_tg_8w',
                        'cyclostationary_mean_Emiliani2_tg_12w',
                        'cyclostationary_mean_Emiliani2_tg_16w',
                        'cyclostationary_mean_Emiliani2_tg_24w']]

full_df_emil = df_emil.set_index('date').join(target_df.set_index('date').loc[:,'mean'],on='date')
full_df_emil = full_df_emil[:-114]
full_df_emil = full_df_emil.sample(frac=1,random_state=42).reset_index().iloc[:,1:]
full_df_emil = (full_df_emil-full_df_emil[0:587].mean())/full_df_emil[0:522].std()
full_df_emil = full_df_emil.rename(columns={'mean':'mean_std'})
full_df_emil

Unnamed: 0,cyclostationary_mean_Emiliani2_rr,cyclostationary_mean_Emiliani2_rr_1w,cyclostationary_mean_Emiliani2_rr_4w,cyclostationary_mean_Emiliani2_rr_8w,cyclostationary_mean_Emiliani2_rr_12w,cyclostationary_mean_Emiliani2_rr_16w,cyclostationary_mean_Emiliani2_rr_24w,cyclostationary_mean_Emiliani2_tg,cyclostationary_mean_Emiliani2_tg_1w,cyclostationary_mean_Emiliani2_tg_4w,cyclostationary_mean_Emiliani2_tg_8w,cyclostationary_mean_Emiliani2_tg_12w,cyclostationary_mean_Emiliani2_tg_16w,cyclostationary_mean_Emiliani2_tg_24w,mean_std
0,3.137763,1.253949,2.181901,0.468638,-0.198191,-0.381265,0.776173,-1.731953,-0.912991,0.037496,0.020651,0.107443,0.042016,-0.464475,-0.659768
1,-0.011466,0.017953,0.072129,0.448192,0.858997,0.765849,0.479436,0.154758,0.454989,-0.325855,-0.907882,-0.964694,-0.776038,-0.580594,-0.291700
2,-0.329903,-0.545009,-0.778419,-0.187246,-0.341337,-0.573176,-1.138025,1.398985,0.596533,0.269121,-0.464928,-0.516485,0.045693,-0.754607,-0.424929
3,-0.459953,-0.545016,0.059672,0.101762,-0.269854,-0.678169,-0.611521,1.014296,1.077430,1.424495,0.539859,0.312970,0.038473,0.261657,-0.764559
4,-0.254981,-0.447667,-0.645477,-0.313133,-0.770727,-0.170846,0.202178,1.329270,1.010015,0.753830,0.064039,0.477932,0.385753,0.304312,-0.775374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,-0.920180,-1.299560,-0.701741,-1.294808,-1.530390,-0.919833,-0.437819,0.562016,0.407474,-0.689512,-0.698752,-1.050392,-0.649898,-0.098451,-1.542125
863,-1.879523,-1.049745,-2.620077,-2.225628,-1.229755,-0.511595,-0.806448,0.686745,1.074422,0.568207,0.692155,0.300246,-0.084478,-0.019234,-0.599071
864,-0.127234,-0.652994,-0.529415,-0.710224,-0.479075,-0.238358,0.814797,0.713365,1.742598,2.278680,2.342668,2.296453,2.575864,1.337449,0.334537
865,-0.189983,-0.328794,1.159546,0.539387,0.771848,0.727042,0.925804,1.121717,1.156031,0.088506,-0.090556,0.013436,-0.323670,-0.602247,0.612434


In [18]:
### wrapper FS, linear regression

selected_colnames = FS_with_linearWrapper(full_df_emil.iloc[0:696,:-1], full_df_emil.iloc[0:522], full_df_emil.iloc[522:696], max_feat=12 , val_len=174)
compare_methods(full_df_emil.iloc[0:696,:-1], full_df_emil.iloc[696:,:-1], full_df_emil.iloc[0:696], full_df_emil.iloc[696:], selected_colnames)


actual training score: 0.19207909117865352
actual validation score: 0.20525185197614915, number of remaining columns: 12

actual training score: 0.20522479304882457
actual validation score: 0.2228733815190136, number of remaining columns: 11

actual training score: 0.21511622817393383
actual validation score: 0.23449632537562248, number of remaining columns: 10

actual training score: 0.21525595918709983
actual validation score: 0.23665104026055783, number of remaining columns: 9

actual training score: 0.2155552073519167
actual validation score: 0.23738719114008966, number of remaining columns: 8

actual training score: 0.21580491912129685
actual validation score: 0.23782823219082838, number of remaining columns: 7

actual training score: 0.2169371299044841
actual validation score: 0.23554419959816897, number of remaining columns: 6

actual training score: 0.2256468809397436
actual validation score: 0.23217121166979848, number of remaining columns: 5

actual training score: 0.23346728

In [19]:
### CMI FS, linear regression

res = {
    "delta" : [], # list with all deltas
    "numSelected" : [], #
    "selectedFeatures" : [] 
}

res['selectedFeatures'] = forwardFeatureSelection(10,np.array(full_df_emil.iloc[0:696,:-1]),np.array(full_df_emil.iloc[0:696,-1]),res,10,1)
selectedFeatures='selectedFeatures'
print(f'\n{res[selectedFeatures]}\n')
selected_colnames = full_df_emil.iloc[0:696,:-1].columns[res['selectedFeatures']]
compare_methods(full_df_emil.iloc[0:696,:-1], full_df_emil.iloc[696:,:-1], full_df_emil.iloc[0:696], full_df_emil.iloc[696:], selected_colnames)


----- MI Scores -----
[(8, 0.10785149396760195), (7, 0.0925598801150079), (4, 0.08573739171513929), (3, 0.06919918548391668), (1, 0.06846795273706198), (2, 0.06801108095247393), (0, 0.062210882090445334), (5, 0.0490032047453643), (9, 0.045858418932004714), (6, 0.02590810773838242), (10, 0.025728206674913337), (11, 0.017848888073754034), (12, 0.010638385809115239), (13, 0.006162598157561256)]
Best MI score: 0.10785149396760195
Adding first best original feature: 8
CMI: 0.028897808187487314
CMI: 0.0459666787351949
CMI: 0.05127150084619923
CMI: 0.07830366322577818
CMI: 0.07029450796278795
CMI: 0.05542789877949962
CMI: 0.03009536146948509
CMI: 0.01669249553935731
CMI: 0.020169126521406386
CMI: 0.023260041967649983
CMI: 0.008706104840388099
Highest CMI score: 0.07830366322577818
Adding original feature: 3
CMI: 0.005326730005477093
CMI: 0.007298483191220562
CMI: 0.02557754808247603
CMI: 0.011856590857385413
CMI: 0.017954700737115192
CMI: 0.008096350040648154
CMI: 0.026943284884130836
CMI: 0.

In [20]:
### all

In [21]:
path='/Users/paolo/Documents/OneDrive - Politecnico di Milano/droughts/features/csv_mean/features_with_aggregations.csv'

df = pd.read_csv(path)

full_df = df.set_index('date').join(target_df.set_index('date').loc[:,'mean'],on='date')
full_df = full_df[:-114]
full_df = full_df.sample(frac=1,random_state=42).reset_index().iloc[:,1:]
full_df = (full_df-full_df[0:587].mean())/full_df[0:522].std()
full_df = full_df.rename(columns={'mean':'mean_std'})
full_df

Unnamed: 0,cyclostationary_mean_Adda_tg,cyclostationary_mean_Dora_tg,cyclostationary_mean_Emiliani1_tg,cyclostationary_mean_Piemonte_Sud_tg,cyclostationary_mean_Piemonte_Nord_tg,cyclostationary_mean_Oglio_Iseo_tg,cyclostationary_mean_Ticino_tg,cyclostationary_mean_Garda_Mincio_tg,cyclostationary_mean_Lambro_Olona_tg,cyclostationary_mean_Emiliani2_tg,...,cyclostationary_mean_Lambro_Olona_rr_12w,cyclostationary_mean_Lambro_Olona_rr_16w,cyclostationary_mean_Lambro_Olona_rr_24w,cyclostationary_mean_Emiliani2_rr_1w,cyclostationary_mean_Emiliani2_rr_4w,cyclostationary_mean_Emiliani2_rr_8w,cyclostationary_mean_Emiliani2_rr_12w,cyclostationary_mean_Emiliani2_rr_16w,cyclostationary_mean_Emiliani2_rr_24w,mean_std
0,-1.758187,-2.033375,-1.221340,-2.029975,-1.869988,-1.600931,-1.646392,-1.355072,-1.816625,-1.731953,...,0.042774,-0.127681,1.183497,1.253949,2.181901,0.468638,-0.198191,-0.381265,0.776173,-0.659768
1,-0.596105,-0.665636,0.480257,-0.560292,-0.762011,-0.237416,-0.653932,-0.039245,-0.426897,0.154758,...,0.537901,0.627999,0.394173,0.017953,0.072129,0.448192,0.858997,0.765849,0.479436,-0.291700
2,1.170615,1.514403,1.364940,1.469126,1.460958,1.451657,1.413544,1.541356,1.312803,1.398985,...,0.765946,0.899758,0.503096,-0.545009,-0.778419,-0.187246,-0.341337,-0.573176,-1.138025,-0.424929
3,0.513674,0.709105,1.066441,1.440118,1.212746,0.474372,0.741635,0.707087,1.463315,1.014296,...,-0.469616,-0.903569,-0.672614,-0.545016,0.059672,0.101762,-0.269854,-0.678169,-0.611521,-0.764559
4,1.346417,1.397291,1.089196,1.336848,1.430618,1.454830,1.496827,1.398917,1.478756,1.329270,...,-0.413923,0.456834,0.736452,-0.447667,-0.645477,-0.313133,-0.770727,-0.170846,0.202178,-0.775374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,0.350608,0.348667,0.766686,0.442571,0.332239,0.471640,0.333891,0.403558,0.305765,0.562016,...,-1.673113,-1.494331,-0.253450,-1.299560,-0.701741,-1.294808,-1.530390,-0.919833,-0.437819,-1.542125
863,1.096878,1.559192,0.409483,1.409023,1.440078,1.010069,1.333960,0.812876,1.116827,0.686745,...,-0.819908,-0.508847,-1.023851,-1.049745,-2.620077,-2.225628,-1.229755,-0.511595,-0.806448,-0.599071
864,-0.164125,-0.144956,0.136503,0.709879,0.446372,0.042827,-0.099763,0.090968,0.578833,0.713365,...,1.420184,1.050460,0.910831,-0.652994,-0.529415,-0.710224,-0.479075,-0.238358,0.814797,0.334537
865,1.641897,1.629234,0.986825,1.347709,1.492818,1.694247,1.791308,1.257996,1.234425,1.121717,...,0.265646,0.449629,0.529982,-0.328794,1.159546,0.539387,0.771848,0.727042,0.925804,0.612434


In [22]:
### wrapper FS, linear regression

selected_colnames = FS_with_linearWrapper(full_df.iloc[0:696,:-1], full_df.iloc[0:522], full_df.iloc[522:696], max_feat=12 , val_len=174)
compare_methods(full_df.iloc[0:696,:-1], full_df.iloc[696:,:-1], full_df.iloc[0:696], full_df.iloc[696:], selected_colnames)


actual training score: 0.20378426048248366
actual validation score: 0.2427999631369473, number of remaining columns: 138

actual training score: 0.2071742141923142
actual validation score: 0.25420934405757545, number of remaining columns: 137

actual training score: 0.22806836001534014
actual validation score: 0.26825654281206346, number of remaining columns: 136

actual training score: 0.2412188981512392
actual validation score: 0.2830869009771929, number of remaining columns: 135

actual training score: 0.24461617928422774
actual validation score: 0.2986721151198445, number of remaining columns: 134

actual training score: 0.24710993381251167
actual validation score: 0.3151158528836042, number of remaining columns: 133

actual training score: 0.2513158109754534
actual validation score: 0.3252643237897356, number of remaining columns: 132

actual training score: 0.2551025844651862
actual validation score: 0.3285914256350678, number of remaining columns: 131

actual training score: 0.2

In [23]:
### CMI FS, linear regression

res = {
    "delta" : [], # list with all deltas
    "numSelected" : [], #
    "selectedFeatures" : [] 
}

res['selectedFeatures'] = forwardFeatureSelection(10,np.array(full_df.iloc[0:696,:-1]),np.array(full_df.iloc[0:696,-1]),res,10,1)
selectedFeatures='selectedFeatures'
print(f'\n{res[selectedFeatures]}\n')
selected_colnames = full_df.iloc[0:696,:-1].columns[res['selectedFeatures']]
compare_methods(full_df.iloc[0:696,:-1], full_df.iloc[696:,:-1], full_df.iloc[0:696], full_df.iloc[696:], selected_colnames)


----- MI Scores -----
[(38, 0.12525279788700655), (20, 0.1242244889241374), (68, 0.11756652913766734), (93, 0.11418231052810235), (44, 0.10940977531524726), (56, 0.1089491031168935), (74, 0.10785149396760195), (6, 0.10396690353784173), (8, 0.10212088871487965), (92, 0.10162089330494874), (26, 0.09717563121719888), (0, 0.09529656706491889), (3, 0.09499684951475833), (9, 0.0925598801150079), (122, 0.0924392394698762), (62, 0.08989574790286198), (5, 0.08886401753593759), (50, 0.08859160318190125), (1, 0.08794628031977536), (137, 0.08573739171513929), (102, 0.08419695208528334), (4, 0.082902421916586), (131, 0.08258842484478227), (130, 0.08144567912680117), (7, 0.07974300242724029), (124, 0.0795134513128012), (94, 0.07602292850573968), (32, 0.07206694722109043), (39, 0.07204003574375592), (82, 0.07048269345880516), (101, 0.07002870050115716), (136, 0.06919918548391668), (134, 0.06846795273706198), (12, 0.06842816402356891), (135, 0.06801108095247393), (133, 0.06655586963399417), (81, 0.066

CMI: 0.000502172586024946
CMI: 0.005694708624840422
CMI: 0.0008545745333239929
CMI: 0.0059906151674751495
CMI: 0.008658497360091194
CMI: 0.0011566543052717315
Highest CMI score: 0.009573141304141719
Adding original feature: 101
CMI: 3.133627546486606e-05
CMI: 0.0033410086374403847
CMI: 0.0033206444321957074
Highest CMI score: 0.0033410086374403847
Adding original feature: 113
CMI: 0.00015806725616368533
CMI: 0.0015869613634850177
CMI: 0.0008837175139006503
Highest CMI score: 0.0015869613634850177
Adding original feature: 100
CMI: 0.0006966666584330428
Highest CMI score: 0.0006966666584330428
Adding original feature: 136
CMI: 0.004917802118595438
Highest CMI score: 0.004917802118595438
Adding original feature: 74
Highest CMI score: -0.0016810224605749713

[38, 123, 93, 31, 68, 101, 113, 100, 136, 74]

Full aggregate regression train score: 0.5147474556658631, test score: 0.14645440245848917
Aggregate regression train score with FS: 0.2396924780361489, test score: 0.19243911998107788
