In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Show at most 50 rows when a DataFrame is displayed as a cell output
pd.options.display.max_rows = 50

In [2]:
# Root directory (relative to this notebook) of the pickled train/test splits
data_folder = '../Data/Training/pickle_data'

# Columns removed from the 2022 X frames before feature scoring
exclude_features = [
    'TIMESTAMP',
    'pit_number',
    'Redox_error_flag',
]

## 2022 data

In [3]:
# Get train and test data for 2022
# The pickle path is handed straight to pd.read_pickle so that pandas opens
# and closes the file itself; the previous open(..., 'rb') handles were never
# closed. `folder` holds the full path of the file being read.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# Train X
folder = f'{data_folder}/2022/X_train.pkl'
train_X = pd.read_pickle(folder)
# Keep every column that is not listed in exclude_features
train_X = train_X.loc[:, ~train_X.columns.isin(exclude_features)]
# Test X
folder = f'{data_folder}/2022/X_test.pkl'
test_X = pd.read_pickle(folder)
test_X = test_X.loc[:, ~test_X.columns.isin(exclude_features)]
# Train y
folder = f'{data_folder}/2022/y_train.pkl'
train_y = pd.read_pickle(folder)

### Feature selection

In [4]:
# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  48
Top 10 features Names:
	 ['Redox_Avg(2)_sigma_b_24', 'Redox_Avg(2)_sigma_f_24', 'Redox_Avg(3)_sigma_b_24', 'Redox_Avg(3)_sigma_f_24', 'Redox_Avg(4)_sigma_b_24', 'Redox_Avg(4)_sigma_f_24', 'Redox_Avg(5)_sigma_b_24', 'Redox_Avg(5)_sigma_f_24', 'Redox_Avg(3)_sigma_f_12', 'Redox_Avg(4)_sigma_b_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
36,Redox_Avg(5)_sigma_b_24,37785.75981,0.0
34,Redox_Avg(4)_sigma_b_24,37514.940848,0.0
37,Redox_Avg(5)_sigma_f_24,36051.098122,0.0
35,Redox_Avg(4)_sigma_f_24,35833.798843,0.0
32,Redox_Avg(3)_sigma_b_24,35499.949963,0.0
33,Redox_Avg(3)_sigma_f_24,34869.160166,0.0
31,Redox_Avg(2)_sigma_f_24,32303.661227,0.0
30,Redox_Avg(2)_sigma_b_24,30880.686566,0.0
44,Redox_Avg(4)_sigma_b_12,29524.1691,0.0
43,Redox_Avg(3)_sigma_f_12,29488.149863,0.0


## 2022_sensors data

In [5]:
# Per-sensor frames keyed as 'train_X_<n>', 'test_X_<n>' and 'train_y_<n>'
sensor_data = dict()

# Get train and test data for 2022_sensors
# NOTE: unpickling can execute arbitrary code - only load files you trust.
for sensor in range(1,6):
    # Columns kept for this sensor: its own readings plus the shared
    # (sensor-independent) water level, temperature and battery columns
    features = [f'Redox_Avg({sensor})', f'EC_Avg({sensor})', f'Matric_potential_Avg({sensor})', f'Temp_T12_Avg({sensor})',
                    'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', f'WC{sensor}',
                    f'Redox_Avg({sensor})_sigma_b_24', f'Redox_Avg({sensor})_sigma_f_24',
                    f'Redox_Avg({sensor})_sigma_b_12', f'Redox_Avg({sensor})_sigma_f_12']

    # The path is passed directly to pd.read_pickle so pandas opens and closes
    # the file; the previous open(..., 'rb') handles were never closed.
    # Train X
    folder = f'{data_folder}/2022_sensors/X_train_sensor_{sensor}.pkl'
    train_X = pd.read_pickle(folder)
    train_X = train_X.loc[:,features]
    # Test X
    folder = f'{data_folder}/2022_sensors/X_test_sensor_{sensor}.pkl'
    test_X = pd.read_pickle(folder)
    test_X = test_X.loc[:,features]
    # Train y
    folder = f'{data_folder}/2022_sensors/y_train_sensor_{sensor}.pkl'
    train_y = pd.read_pickle(folder)

    sensor_data[f'train_X_{sensor}'] = train_X
    sensor_data[f'test_X_{sensor}'] = test_X
    # Backward-compatible alias: the test frame used to be stored only under
    # this misspelled key, so it is kept for any code that still reads it.
    sensor_data[f'tets_X_{sensor}'] = test_X
    sensor_data[f'train_y_{sensor}'] = train_y

### Sensor 1

In [6]:
# Sensor 1: pull its training frame and labels out of the per-sensor store
train_X, train_y = sensor_data['train_X_1'], sensor_data['train_y_1']

# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  12
Top 10 features Names:
	 ['EC_Avg(1)', 'Matric_potential_Avg(1)', 'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', 'WC1', 'Redox_Avg(1)_sigma_b_24', 'Redox_Avg(1)_sigma_f_24', 'Redox_Avg(1)_sigma_b_12', 'Redox_Avg(1)_sigma_f_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
8,Redox_Avg(1)_sigma_b_24,4495.23934,0.0
11,Redox_Avg(1)_sigma_f_12,3911.708658,0.0
9,Redox_Avg(1)_sigma_f_24,3778.678981,0.0
10,Redox_Avg(1)_sigma_b_12,3770.28276,0.0
1,EC_Avg(1),3548.809454,0.0
4,Water_level_Avg,3476.641388,0.0
5,Temp_ottpls_Avg,1595.607131,0.0
2,Matric_potential_Avg(1),850.799652,3.186549e-186
6,BatterymV_Min,793.68017,6.55052e-174
7,WC1,320.564813,1.427359e-71


### Sensor 2

In [7]:
# Sensor 2: pull its training frame and labels out of the per-sensor store
train_X, train_y = sensor_data['train_X_2'], sensor_data['train_y_2']

# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  12
Top 10 features Names:
	 ['Redox_Avg(2)', 'EC_Avg(2)', 'Matric_potential_Avg(2)', 'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', 'Redox_Avg(2)_sigma_b_24', 'Redox_Avg(2)_sigma_f_24', 'Redox_Avg(2)_sigma_b_12', 'Redox_Avg(2)_sigma_f_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
9,Redox_Avg(2)_sigma_f_24,32418.505639,0.0
8,Redox_Avg(2)_sigma_b_24,30883.173054,0.0
11,Redox_Avg(2)_sigma_f_12,28562.323056,0.0
10,Redox_Avg(2)_sigma_b_12,27132.647302,0.0
0,Redox_Avg(2),23616.452709,0.0
4,Water_level_Avg,12407.256341,0.0
6,BatterymV_Min,6819.797537,0.0
5,Temp_ottpls_Avg,3224.664039,0.0
2,Matric_potential_Avg(2),2619.342862,0.0
1,EC_Avg(2),1417.654918,5.489453e-308


### Sensor 3

In [8]:
# Sensor 3: pull its training frame and labels out of the per-sensor store
train_X, train_y = sensor_data['train_X_3'], sensor_data['train_y_3']

# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  12
Top 10 features Names:
	 ['Redox_Avg(3)', 'EC_Avg(3)', 'Matric_potential_Avg(3)', 'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', 'Redox_Avg(3)_sigma_b_24', 'Redox_Avg(3)_sigma_f_24', 'Redox_Avg(3)_sigma_b_12', 'Redox_Avg(3)_sigma_f_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
8,Redox_Avg(3)_sigma_b_24,35499.949963,0.0
9,Redox_Avg(3)_sigma_f_24,34869.160166,0.0
11,Redox_Avg(3)_sigma_f_12,29488.149863,0.0
10,Redox_Avg(3)_sigma_b_12,29372.650393,0.0
4,Water_level_Avg,12419.345558,0.0
6,BatterymV_Min,6833.546186,0.0
5,Temp_ottpls_Avg,3236.152214,0.0
0,Redox_Avg(3),3099.415461,0.0
2,Matric_potential_Avg(3),1764.837164,0.0
1,EC_Avg(3),429.024342,4.282659e-95


### Sensor 4

In [9]:
# Sensor 4: pull its training frame and labels out of the per-sensor store
train_X, train_y = sensor_data['train_X_4'], sensor_data['train_y_4']

# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  12
Top 10 features Names:
	 ['Redox_Avg(4)', 'EC_Avg(4)', 'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', 'WC4', 'Redox_Avg(4)_sigma_b_24', 'Redox_Avg(4)_sigma_f_24', 'Redox_Avg(4)_sigma_b_12', 'Redox_Avg(4)_sigma_f_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
8,Redox_Avg(4)_sigma_b_24,37514.940848,0.0
9,Redox_Avg(4)_sigma_f_24,35833.798843,0.0
10,Redox_Avg(4)_sigma_b_12,29524.1691,0.0
11,Redox_Avg(4)_sigma_f_12,28993.701297,0.0
4,Water_level_Avg,12419.345558,0.0
6,BatterymV_Min,6833.546186,0.0
5,Temp_ottpls_Avg,3236.152214,0.0
1,EC_Avg(4),1868.917452,0.0
0,Redox_Avg(4),1003.04506,5.3015130000000005e-219
7,WC4,728.81649,6.438412e-160


### Sensor 5

In [10]:
# Sensor 5: pull its training frame and labels out of the per-sensor store
train_X, train_y = sensor_data['train_X_5'], sensor_data['train_y_5']

# Score every input column against the labels with f_classif and keep the 10 best
selector = SelectKBest(f_classif, k=10).fit(train_X, np.ravel(train_y))

print('Number of input features: ', selector.n_features_in_)

# One row per input feature (name, score, p-value), highest score first
feature_scores = {
    'Input features Names': list(selector.feature_names_in_),
    'Input features scores': list(selector.scores_),
    'Input features pvalues': list(selector.pvalues_),
}
df = pd.DataFrame(feature_scores).sort_values(by=['Input features scores'], ascending=False)

# get_feature_names_out() lists the selected names in input-column order, not by score
print('Top 10 features Names:\n\t', list(selector.get_feature_names_out()))

df

Number of input features:  12
Top 10 features Names:
	 ['EC_Avg(5)', 'Matric_potential_Avg(5)', 'Water_level_Avg', 'Temp_ottpls_Avg', 'BatterymV_Min', 'WC5', 'Redox_Avg(5)_sigma_b_24', 'Redox_Avg(5)_sigma_f_24', 'Redox_Avg(5)_sigma_b_12', 'Redox_Avg(5)_sigma_f_12']


Unnamed: 0,Input features Names,Input features scores,Input features pvalues
8,Redox_Avg(5)_sigma_b_24,37785.75981,0.0
9,Redox_Avg(5)_sigma_f_24,36051.098122,0.0
10,Redox_Avg(5)_sigma_b_12,28935.228681,0.0
11,Redox_Avg(5)_sigma_f_12,28459.880133,0.0
4,Water_level_Avg,12419.345558,0.0
7,WC5,7417.736448,0.0
6,BatterymV_Min,6833.546186,0.0
1,EC_Avg(5),4941.956136,0.0
2,Matric_potential_Avg(5),4051.378508,0.0
5,Temp_ottpls_Avg,3236.152214,0.0


# Initial results

In [4]:
# Read both stored result tables from the ./pack directory
results_csv, results_1_csv = map(
    pd.read_csv,
    ('./pack/Results.csv', './pack/Results_1.csv'),
)

In [16]:
# Column names of the first results table, as a plain Python list
results_csv.columns.tolist()

['Unnamed: 0',
 'mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_C',
 'param_degree',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'split6_test_score',
 'split7_test_score',
 'split8_test_score',
 'split9_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score',
 'split0_train_score',
 'split1_train_score',
 'split2_train_score',
 'split3_train_score',
 'split4_train_score',
 'split5_train_score',
 'split6_train_score',
 'split7_train_score',
 'split8_train_score',
 'split9_train_score',
 'mean_train_score',
 'std_train_score']

In [20]:
# Order the runs by rank_test_score (rank 1 first) and show only the
# timing, parameter and score summary columns
results_csv.sort_values(by='rank_test_score')[[
    'mean_fit_time',
    'std_fit_time',
    'mean_score_time',
    'std_score_time',
    'params',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
    'mean_train_score',
]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,mean_train_score
32,39.147348,60.206797,0.697735,0.063821,"{'C': 6, 'degree': 5}",0.999891,0.000123,1,0.999913
37,16.884655,1.032047,0.628651,0.036295,"{'C': 7, 'degree': 5}",0.999883,0.000118,2,0.999913
27,20.478305,1.66503,0.795193,0.060853,"{'C': 5, 'degree': 5}",0.999876,0.000126,3,0.999897
38,20.077468,1.637983,0.731186,0.035285,"{'C': 7, 'degree': 6}",0.999876,9.8e-05,4,0.9999
36,16.776244,0.952237,0.606928,0.048837,"{'C': 7, 'degree': 4}",0.999869,0.000125,5,0.999893
31,18.846284,1.011086,0.644017,0.035454,"{'C': 6, 'degree': 4}",0.999861,0.000124,6,0.999879
26,20.658843,1.531004,0.722552,0.037715,"{'C': 5, 'degree': 4}",0.99984,0.00013,7,0.999867
33,21.550211,1.238632,0.808091,0.034756,"{'C': 6, 'degree': 6}",0.999796,8.5e-05,8,0.999828
35,18.319232,1.641818,0.623619,0.028464,"{'C': 7, 'degree': 3}",0.999789,0.000147,9,0.999802
30,19.75101,0.722692,0.702813,0.057918,"{'C': 6, 'degree': 3}",0.999781,0.000142,10,0.999793


In [18]:
# Order the runs by rank_test_score (rank 1 first) and show only the
# timing, parameter and score summary columns.
# NOTE(review): this cell repeats the previous summary of results_csv, while
# results_1_csv is loaded but never displayed - confirm which table was intended.
results_csv.sort_values(by='rank_test_score')[[
    'mean_fit_time',
    'std_fit_time',
    'mean_score_time',
    'std_score_time',
    'params',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
    'mean_train_score',
]]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,mean_train_score
32,39.147348,60.206797,0.697735,0.063821,"{'C': 6, 'degree': 5}",0.999891,0.000123,1,0.999913
37,16.884655,1.032047,0.628651,0.036295,"{'C': 7, 'degree': 5}",0.999883,0.000118,2,0.999913
27,20.478305,1.66503,0.795193,0.060853,"{'C': 5, 'degree': 5}",0.999876,0.000126,3,0.999897
38,20.077468,1.637983,0.731186,0.035285,"{'C': 7, 'degree': 6}",0.999876,9.8e-05,4,0.9999
36,16.776244,0.952237,0.606928,0.048837,"{'C': 7, 'degree': 4}",0.999869,0.000125,5,0.999893
31,18.846284,1.011086,0.644017,0.035454,"{'C': 6, 'degree': 4}",0.999861,0.000124,6,0.999879
26,20.658843,1.531004,0.722552,0.037715,"{'C': 5, 'degree': 4}",0.99984,0.00013,7,0.999867
33,21.550211,1.238632,0.808091,0.034756,"{'C': 6, 'degree': 6}",0.999796,8.5e-05,8,0.999828
35,18.319232,1.641818,0.623619,0.028464,"{'C': 7, 'degree': 3}",0.999789,0.000147,9,0.999802
30,19.75101,0.722692,0.702813,0.057918,"{'C': 6, 'degree': 3}",0.999781,0.000142,10,0.999793
