In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GroupKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Custom libraries
import index_helpers as ih

In [2]:
## Load the segmented data and run it through the index_helpers preparation
## steps (innermost call runs first): merge -> index by person ->
## float-to-int categoricals -> dummy encoding
df = ih.categorical_to_dummy(
    ih.categorical_float_to_int(
        ih.index_df_by_person(
            ih.read_and_merge_segmented_data()
        )
    )
)

## Separate the "Label" column from the features and split on the index
X_train, X_val, y_train, y_val = ih.train_test_split_on_index(
    features=df.drop(columns="Label"),
    label=df["Label"],
)

## Prepare data for GroupKFold:
## keep the 'File_Name_split' index level as the group key *before* the index
## is discarded, then give X_train / y_train a plain positional index
groups = y_train.reset_index()['File_Name_split']
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

In [5]:
## Preview the first rows of df (the frame produced by the load/index/dummy steps)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Label,Zero_Crossing_Rate,RMS_Power,Dominant_Freq,Spectral_Centroid,Spectral_Rolloff,Spectral_Spread,Spectral_Skewness,Spectral_Kurtosis,Spectral_Bandwidth,...,EEPD950_1000_6,EEPD950_1000_7,EEPD950_1000_8,EEPD950_1000_9,EEPD950_1000_10,EEPD950_1000_11,EEPD950_1000_12,EEPD950_1000_13,EEPD950_1000_15,EEPD950_1000_17
File_Name_split,File_n_recording,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
008ba489-31ad-44d8-856b-fcf72369dc46,0,1,0.160222,0.128937,0.03125,1773.765792,3415.0,1422.780346,0.758778,2.332231,262068.236944,...,0,0,0,0,0,0,0,0,0,0
008ba489-31ad-44d8-856b-fcf72369dc46,1,1,0.157047,0.092217,0.0625,1801.097694,2501.0,1486.201986,0.657349,2.000908,183434.937753,...,0,0,0,0,0,0,0,0,0,0
008ba489-31ad-44d8-856b-fcf72369dc46,2,1,0.154893,0.075413,0.046875,1589.223922,2522.0,1294.383203,1.037579,3.070197,137156.195637,...,0,0,0,0,0,0,0,0,0,0
008ba489-31ad-44d8-856b-fcf72369dc46,3,1,0.150417,0.049969,0.046875,1476.036555,2273.0,1301.461039,1.146937,3.091304,101036.32178,...,0,0,0,0,0,0,0,0,0,0
008ba489-31ad-44d8-856b-fcf72369dc46,4,1,0.151335,0.049786,0.050781,1478.032136,2298.0,1299.889388,1.138023,3.063961,101436.817158,...,0,0,0,0,0,0,0,0,0,0


In [3]:
### Linear Discriminant Analysis

## Make pipeline - name classifier "clf"
clf_pipeline = Pipeline([("st_scaler", StandardScaler()),
                         ("clf", LinearDiscriminantAnalysis())])

## Use "clf__" in order to correctly assign parameters to the clf object
## NOTE: per the scikit-learn documentation, `tol` is only used by the 'svd'
## solver, so candidates that differ only in tol under 'lsqr'/'eigen' are
## equivalent models and will score identically.
clf_param_grid = {'clf__tol': [0.0003, 0.0002, 0.0001, 0.00005],
                  'clf__solver': ['svd', 'lsqr', 'eigen']}

## Number of distinct candidates in the (finite, list-only) search space
n_candidates = 1
for options in clf_param_grid.values():
    n_candidates *= len(options)

## Instantiate GroupKFold to avoid data leakage - to be passed to cv
gkf = GroupKFold(n_splits=10)

## Set up Randomized search CV
## - n_iter is capped at the size of the search space: asking for more
##   iterations than there are candidates only triggers a warning and is
##   silently truncated by scikit-learn.
## - random_state fixes the sampling order so the search (and the winner among
##   tied candidates) is reproducible across runs.
clf_rand_auc = RandomizedSearchCV(estimator=clf_pipeline,
                                  param_distributions=clf_param_grid,
                                  cv=gkf, scoring='roc_auc', verbose=1, n_jobs=2,
                                  n_iter=min(50, n_candidates),
                                  random_state=42)

## Perform Group K-Cross-validation
clf_rand_auc.fit(X_train, y_train, groups=groups)

## Print results
print("Best score: ",  clf_rand_auc.best_score_)
print("Best estimator: ", clf_rand_auc.best_estimator_)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   18.2s
[Parallel(n_jobs=2)]: Done 120 out of 120 | elapsed:   41.8s finished


Best score:  0.5470849141889804
Best estimator:  Pipeline(steps=[('st_scaler', StandardScaler()),
                ('clf', LinearDiscriminantAnalysis(solver='lsqr', tol=0.0003))])
