In [47]:
from preprocessing import preprocess 
from cols_trie import gen_trie

import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [48]:
df_train = pd.read_csv('datasets/train_radiomics_hipocamp.csv')
df_test = pd.read_csv('datasets/test_radiomics_hipocamp.csv')

X,y,le = preprocess(df_train,mode="all")
df_train = pd.concat([X, pd.Series(y.values.ravel(), name='Transition')], axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2013 entries, diagnostics_Image-original_Mean to Age
dtypes: float64(2013)
memory usage: 4.7 MB


In [49]:
cols_trie = gen_trie(X.columns)

important = {"Age", "Sex","diagnostics","Transition"}
target = "Transition"
important_cols = [list(filter(lambda x:x.startswith(g), df_train.columns)) for g in important]
important_cols = [item for sublist in important_cols for item in sublist]
groups = set(cols_trie.keys()) - (important | {target})

grouped = [list(filter(lambda x:x.startswith(g), df_train.columns)) for g in groups]

In [50]:
dfs = [pd.concat([df_train[g],df_train[list(important_cols)]],axis=1) for g in grouped]
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 99 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   log-sigma-2-0-mm-3D_firstorder_10Percentile                    305 non-null    float64
 1   log-sigma-2-0-mm-3D_firstorder_90Percentile                    305 non-null    float64
 2   log-sigma-2-0-mm-3D_firstorder_Energy                          305 non-null    float64
 3   log-sigma-2-0-mm-3D_firstorder_Entropy                         305 non-null    float64
 4   log-sigma-2-0-mm-3D_firstorder_InterquartileRange              305 non-null    float64
 5   log-sigma-2-0-mm-3D_firstorder_Kurtosis                        305 non-null    float64
 6   log-sigma-2-0-mm-3D_firstorder_Maximum                         305 non-null    float64
 7   log-sigma-2-0-mm-3D_firstorder_MeanAbsoluteDeviation          

In [102]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score,f1_score

def train_svm_voting(df_list, target_column, test_size=0.2, random_state=42):
    classifiers = []
    weights = []
    models = []
    for i, df in enumerate(df_list):
        X = df.drop(columns=[target_column])
        y = df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        
        svm = SVC(
                probability=True,
                random_state=random_state,
                C=5,
                gamma='auto',
                kernel='rbf'
                )
        svm.fit(X_train, y_train)

        y_pred = svm.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        weights.append(f1)
        classifiers.append((f'svm_{i} acc:{accuracy} f1:{f1}', svm))
        models.append({'model':svm,'accuracy':accuracy,'f1':f1,'cols':df.columns})
    # Aggregate results via voting
    voting_clf = VotingClassifier(estimators=classifiers, voting='soft',weights=weights)
    voting_clf.fit(X_train, y_train)
    
    y_pred = voting_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted') 
    return accuracy,f1,voting_clf,models, weights

_,_,voting_clf,models, _ = train_svm_voting(dfs,target)
m = max(*models,key=lambda x:x['f1'])
m

{'model': SVC(C=5, gamma='auto', probability=True, random_state=42),
 'accuracy': 0.6065573770491803,
 'f1': 0.5979655528354835,
 'cols': Index(['lbp-3D-k_firstorder_10Percentile', 'lbp-3D-k_firstorder_90Percentile',
        'lbp-3D-k_firstorder_Energy', 'lbp-3D-k_firstorder_Entropy',
        'lbp-3D-k_firstorder_InterquartileRange',
        'lbp-3D-k_firstorder_Kurtosis', 'lbp-3D-k_firstorder_Maximum',
        'lbp-3D-k_firstorder_MeanAbsoluteDeviation', 'lbp-3D-k_firstorder_Mean',
        'lbp-3D-k_firstorder_Median', 'lbp-3D-k_firstorder_Minimum',
        'lbp-3D-k_firstorder_Range',
        'lbp-3D-k_firstorder_RobustMeanAbsoluteDeviation',
        'lbp-3D-k_firstorder_RootMeanSquared', 'lbp-3D-k_firstorder_Skewness',
        'lbp-3D-k_firstorder_TotalEnergy', 'lbp-3D-k_firstorder_Uniformity',
        'lbp-3D-k_firstorder_Variance', 'lbp-3D-k_glcm_Autocorrelation',
        'lbp-3D-k_glcm_ClusterProminence', 'lbp-3D-k_glcm_ClusterShade',
        'lbp-3D-k_glcm_ClusterTendency', 'lbp

In [98]:
svmModel =  m['model']
test_df = preprocess(df_test, mode='test')
m['cols']
cols = [col for col in m['cols'] if col != 'Transition']
#test_df[cols]
test_df
dt_predictions = voting_clf.predict(test_df)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- exponential_firstorder_10Percentile
- exponential_firstorder_90Percentile
- exponential_firstorder_Energy
- exponential_firstorder_Entropy
- exponential_firstorder_InterquartileRange
- ...


In [88]:
submission = pd.DataFrame(
    {'RowId': df_test.index + 1, 'Result': le.inverse_transform(dt_predictions)})

with open(f'good_submissions/customBagging.csv', 'w+') as file:
    submission.to_csv(file, index=False, sep=",")