In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load dataset
radi = pd.read_csv("sbsppdaa24/train_radiomics_hipocamp.csv")

# Drop unique identifier columns
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Drop non-numeric columns except for 'Transition'
columns_to_drop = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Drop columns where all entries are the same
same_value_cols = [col for col in radi.columns if radi[col].nunique() == 1]
radi.drop(columns=same_value_cols, inplace=True)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Apply MinMax scaling to float columns
float_cols = radi.select_dtypes(include=['float','int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])
radi.info()

Dropped 16 non-numeric columns.
Dropped 148 columns with the same value for every entry.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(2013), object(1)
memory usage: 4.7+ MB


In [2]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

X = radi.drop(["Transition"], axis=1)
y = radi["Transition"]

# Define a range of k values to test
k_values = range(1, X.shape[1] + 1)  # From 1 to the total number of features
average_scores = []

# Iterate over each k
for k in k_values:
    # Select the top k features
    selector = SelectKBest(f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    
    # Evaluate a classifier (e.g., RandomForest) with cross-validation
    model = RandomForestClassifier(random_state=2025)
    scores = cross_val_score(model, X_new, y, cv=5, scoring='f1_weighted',n_jobs=-1,)
    print(f"For the k value of {k}, the score was: {np.mean(scores)}")
    
    # Store the average score for this k
    average_scores.append(np.mean(scores))

# Find the optimal k value with the best cross-validation score
optimal_k = k_values[np.argmax(average_scores)]
print(f"Optimal number of features (k): {optimal_k}")
print(f"Cross-Validation Scores for each k: {average_scores}")

# Select the columns for the best k value configuration
best_selector = SelectKBest(f_classif, k=optimal_k)
X_best = best_selector.fit_transform(X, y)
selected_columns = X.columns[best_selector.get_support()]
print("Selected columns for optimal k:", selected_columns)


For the k value of 1, the score was: 0.3428933587131432
For the k value of 2, the score was: 0.35702922428318173
For the k value of 3, the score was: 0.34493527258122875
For the k value of 4, the score was: 0.3325045386259678
For the k value of 5, the score was: 0.3362638120228074
For the k value of 6, the score was: 0.326560108100748
For the k value of 7, the score was: 0.32749566150676734
For the k value of 8, the score was: 0.33512484291028616
For the k value of 9, the score was: 0.3334783965405916
For the k value of 10, the score was: 0.3409846055868847
For the k value of 11, the score was: 0.33082913767945843
For the k value of 12, the score was: 0.32866516421938513
For the k value of 13, the score was: 0.33981097503570357
For the k value of 14, the score was: 0.3159455899634386
For the k value of 15, the score was: 0.3438870711952517
For the k value of 16, the score was: 0.33885790113482195
For the k value of 17, the score was: 0.3087879817445105
For the k value of 18, the score 

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score

# Estado vai ser comum para todos os modelos, 
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2025)
classification_reports = {}

In [12]:
# Apply the best selector directly to X to get the reduced feature set
X_best = best_selector.transform(X)  # Transformed data with optimal k features

# Create a new DataFrame with the selected columns
# Re-add the target column "Transition" to the reduced radi DataFrame
radi = radi[selected_columns].copy()
radi["Transition"] = y
# Display the updated DataFrame information
radi.info()

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Age
- Sex
- diagnostics_Image-original_Maximum
- diagnostics_Image-original_Mean
- exponential_firstorder_10Percentile
- ...


In [21]:
# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025, stratify=y)
rf_model = RandomForestClassifier(random_state=2025,criterion="gini",max_depth=5,n_estimators=500)

rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
classification_reports["RandomForest"] = classification_report(y_test, y_pred_rf, output_dict=True)
print(f"RandomForest Classification Report:\n", classification_report(y_test, y_pred_rf))

RandomForest Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.42      0.33      0.37        15
       CN-CN       0.54      0.79      0.64        24
      CN-MCI       0.00      0.00      0.00         3
      MCI-AD       0.29      0.29      0.29        17
     MCI-MCI       0.54      0.39      0.45        18

    accuracy                           0.47        77
   macro avg       0.36      0.36      0.35        77
weighted avg       0.44      0.47      0.44        77



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
from sklearn.svm import SVC

# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025, stratify=y)

# Initialize the SVC model
svm_model = SVC(random_state=2025,C=1000,kernel="poly",gamma="scale")
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
classification_reports["SVM"] = classification_report(y_test, y_pred_svm, output_dict=True)
print(f"SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.31      0.27      0.29        15
       CN-CN       0.48      0.54      0.51        24
      CN-MCI       0.00      0.00      0.00         3
      MCI-AD       0.11      0.12      0.11        17
     MCI-MCI       0.22      0.22      0.22        18

    accuracy                           0.30        77
   macro avg       0.22      0.23      0.23        77
weighted avg       0.29      0.30      0.29        77



In [27]:
# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025, stratify=y)

gbparams_model = GradientBoostingClassifier(learning_rate=0.3,random_state=2025,n_estimators=500)
gbparams_model.fit(X_train, y_train)
y_pred_gbparams = gbparams_model.predict(X_test)
classification_reports["GradientBoosting"] = classification_report(y_test, y_pred_gbparams, output_dict=True)
print(f"GradientBoosting Classification Report:\n", classification_report(y_test, y_pred_gbparams))


GradientBoosting Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.31      0.27      0.29        15
       CN-CN       0.55      0.71      0.62        24
      CN-MCI       0.00      0.00      0.00         3
      MCI-AD       0.24      0.24      0.24        17
     MCI-MCI       0.47      0.39      0.42        18

    accuracy                           0.42        77
   macro avg       0.31      0.32      0.31        77
weighted avg       0.39      0.42      0.40        77



In [32]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025, stratify=y)

meta_model = GradientBoostingClassifier(random_state=25)
meta_model2 = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=25)
meta_model3 = RandomForestClassifier(random_state=25)

estimators = [("gb", gbparams_model), ("svm",svm_model), ("rf", rf_model)]
st_model = StackingClassifier(estimators=estimators, final_estimator = meta_model3) 
st_model.fit(X_train, y_train)
st_predictions = st_model.predict(X_test)
classification_reports["Stacking"] = classification_report(y_test, st_predictions, output_dict=True)


print(f"Stacking Classification Report:\n", classification_report(y_test, st_predictions))

Stacking Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.39      0.47      0.42        15
       CN-CN       0.57      0.83      0.68        24
      CN-MCI       0.00      0.00      0.00         3
      MCI-AD       0.25      0.24      0.24        17
     MCI-MCI       0.50      0.22      0.31        18

    accuracy                           0.45        77
   macro avg       0.34      0.35      0.33        77
weighted avg       0.43      0.45      0.42        77



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Load the test dataset
test_data = pd.read_csv("sbsppdaa24/test_radiomics_hipocamp.csv")

# Apply the same preprocessing as in the training phase
# Drop unique identifier columns
test_data.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Ensure 'columns_to_drop' is available for test data
# If you haven't redefined this variable, you need to redo this step for the test set.
# Use the same method to identify non-numeric columns for dropping
non_numeric_columns = [col for col in test_data.columns if test_data[col].dtype == 'object']
test_data.drop(columns=non_numeric_columns, inplace=True)

# Apply the same MinMaxScaler that was fit on the training data
test_data[float_cols] = scaler.transform(test_data[float_cols])  # Correctly reference columns in test_data

# re-train stacking algorithm with full data!
test_data = test_data[selected_columns].copy()

# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split

estimators = [("gb", gbparams_model), ("svm",svm_model), ("rf", rf_model)]
st_model = StackingClassifier(estimators=estimators, final_estimator = meta_model3) 
st_model.fit(X,y)
st_predictions_full_data_test = st_model.predict(test_data)


rf_model.fit(X, y)
rf_predictions_full_data_test = rf_model.predict(test_data)


res0 = pd.DataFrame({
    'RowId': range(1, len(st_predictions_full_data_test) + 1),
    'Result': st_predictions_full_data_test 
})

res1 = pd.DataFrame({
    'RowId': range(1, len(rf_predictions_full_data_test) + 1),
    'Result': rf_predictions_full_data_test
})

# Save predictions to a CSV file
res0.to_csv('Stacking3ModelosFullData6.0.csv', index=False)
# Save predictions to a CSV file
res1.to_csv('RandomForestFullData6.0.csv', index=False)

