In [None]:
# Mount Google Drive so the notebook can read files stored under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [19]:
import warnings, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
# Raise the pandas display limits to 100 rows and 100 columns so that wide
# summaries are not truncated (larger frames are still abbreviated).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

In [20]:
# Maps every column key of the TESS Objects of Interest table to its
# human-readable description; used further down to print readable names for
# the columns that get dropped.
header = {
    "rowid": "rowid",
    "toi": "TESS Object of Interest",
    "toipfx": "TESS Object of Interest Prefix",
    "tid": "TESS Input Catalog ID",
    "ctoi_alias": "TESS Input Catalog of Interest Alias",
    "pl_pnum": "Pipeline Signal ID",
    "tfopwg_disp": "TFOPWG Dispostion (CP | FP | KP | PC)",
    # --- sky position ---
    "rastr": "RA [sexagesimal]",
    "ra": "RA [deg]",
    "raerr1": "RA Upper Unc",
    "raerr2": "RA Lower Unc",
    "decstr": "Dec [sexagesimal]",
    "dec": "Dec [deg]",
    "decerr1": "Dec Upper Unc",
    "decerr2": "Dec Lower Unc",
    # --- proper motion ---
    "st_pmra": "PMRA [mas/yr]",
    "st_pmraerr1": "PMRA Upper Unc [mas/yr]",
    "st_pmraerr2": "PMRA Lower Unc [mas/yr]",
    "st_pmralim": "PMRA Limit Flag",
    "st_pmrasymerr": "PMRA Symmetric Error Flag",
    "st_pmdec": "PMDec [mas/yr]",
    "st_pmdecerr1": "PMDec Upper Unc [mas/yr]",
    "st_pmdecerr2": "PMDec Lower Unc [mas/yr]",
    "st_pmdeclim": "PMDec Limit Flag",
    "st_pmdecsymerr": "PMDec Symmetric Error Flag",
    # --- planet / transit properties ---
    "pl_tranmid": "Planet Transit Midpoint Value [BJD]",
    "pl_tranmiderr1": "Planet Transit Midpoint Upper Unc [BJD]",
    "pl_tranmiderr2": "Planit Transit Midpoint Lower Unc [BJD]",
    "pl_tranmidlim": "Planet Transit Midpoint Limit Flag",
    "pl_tranmidsymerr": "Planet Transit Midpoint Symmetric Error Flag",
    "pl_orbper": "Planet Orbital Period Value [days]",
    "pl_orbpererr1": "Planet Orbital Period Upper Unc [days]",
    "pl_orbpererr2": "Planet Orbital Period Lower Unc [days]",
    "pl_orbperlim": "Planet Orbital Period Limit Flag",
    "pl_orbpersymerr": "Planet Orbital Period Symmetric Error Flag",
    "pl_trandurh": "Planet Transit Duration Value [hours]",
    "pl_trandurherr1": "Planet Transit Duration Upper Unc [hours]",
    "pl_trandurherr2": "Planet Transit Duration Lower Unc [hours]",
    "pl_trandurhlim": "Planet Transit Duration Limit Flag",
    "pl_trandurhsymerr": "Planet Transit Duration Symmetric Error Flag",
    "pl_trandep": "Planet Transit Depth Value [ppm]",
    "pl_trandeperr1": "Planet Transit Depth Upper Unc [ppm]",
    "pl_trandeperr2": "Planet Transit Depth Lower Unc [ppm]",
    "pl_trandeplim": "Planet Transit Depth Limit Flag",
    "pl_trandepsymerr": "Planet Transit Depth Symmetric Error Flag",
    "pl_rade": "Planet Radius Value [R_Earth]",
    "pl_radeerr1": "Planet Radius Upper Unc [R_Earth]",
    "pl_radeerr2": "Planet Radius Lower Unc [R_Earth]",
    "pl_radelim": "Planet Radius Limit Flag",
    "pl_radesymerr": "Planet Radius Symmetric Error Flag",
    "pl_insol": "Planet Insolation Value [Earth flux]",
    "pl_insolerr1": "Planet Insolation Upper Unc [Earth flux]",
    "pl_insolerr2": "Planet Insolation Lower Unc [Earth flux]",
    "pl_insollim": "Planet Insolation Limit Flag",
    "pl_insolsymerr": "Planet Insolation Symmetric Error Flag",
    "pl_eqt": "Planet Equilibrium Temperature Value [K]",
    "pl_eqterr1": "Planet Equilibrium Temperature Upper Unc [K]",
    "pl_eqterr2": "Planet Equilibrium Temperature Lower Unc [K]",
    "pl_eqtlim": "Planet Equilibrium Temperature Limit Flag",
    "pl_eqtsymerr": "Planet Equilibrium Temperature Symmetric Error Flag",
    # --- host star properties ---
    "st_tmag": "TESS Magnitude",
    "st_tmagerr1": "TESS Magnitude Upper Unc",
    "st_tmagerr2": "TESS Magnitude Lower Unc",
    "st_tmaglim": "TESS Magnitude Limit Flag",
    "st_tmagsymerr": "TESS Magnitude Symmetric Error Flag",
    "st_dist": "Stellar Distance [pc]",
    "st_disterr1": "Stellar Distance Upper Unc [pc]",
    "st_disterr2": "Stellar Distance Lower Unc [pc]",
    "st_distlim": "Stellar Distance Limit Flag",
    "st_distsymerr": "Stellar Distance Symmetric Error Flag",
    "st_teff": "Stellar Effective Temperature Value [K]",
    "st_tefferr1": "Stellar Effective Termperature Upper Unc [K]",
    "st_tefferr2": "Stellar Effective Temperature Lower Unc [K]",
    "st_tefflim": "Stellar Effective Temperature Limit Flag",
    "st_teffsymerr": "Stellar Effective Temperature Symmetric Error Flag",
    "st_logg": "Stellar log(g) Value [cm/s**2]",
    "st_loggerr1": "Stellar log(g) Upper Unc [cm/s**]",
    "st_loggerr2": "Stellar log(g) Lower Unc [cm/s**2]",
    "st_logglim": "Stellar log(g) Limit Flag",
    "st_loggsymerr": "Stellar log(g) Symmetric Error Flag",
    "st_rad": "Stellar Radius Value [R_Sun]",
    "st_raderr1": "Stellar Radius Upper Unc [R_Sun]",
    "st_raderr2": "Stellar Radius Lower Unc [R_Sun]",
    "st_radlim": "Stellar Radius Limit Flag",
    "st_radsymerr": "Stellar Radius Symmetric Error Flag",
    # --- bookkeeping ---
    "toi_created": "TOI Created Date",
    "rowupdate": "Date Modified",
}
# Load the raw TESS Objects of Interest export; "#" is passed as the comment
# character so the commented preamble of the CSV is not parsed as data.
RAW_TOI_CSV = "/content/drive/MyDrive/Colab Notebooks/NSAC25/data/raw/tess_objects_of_interest.csv"
data = pd.read_csv(RAW_TOI_CSV, comment="#")
print(data.shape)

# TESS Follow-up Observing Program Working Group (TFOPWG) Disposition:
# APC = ambiguous planetary candidate
# CP = confirmed planet
# FA = false alarm
# FP = false positive
# KP = known planet
# PC = planetary candidate

(7703, 87)


In [None]:
# Hand-picked subset of the 87 available columns, grouped by what they describe.
tess_identification_columns = [
    "ctoi_alias", "pl_pnum", "tfopwg_disp",
    "ra", "dec", "st_pmra", "st_pmdec",
]
planetary_properties = [
    "pl_tranmid", "pl_orbper", "pl_trandurh", "pl_trandep",
    "pl_rade", "pl_insol", "pl_eqt",
]
stellar_properties = ["st_tmag", "st_dist", "st_teff", "st_logg", "st_rad"]

# Identification first, then planetary, then stellar columns.
columns_kept = tess_identification_columns + planetary_properties + stellar_properties
data = data[columns_kept]
print(data.shape)

In [None]:
# Label encoding of the TFOPWG disposition into a binary target:
#   1 = planet      (KP = known planet, CP = confirmed planet)
#   0 = not a planet (FA = false alarm, FP = false positive)
# Undecided candidates (PC, APC) are discarded.
DISPOSITION_TO_LABEL = {'KP': 1, 'CP': 1, 'FA': 0, 'FP': 0}

# .copy() yields an independent frame, so the assignment below really lands in
# `data`. The previous chained-indexing assignment (data[col][mask] = value)
# only worked with warnings silenced and does nothing under pandas
# copy-on-write.
data = data[~data['tfopwg_disp'].isin(['PC', 'APC'])].copy()

# Values without a mapping pass through unchanged: re-running the cell on
# already-encoded 0/1 labels is a no-op, while an unexpected or missing
# disposition still fails loudly in astype() instead of being mislabeled.
data['tfopwg_disp'] = (
    data['tfopwg_disp']
    .map(lambda disposition: DISPOSITION_TO_LABEL.get(disposition, disposition))
    .astype('int64')
)
print(data.shape)

In [None]:
# Quick size report of the current frame.
n_rows, n_columns = data.shape
print(f"Formato do DataFrame: {data.shape}")  # (m_rows, n_columns)
print(f"Número de linhas: {n_rows}")
print(f"Número de colunas: {n_columns}")
print(f"Memória usada: {data.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

# Percentage of NaN values per column; a column is "bad" when more than 30% of
# its values are missing.
percentages_of_missing_data = data.isnull().mean() * 100
is_too_sparse = percentages_of_missing_data.values > 30
bad_columns = list(percentages_of_missing_data.index[is_too_sparse])

# Report the dropped columns by their human-readable description.
print("\nBAD COLUMNS (DROPPED):")
for column_key in bad_columns:
    print(header[column_key])

data = data.drop(columns=bad_columns)

In [None]:
# This cell was written with the help of Mistral AI's LeChat chatbot

# For every numeric column, draw its histogram (with KDE) together with two
# candidate outlier fences: Tukey's 1.5*IQR rule (red) and mean +/- 3 sigma (purple).
numeric_cols = data.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    series = data[col]

    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    mean, sigma = series.mean(), series.std()

    # (x position, line colour, legend label) for each vertical fence
    fences = (
        (q1 - 1.5*iqr, 'red', 'Limite Inferior IQR'),
        (q3 + 1.5*iqr, 'red', 'Limite Superior IQR'),
        (mean - 3 * sigma, 'purple', r'Limite Inferior $3\sigma$'),
        (mean + 3 * sigma, 'purple', r'Limite Superior $3\sigma$'),
    )

    plt.figure(figsize=(8, 4))
    sns.histplot(series, kde=True)
    for position, colour, legend_label in fences:
        plt.axvline(position, color=colour, linestyle='--', label=legend_label)

    plt.title(f"Distribuição de {col} com limites de outliers")
    plt.legend()
    plt.show()

In [None]:
# This cell was written with the help of Mistral AI's LeChat chatbot

# Removal of outliers with the 3-sigma rule: a row is kept only when every
# feature value lies strictly inside mean +/- 3 standard deviations.
#
# The rule is applied to the numeric feature columns only. Identifier columns
# and the binary target are excluded: they are not measurements, and a 3-sigma
# cut on an imbalanced 0/1 target can discard an entire class.
non_feature_columns = ['ctoi_alias', 'pl_pnum', 'tfopwg_disp']
outlier_columns = data.select_dtypes(include=[np.number]).columns.difference(non_feature_columns)

data_to_keep = np.ones(len(data), dtype=bool)
for col in outlier_columns:
    avg = data[col].mean()
    s_dev = data[col].std()
    lower, upper = avg - 3 * s_dev, avg + 3 * s_dev
    # NaN compares False against both bounds, so rows with a missing value in
    # any feature column are dropped here as well.
    temp = (data[col].values > lower) & (data[col].values < upper)
    data_to_keep = data_to_keep & temp
data = data[data_to_keep]

In [None]:
# The four cells below were written with the help of Mistral AI's LeChat chatbot

In [None]:
# Visual overview of where values are missing in the cleaned frame.
import missingno as msno
msno.matrix(data)  # Missing-values matrix plot

In [None]:
msno.bar(data)     # Bar chart

In [None]:
msno.heatmap(data)  # Shows the correlation between missing values

In [None]:
# Number of missing values left in each column.
data.isna().sum()

In [None]:
# The four cells above were written with the help of Mistral AI's LeChat chatbot

In [None]:
# Luckily for me, the outlier removal has also removed all remaining missing numbers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Separate features from targets
X = data.drop(['ctoi_alias', 'pl_pnum', 'tfopwg_disp'], axis=1)
y = data['tfopwg_disp']

# Train test split, done BEFORE scaling so that no statistic of the test rows
# leaks into the preprocessing. Same seed and stratification as before, so the
# rows assigned to each side are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 42
)

# Rescale features: mean and standard deviation are learned from the training
# rows only, then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset on the same scale, kept for any code that still reads X_scaled.
X_scaled = scaler.transform(X)

# Target balancing: synthetic minority samples are generated for the training
# set only; the test set keeps its natural class ratio.
smote = SMOTE(random_state = 42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# This cell is a handshake more than anything else, just because I was rusty from not coding in a long time

# Smoke test: a "forest" made of a single tree, trained on the balanced set.
rf_model = RandomForestClassifier(n_estimators=1, random_state=42)
rf_model.fit(X_train_res, y_train_res)

# Score it on the untouched test split.
y_pred_rf = rf_model.predict(X_test)

print("\nRandomForest - Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nRandomForest - Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Feature Importance (suggestion by Mistral AI's LeChat chatbot)
print("\nRandomForest - Feature Importance:")
importance_by_feature = dict(zip(X.columns, rf_model.feature_importances_))
for feature, importance in importance_by_feature.items():
    print(f"{feature}: {importance:.4f}")

In [None]:
import time
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Handshake-y, written with the help of Mistral AI's LeChat chatbot

# The same random forest at increasing ensemble sizes.
models = {
    f"Random Forest {n_trees}": RandomForestClassifier(random_state=42, n_estimators=n_trees)
    for n_trees in (1, 50, 100, 150, 200, 500)
}
# Candidates not benchmarked here:
#   "XGBoost": XGBClassifier(random_state=42)
#   "LightGBM": LGBMClassifier(random_state=42)

results = {}

for name, model in models.items():
    print(f"\nTreinando {name}...")

    # Wall-clock time of the fit alone.
    start_time = time.time()
    model.fit(X_train_res, y_train_res)
    training_time = time.time() - start_time

    # Predictions and metrics on the held-out split.
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    confusion = confusion_matrix(y_test, y_pred)

    # Keep everything needed for the summary table below.
    results[name] = dict(
        training_time=training_time,
        classification_report=report,
        confusion_matrix=confusion,
        model=model,
    )

    print(f"Tempo de treinamento: {training_time:.2f} segundos")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion)

# One summary row per trained model; the class-1 metrics are read from the
# "1" entry of the report dictionary.
summary_rows = []
for name, res in results.items():
    positive_class = res["classification_report"]["1"]
    summary_rows.append({
        "Modelo": name,
        "Tempo de Treinamento (s)": res["training_time"],
        "Acurácia": res["classification_report"]["accuracy"],
        "Precision (Classe 1)": positive_class["precision"],
        "Recall (Classe 1)": positive_class["recall"],
        "F1-Score (Classe 1)": positive_class["f1-score"],
    })
results_df = pd.DataFrame(summary_rows)

print("\nResumo dos Resultados:")
results_df


In [None]:
import pickle

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Written with the help of Mistral AI's LeChat chatbot

# Hyperparameter space sampled by the randomized search.
param_dist = dict(
    n_estimators=randint(30, 500),
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 10),
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
)

# 200 random draws, each scored with 2-fold cross-validation on the balanced training set.
rf = RandomForestClassifier(random_state=42, oob_score=True)
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=200,
    cv=2,
    random_state=42,
    n_jobs=-1,
)
rf_search.fit(X_train_res, y_train_res)

best_rf = rf_search.best_estimator_
print(f"Melhores parâmetros: {rf_search.best_params_}")

# Persist both the full search object and the refitted winner.
artifacts = (
    ('rf_search_models.pkl', rf_search),
    ('best_random_forest_model.pkl', best_rf),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)

# Evaluate the winner on the held-out split.
y_pred = best_rf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# This cell was written BY Mistral AI's LeChat chatbot, and it ran on the first try, look at that

from scipy.stats import randint, uniform
from xgboost import XGBClassifier

# Search space for the XGBoost hyperparameters.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(30, 500),        # number of trees
    max_depth=randint(3, 15),             # maximum tree depth
    learning_rate=uniform(0.01, 0.3),     # learning rate
    subsample=uniform(0.6, 0.4),          # fraction of samples used per tree
    colsample_bytree=uniform(0.6, 0.4),   # fraction of features used per tree
    gamma=uniform(0, 0.5),                # minimum loss reduction required to split a node
    min_child_weight=randint(1, 10),      # minimum sum of sample weights in a child node
)

# Base XGBoost model.
xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',      # evaluation metric
    use_label_encoder=False,    # avoids label-encoding warnings
)

# Randomized search: 200 sampled combinations, 2-fold cross-validation,
# all CPU cores, progress printed.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=200,
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Train on the SMOTE-balanced data.
xgb_search.fit(X_train_res, y_train_res)

# Best model found.
best_xgb = xgb_search.best_estimator_
print(f"Melhores parâmetros: {xgb_search.best_params_}")

# Save the complete search object and, separately, the best model.
artifacts = (
    ('xgb_search_models.pkl', xgb_search),
    ('best_xgboost_model.pkl', best_xgb),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)

# Predictions on the held-out split.
y_pred = best_xgb.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# This cell was written BY Mistral AI's LeChat chatbot, and it ran on the first try, look at that
from sklearn.svm import SVC

# Search space for the SVM hyperparameters.
# The five extra gamma candidates are drawn with a fixed seed: an unseeded
# rvs() call produced a different search space (and possibly a different
# "best" model) on every run, despite random_state=42 on the search itself.
param_dist = {
    'C': uniform(0.1, 10),          # Regularization parameter
    'gamma': ['scale', 'auto'] + list(uniform(0.001, 1).rvs(5, random_state=42)),  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'degree': randint(2, 5),       # Degree of the polynomial kernel (only applies when kernel='poly')
    'class_weight': ['balanced', None]  # Class balancing
}

# Base SVM model
svm = SVC(random_state=42, probability=True)  # probability=True to obtain probabilities

# RandomizedSearchCV for SVM optimization
svm_search = RandomizedSearchCV(
    svm,
    param_distributions=param_dist,
    n_iter=50,  # Number of combinations to try
    cv=2,        # Number of cross-validation folds
    random_state=42,
    n_jobs=-1,   # Use every CPU core
    verbose=1   # Show progress
)

# Train on the SMOTE-balanced data
svm_search.fit(X_train_res, y_train_res)

# Best model
best_svm = svm_search.best_estimator_
print(f"Melhores parâmetros: {svm_search.best_params_}")

# Save the complete RandomizedSearchCV object
with open('svm_search_models.pkl', 'wb') as file:
    pickle.dump(svm_search, file)

# Save the best model
with open('best_svm_model.pkl', 'wb') as file:
    pickle.dump(best_svm, file)

# Predictions on the held-out split
y_pred = best_svm.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Saving the processed data, just in case: one pickle plus two CSV exports
# (without and with the index column).
with open('tess_clean_data.pkl', 'wb') as pickle_handle:
    pickle.dump(data, pickle_handle)

csv_exports = (
    ('tess_clean_data_no_index.csv', False),
    ('tess_clean_data_yes_index.csv', True),
)
for csv_path, keep_index in csv_exports:
    data.to_csv(csv_path, index=keep_index)

In [None]:
# Comparing feature importance for each model.
from sklearn.inspection import permutation_importance

# Tree ensembles expose importances directly through feature_importances_.
print("RF:")
for feature, importance in zip(X.columns, best_rf.feature_importances_):
    print(f"{feature}: {importance:.4f}")

print("XGB:")
for feature, importance in zip(X.columns, best_xgb.feature_importances_):
    print(f"{feature}: {importance:.4f}")

# SVC has no feature_importances_ attribute (reading it raises AttributeError),
# so its importances are estimated by permutation on the held-out split: the
# mean drop in score when one feature column is shuffled. These values are not
# on the same scale as the tree importances above, so compare rankings rather
# than magnitudes.
print("SVC:")
svm_permutation = permutation_importance(
    best_svm, X_test, y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1
)
for feature, importance in zip(X.columns, svm_permutation.importances_mean):
    print(f"{feature}: {importance:.4f}")

In [None]:
# Actually re-training the models for the web application, since inputting all 16 would be tedious
# This cell, as well as the next one, is just ctrl+c ctrl+v of previous ones
import pickle

reduced_features = ['st_dist', 'pl_rade', 'pl_eqt', 'pl_trandep', 'pl_insol']

# Separate features from targets
X = data[reduced_features]
y = data['tfopwg_disp']

# Train test split, done BEFORE scaling so that no statistic of the test rows
# leaks into the preprocessing. Same seed and stratification as before, so the
# rows assigned to each side are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 42
)

# Normalize the features: the scaler is fit on the training rows only and then
# applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset on the same scale, kept for any code that still reads X_scaled.
X_scaled = scaler.transform(X)

# The models below are trained on scaled inputs, so whatever serves them has
# to apply this exact scaler to raw user input before calling predict().
# NOTE(review): confirm the web application loads this file.
with open('scaler_reduced_features.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Balance the targets in the training set only
smote = SMOTE(random_state = 42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


import pickle

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Random forest search on the reduced feature set (same search space as the
# full-feature run).
param_dist = dict(
    n_estimators=randint(30, 500),
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 10),
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
)

rf = RandomForestClassifier(random_state=42, oob_score=True)
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=200,
    cv=2,
    random_state=42,
    n_jobs=-1,
)
rf_search.fit(X_train_res, y_train_res)

best_rf = rf_search.best_estimator_
print(f"Melhores parâmetros: {rf_search.best_params_}")

# Persist both the full search object and the refitted winner.
artifacts = (
    ('rf_search_models_reduced_features.pkl', rf_search),
    ('best_random_forest_model_reduced_features.pkl', best_rf),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)





from scipy.stats import randint, uniform
from xgboost import XGBClassifier

# XGBoost search on the reduced feature set.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(30, 500),        # number of trees
    max_depth=randint(3, 15),             # maximum tree depth
    learning_rate=uniform(0.01, 0.3),     # learning rate
    subsample=uniform(0.6, 0.4),          # fraction of samples used per tree
    colsample_bytree=uniform(0.6, 0.4),   # fraction of features used per tree
    gamma=uniform(0, 0.5),                # minimum loss reduction required to split a node
    min_child_weight=randint(1, 10),      # minimum sum of sample weights in a child node
)

# Base XGBoost model.
xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',      # evaluation metric
    use_label_encoder=False,    # avoids label-encoding warnings
)

# Randomized search: 200 sampled combinations, 2-fold cross-validation,
# all CPU cores, progress printed.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=200,
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Train on the SMOTE-balanced data.
xgb_search.fit(X_train_res, y_train_res)

# Best model found.
best_xgb = xgb_search.best_estimator_
print(f"Melhores parâmetros: {xgb_search.best_params_}")

# Save the complete search object and, separately, the best model.
artifacts = (
    ('xgb_search_models_reduced_features.pkl', xgb_search),
    ('best_xgboost_model_reduced_features.pkl', best_xgb),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)




from sklearn.svm import SVC

# Search space for the SVM hyperparameters (reduced feature set).
# The five extra gamma candidates are drawn with a fixed seed: an unseeded
# rvs() call produced a different search space (and possibly a different
# exported model) on every run, despite random_state=42 on the search itself.
param_dist = {
    'C': uniform(0.1, 10),          # Regularization parameter
    'gamma': ['scale', 'auto'] + list(uniform(0.001, 1).rvs(5, random_state=42)),  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'degree': randint(2, 5),       # Degree of the polynomial kernel (only applies when kernel='poly')
    'class_weight': ['balanced', None]  # Class balancing
}

# Base SVM model
svm = SVC(random_state=42, probability=True)  # probability=True to obtain probabilities

# RandomizedSearchCV for SVM optimization
svm_search = RandomizedSearchCV(
    svm,
    param_distributions=param_dist,
    n_iter=200,  # Number of combinations to try
    cv=2,        # Number of cross-validation folds
    random_state=42,
    n_jobs=-1,   # Use every CPU core
    verbose=1   # Show progress
)

# Train on the SMOTE-balanced data
svm_search.fit(X_train_res, y_train_res)

# Best model
best_svm = svm_search.best_estimator_
print(f"Melhores parâmetros: {svm_search.best_params_}")

# Save the complete RandomizedSearchCV object
with open('svm_search_models_reduced_features.pkl', 'wb') as file:
    pickle.dump(svm_search, file)

# Save the best model
with open('best_svm_model_reduced_features.pkl', 'wb') as file:
    pickle.dump(best_svm, file)

In [None]:
# Evaluate the three reduced-feature models on the same held-out split.
# Each entry is (banner printed before the report, fitted model).
evaluation_queue = (
    ("RANDOM FORESTS", best_rf),
    ("\nXGB", best_xgb),
    ("\nSVM", best_svm),
)

for banner, fitted_model in evaluation_queue:
    # Predictions
    y_pred = fitted_model.predict(X_test)
    print(banner)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))