In [1]:
import pandas as pd

# Carga del CSV
df = pd.read_csv('../training.csv')  # ../ sube un nivel desde /SVM/

# Primer vistazo
print(df.shape)
df.head()


(250000, 33)


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [2]:
import numpy as np

# Codificar la variable objetivo: 's' → 1, 'b' → 0
df['Label'] = df['Label'].map({'s': 1, 'b': 0})

# Reemplazar -999.0 por NaN
df.replace(-999.0, np.nan, inplace=True)


In [3]:
# Eliminar columnas que no usaremos
df.drop(['EventId', 'Weight'], axis=1, inplace=True)

# Separar X (features) e y (target)
X = df.drop('Label', axis=1)
y = df['Label']


In [4]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Reducimos el conjunto de entrenamiento al 10% por tuning.

In [7]:
from sklearn.model_selection import train_test_split

# Reducimos el conjunto de entrenamiento al 10%
X_sub, _, y_sub, _ = train_test_split(
    X_train_scaled, y_train, train_size=0.10, stratify=y_train, random_state=42
)

print(f"Subconjunto para tuning: {X_sub.shape[0]} muestras")


Subconjunto para tuning: 20000 muestras


In [8]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import time

# Lista de configuraciones a probar
svm_configuraciones = [
    {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale'},
    {'kernel': 'rbf', 'C': 1.5, 'gamma': 0.7},
    {'kernel': 'rbf', 'C': 10.0, 'gamma': 0.1},
    {'kernel': 'poly', 'C': 1.0, 'gamma': 'scale'},
    {'kernel': 'poly', 'C': 1.5, 'gamma': 0.7},
    {'kernel': 'sigmoid', 'C': 1.0, 'gamma': 'scale'},
    {'kernel': 'linear', 'C': 1.0}
]

# Guardar resultados
resultados_sub = {}

for i, config in enumerate(svm_configuraciones):
    print(f"\n⏳ Entrenando SVM #{i+1}: {config}")
    inicio = time.time()
    
    modelo = SVC(**config, random_state=42)
    modelo.fit(X_sub, y_sub)
    y_pred = modelo.predict(X_test_scaled)  # ¡Seguimos evaluando en el test completo!
    
    fin = time.time()
    
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    resultados_sub[f"SVM_{i+1}_{config}"] = {
        'accuracy': acc,
        'recall_1': report['1']['recall'],
        'f1_1': report['1']['f1-score'],
        'tiempo': fin - inicio
    }

    print(f"✔️ Terminado en {fin - inicio:.2f} s - Accuracy: {acc:.4f}")



⏳ Entrenando SVM #1: {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale'}
✔️ Terminado en 1017.64 s - Accuracy: 0.8187

⏳ Entrenando SVM #2: {'kernel': 'rbf', 'C': 1.5, 'gamma': 0.7}
✔️ Terminado en 6575.92 s - Accuracy: 0.7093

⏳ Entrenando SVM #3: {'kernel': 'rbf', 'C': 10.0, 'gamma': 0.1}
✔️ Terminado en 9942.39 s - Accuracy: 0.7972

⏳ Entrenando SVM #4: {'kernel': 'poly', 'C': 1.0, 'gamma': 'scale'}
✔️ Terminado en 3659.59 s - Accuracy: 0.7858

⏳ Entrenando SVM #5: {'kernel': 'poly', 'C': 1.5, 'gamma': 0.7}
✔️ Terminado en 86312.32 s - Accuracy: 0.7657

⏳ Entrenando SVM #6: {'kernel': 'sigmoid', 'C': 1.0, 'gamma': 'scale'}
✔️ Terminado en 25.12 s - Accuracy: 0.6295

⏳ Entrenando SVM #7: {'kernel': 'linear', 'C': 1.0}
✔️ Terminado en 32.62 s - Accuracy: 0.7489


In [9]:
print("\n📊 Resumen tuning con 10% de datos:")
for nombre, res in resultados_sub.items():
    print(f"\nModelo: {nombre}")
    print(f"Accuracy: {res['accuracy']:.4f}")
    print(f"Recall clase 1: {res['recall_1']:.4f}")
    print(f"F1-score clase 1: {res['f1_1']:.4f}")
    print(f"Tiempo: {res['tiempo']:.2f} s")



📊 Resumen tuning con 10% de datos:

Modelo: SVM_1_{'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale'}
Accuracy: 0.8187
Recall clase 1: 0.6747
F1-score clase 1: 0.7184
Tiempo: 1017.64 s

Modelo: SVM_2_{'kernel': 'rbf', 'C': 1.5, 'gamma': 0.7}
Accuracy: 0.7093
Recall clase 1: 0.2578
F1-score clase 1: 0.3781
Tiempo: 6575.92 s

Modelo: SVM_3_{'kernel': 'rbf', 'C': 10.0, 'gamma': 0.1}
Accuracy: 0.7972
Recall clase 1: 0.6585
F1-score clase 1: 0.6899
Tiempo: 9942.39 s

Modelo: SVM_4_{'kernel': 'poly', 'C': 1.0, 'gamma': 'scale'}
Accuracy: 0.7858
Recall clase 1: 0.5449
F1-score clase 1: 0.6354
Tiempo: 3659.59 s

Modelo: SVM_5_{'kernel': 'poly', 'C': 1.5, 'gamma': 0.7}
Accuracy: 0.7657
Recall clase 1: 0.6517
F1-score clase 1: 0.6559
Tiempo: 86312.32 s

Modelo: SVM_6_{'kernel': 'sigmoid', 'C': 1.0, 'gamma': 'scale'}
Accuracy: 0.6295
Recall clase 1: 0.4522
F1-score clase 1: 0.4555
Tiempo: 25.12 s

Modelo: SVM_7_{'kernel': 'linear', 'C': 1.0}
Accuracy: 0.7489
Recall clase 1: 0.5442
F1-score clase 1: 0.