In [20]:
import numpy as np
from sklearn.model_selection import train_test_split
# Cargar el archivo
data = np.load("df-deepfake.npz")

# Acceder a los arrays
X = data['X']
y = data['y']



In [23]:
from sklearn.model_selection import train_test_split

# Paso 1: dividir en entrenamiento+validación y test
X, X_test, y, y_test = train_test_split(
    X, y, test_size=0.20, random_state=50, stratify=y  # stratify mantiene proporciones
)

# Paso 2: guardar el test para después
np.savez_compressed("deepfake_trainval.npz", X=X, y=y)
np.savez_compressed("deepfake_test.npz", X=X_test, y=y_test)

# Paso 3: continuar con entrenamiento/validación usando X_trainval, y_trainval
print(f"Train+Val shape: {X.shape}, Test shape: {X_test.shape}")


Train+Val shape: (1581, 40), Test shape: (396, 40)


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import numpy as np

# Cargar datos
data_train = np.load("deepfake_trainval.npz")
X = data_train['X']
y = data_train['y']

# Escalar manualmente
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Modelo base
lr = LogisticRegression(max_iter=1000)

# Definir grilla de hiperparámetros
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],             # Regularización
    'solver': ['lbfgs', 'liblinear'],         # Algoritmos
    'penalty': ['l2']                         # Tipo de penalización (solo 'l2' es compatible con lbfgs y liblinear)
}

# Métricas para evaluar
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'f1': 'f1'
}

# Grid Search con CV
grid = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5,
                    scoring=scoring, refit='f1', return_train_score=False)

# Entrenar
grid.fit(X_scaled, y)

# Resultados
print("🔍 Mejor combinación de hiperparámetros:", grid.best_params_)
print(f"📈 Mejor F1-score (CV): {grid.best_score_:.4f}")

# Puedes imprimir más métricas si deseas:
means = grid.cv_results_
print(f"✅ Accuracy medio: {means['mean_test_accuracy'][grid.best_index_]:.4f}")
print(f"✅ Recall medio:   {means['mean_test_recall'][grid.best_index_]:.4f}")


🔍 Mejor combinación de hiperparámetros: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
📈 Mejor F1-score (CV): 0.7362
✅ Accuracy medio: 0.6768
✅ Recall medio:   0.8235


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np

# Cargar datos
data_trainval = np.load("deepfake_trainval.npz")
X = data_trainval['X']
y = data_trainval['y']

# 1. Dividir en entrenamiento y validación (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 3. Crear y entrenar modelo con los hiperparámetros óptimos
lr = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs', max_iter=1000)
lr.fit(X_train_scaled, y_train)

# 4. Evaluar en entrenamiento y validación
print("🔹 Evaluación en TRAIN:")
print(classification_report(y_train, lr.predict(X_train_scaled)))

print("🔹 Evaluación en VALIDACIÓN:")
print(classification_report(y_val, lr.predict(X_val_scaled)))


🔹 Evaluación en TRAIN:
              precision    recall  f1-score   support

           0       0.70      0.51      0.59       571
           1       0.67      0.82      0.74       693

    accuracy                           0.68      1264
   macro avg       0.69      0.67      0.66      1264
weighted avg       0.69      0.68      0.67      1264

🔹 Evaluación en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.71      0.50      0.59       143
           1       0.67      0.83      0.74       174

    accuracy                           0.68       317
   macro avg       0.69      0.67      0.67       317
weighted avg       0.69      0.68      0.67       317



In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np

# Cargar datos
data_train = np.load("deepfake_trainval.npz")
X = data_train['X']
y = data_train['y']

# Escalar
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Modelo base
knn = KNeighborsClassifier()

# Grilla de hiperparámetros
param_grid = {
    'n_neighbors': [ 5, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Scoring múltiple
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'f1': 'f1'
}

# GridSearch con múltiples métricas
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',  # el mejor se elige por f1
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Entrenar
grid.fit(X_scaled, y)

# Resultados
print("🔍 Mejor combinación de hiperparámetros:", grid.best_params_)
print(f"📈 Mejor F1-score (CV): {grid.cv_results_['mean_test_f1'][grid.best_index_]:.4f}")
print(f"✅ Accuracy medio:       {grid.cv_results_['mean_test_accuracy'][grid.best_index_]:.4f}")
print(f"✅ Recall medio:         {grid.cv_results_['mean_test_recall'][grid.best_index_]:.4f}")


Fitting 5 folds for each of 16 candidates, totalling 80 fits
🔍 Mejor combinación de hiperparámetros: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
📈 Mejor F1-score (CV): 0.9751
✅ Accuracy medio:       0.9728
✅ Recall medio:         0.9712


In [49]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np

# Cargar datos
data_trainval = np.load("deepfake_trainval.npz")
X = data_trainval['X']
y = data_trainval['y']

# 1. Dividir en entrenamiento y validación (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 3. Crear y entrenar modelo
knn = KNeighborsClassifier(metric = 'euclidean', n_neighbors= 5, weights= 'distance')
knn.fit(X_train_scaled, y_train)

# 4. Evaluar en entrenamiento y validación
print("🔹 Evaluación en TRAIN:")
print(classification_report(y_train, knn.predict(X_train_scaled)))

print("🔹 Evaluación en VALIDACIÓN:")
print(classification_report(y_val, knn.predict(X_val_scaled)))


🔹 Evaluación en TRAIN:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       571
           1       1.00      1.00      1.00       693

    accuracy                           1.00      1264
   macro avg       1.00      1.00      1.00      1264
weighted avg       1.00      1.00      1.00      1264

🔹 Evaluación en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       143
           1       0.99      0.99      0.99       174

    accuracy                           0.99       317
   macro avg       0.99      0.99      0.99       317
weighted avg       0.99      0.99      0.99       317



Random Forest

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np

# Cargar datos
data_train = np.load("deepfake_trainval.npz")
X = data_train['X']
y = data_train['y']

# Escalar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Definir modelo base
rf = RandomForestClassifier(random_state=42)

# Definir la grilla de hiperparámetros
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'f1': 'f1'
}

# Crear GridSearchCV
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=scoring,  # también puedes usar 'recall' si priorizas eso
    cv=5,
    refit='f1',  # el mejor se elige por f1
    n_jobs=-1,
    verbose=1
)

# Ejecutar búsqueda
grid.fit(X_scaled, y)

# Resultados
print("🔍 Mejor combinación de hiperparámetros:", grid.best_params_)
print(f"📈 Mejor F1-score (CV): {grid.cv_results_['mean_test_f1'][grid.best_index_]:.4f}")
print(f"✅ Accuracy medio:       {grid.cv_results_['mean_test_accuracy'][grid.best_index_]:.4f}")
print(f"✅ Recall medio:         {grid.cv_results_['mean_test_recall'][grid.best_index_]:.4f}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
🔍 Mejor combinación de hiperparámetros: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
📈 Mejor F1-score (CV): 0.9696
✅ Accuracy medio:       0.9665
✅ Recall medio:         0.9723


In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np

# Cargar datos
data_trainval = np.load("deepfake_trainval.npz")
X = data_trainval['X']
y = data_trainval['y']

# 1. Dividir en entrenamiento y validación (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 3. Crear y entrenar modelo
rf = RandomForestClassifier(max_depth= 15, min_samples_leaf= 2, min_samples_split= 2, n_estimators = 100, random_state=42)
rf.fit(X_train_scaled, y_train)

# 4. Evaluar en entrenamiento y validación
print("🔹 Evaluación en TRAIN:")
print(classification_report(y_train, rf.predict(X_train_scaled)))

print("🔹 Evaluación en VALIDACIÓN:")
print(classification_report(y_val, rf.predict(X_val_scaled)))


🔹 Evaluación en TRAIN:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       571
           1       1.00      1.00      1.00       693

    accuracy                           1.00      1264
   macro avg       1.00      1.00      1.00      1264
weighted avg       1.00      1.00      1.00      1264

🔹 Evaluación en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       143
           1       0.98      0.99      0.99       174

    accuracy                           0.98       317
   macro avg       0.99      0.98      0.98       317
weighted avg       0.98      0.98      0.98       317



In [30]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

# Cargar datos
data_train = np.load("deepfake_trainval.npz")
X = data_train['X']
y = data_train['y']

data_test = np.load("deepfake_test.npz")
X_test = data_test['X']
y_test = data_test['y']

# Escalar datos (muy importante para SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Crear modelo SVM (con kernel RBF por defecto)
svm = SVC(kernel='rbf', C=2, gamma='scale', random_state=42)

# Validación cruzada (usando múltiples métricas)
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall', 
    'f1': 'f1'
}

results = cross_validate(svm, X_scaled, y, cv=5, scoring=scoring)

# Mostrar resultados promedio
print(f"Accuracy promedio (CV): {results['test_accuracy'].mean():.4f}")
print(f"Recall promedio (CV):   {results['test_recall'].mean():.4f}")
print(f"F1-score promedio (CV): {results['test_f1'].mean():.4f}")


Accuracy promedio (CV): 0.8823
Recall promedio (CV):   0.9169
F1-score promedio (CV): 0.8951


In [31]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np

# Cargar datos
data_trainval = np.load("deepfake_trainval.npz")
X = data_trainval['X']
y = data_trainval['y']

# 1. Dividir en entrenamiento y validación (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Escalar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 3. Crear y entrenar modelo
svm = SVC(kernel='rbf', C=2, gamma='scale', random_state=42)
svm.fit(X_train_scaled, y_train)

# 4. Evaluar en entrenamiento y validación
print("🔹 Evaluación en TRAIN:")
print(classification_report(y_train, svm.predict(X_train_scaled)))

print("🔹 Evaluación en VALIDACIÓN:")
print(classification_report(y_val, svm.predict(X_val_scaled)))


🔹 Evaluación en TRAIN:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       571
           1       0.87      0.92      0.90       693

    accuracy                           0.89      1264
   macro avg       0.89      0.88      0.88      1264
weighted avg       0.89      0.89      0.88      1264

🔹 Evaluación en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.90      0.83      0.86       143
           1       0.87      0.93      0.89       174

    accuracy                           0.88       317
   macro avg       0.88      0.88      0.88       317
weighted avg       0.88      0.88      0.88       317

