# Selección de características

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
# Load the archive. np.load on an .npz returns a lazy NpzFile that keeps the
# underlying file open, so use it as a context manager to close it once the
# arrays have been read into memory.
with np.load("df-deepfake.npz") as data:
    # Arrays stored under the 'X' and 'y' keys (passed below as features/target)
    X = data['X']
    y = data['y']


In [7]:
from sklearn.feature_selection import SelectKBest, f_classif

def select_features_filter(X, y, k=20):
    """
    Keep the k best features according to the ANOVA F-score.

    A ``SelectKBest`` selector scored with ``f_classif`` is fitted on (X, y).

    Returns
    -------
    tuple
        ``(X_new, mask)``: the output of the selector's ``fit_transform`` and
        the boolean array from ``get_support()`` (True for selected features).
    """
    kbest = SelectKBest(k=k, score_func=f_classif)
    reduced = kbest.fit_transform(X, y)
    # True marks the features that were kept
    support = kbest.get_support()
    return reduced, support

In [8]:
# Run the ANOVA filter with k=30; the name holds the (X_new, mask) tuple
# returned by select_features_filter, not just the reduced matrix.
selected_features = select_features_filter(X, y, k=30)

In [9]:
# Display the (X_new, mask) tuple produced by select_features_filter
selected_features

(array([[3.58127338e+02, 7.64220534e+02, 4.74715244e+02, ...,
         9.45720244e-01, 9.08914183e-01, 9.47606244e-01],
        [4.84644377e+02, 9.80036828e+02, 5.87257936e+02, ...,
         9.29161886e-01, 8.86432234e-01, 9.34714009e-01],
        [3.13229946e+02, 8.59019964e+02, 5.78034756e+02, ...,
         9.32667262e-01, 8.83714246e-01, 9.41328429e-01],
        ...,
        [3.26513103e+02, 8.35851882e+02, 5.82992618e+02, ...,
         9.38265695e-01, 8.92597529e-01, 9.39915353e-01],
        [3.98067360e+02, 8.05987724e+02, 4.81166400e+02, ...,
         9.46045659e-01, 9.14929533e-01, 9.49735037e-01],
        [4.84276575e+02, 9.52319735e+02, 5.24121001e+02, ...,
         9.27139397e-01, 8.89208181e-01, 9.33101618e-01]], shape=(3090, 30)),
 array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True,  True,  True, False,  True, False, False, False,
        False, False, False, False

In [11]:
from sklearn.decomposition import PCA

def reduce_with_pca(X, n_components=10):
    """
    Reduce X with PCA.

    Parameters
    ----------
    X : array-like
        Data passed to ``PCA.fit_transform``.
    n_components : int, default=10
        Forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    tuple
        ``(X_pca, pca)``: the transformed data and the fitted ``PCA`` object.
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(X)
    # Return the fitted estimator too so callers can inspect or reuse it
    return projected, reducer

In [12]:
# Reduce X to 10 components; the (X_pca, pca) result is only displayed, not stored
reduce_with_pca(X, n_components=10)

(array([[-4.16075890e+02,  8.02054262e+01,  2.58948942e+01, ...,
          8.22883989e-03,  1.85002198e-01,  8.21821659e-02],
        [ 1.78981277e+02,  2.02768010e+02,  7.27199395e+01, ...,
          4.66522281e-02, -5.31345825e-02,  1.29485764e-01],
        [-2.47484923e+02, -1.27687311e+02,  6.91881946e+01, ...,
         -2.27480776e-02,  2.33863348e-01, -7.84853500e-02],
        ...,
        [-2.31862975e+02, -1.10801526e+02, -7.93898875e+00, ...,
          4.84975392e-02, -7.65950315e-02, -1.52540707e-01],
        [-3.20983384e+02,  2.08837522e+02,  3.84651356e+01, ...,
          1.38133294e-01, -3.00070751e-01, -5.56594664e-02],
        [ 9.74067010e+01,  3.62343270e+02,  7.84854836e+01, ...,
          1.68096375e-02, -6.02792751e-02,  1.80201287e-01]],
       shape=(3090, 10)),
 PCA(n_components=10))

In [18]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def select_features_rfe(X, y, k=20, random_state=42):
    """
    Select k features with recursive feature elimination (RFE) on a random forest.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``RFE.fit_transform``.
    y : array-like
        Target values passed to ``RFE.fit_transform``.
    k : int, default=20
        Number of features to keep (``n_features_to_select``).
    random_state : int or None, default=42
        Seed for the ``RandomForestClassifier``. A fixed seed makes the
        selection reproducible between runs; pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_new, mask, ranked_features)``: the output of ``fit_transform``,
        the boolean array from ``get_support()``, and the selector's
        ``ranking_`` attribute.
    """
    # Seed the forest: its feature importances drive the elimination, so an
    # unseeded model can select different features on every run.
    model = RandomForestClassifier(random_state=random_state)
    # step=1 removes one feature per elimination round
    selector = RFE(model, n_features_to_select=k, step=1)
    X_new = selector.fit_transform(X, y)
    mask = selector.get_support()
    ranked_features = selector.ranking_
    return X_new, mask, ranked_features

In [20]:
# Run RFE keeping 20 features; the (X_new, mask, ranked_features) result is
# only displayed, not stored
select_features_rfe(X, y, k=20)

(array([[3.58127338e+02, 4.74715244e+02, 7.37666997e+02, ...,
         9.47606244e-01, 9.29100799e-01, 9.08914183e-01],
        [4.84644377e+02, 5.87257936e+02, 9.03223882e+02, ...,
         9.34714009e-01, 9.03669889e-01, 8.86432234e-01],
        [3.13229946e+02, 5.78034756e+02, 7.48522227e+02, ...,
         9.41328429e-01, 9.28756020e-01, 8.83714246e-01],
        ...,
        [3.26513103e+02, 5.82992618e+02, 8.13516337e+02, ...,
         9.39915353e-01, 9.32640740e-01, 8.92597529e-01],
        [3.98067360e+02, 4.81166400e+02, 7.50874574e+02, ...,
         9.49735037e-01, 9.23412653e-01, 9.14929533e-01],
        [4.84276575e+02, 5.24121001e+02, 8.74390911e+02, ...,
         9.33101618e-01, 8.89948027e-01, 8.89208181e-01]], shape=(3090, 20)),
 array([ True, False,  True,  True, False, False,  True, False,  True,
         True, False, False,  True, False,  True, False,  True, False,
        False, False,  True, False, False, False, False,  True,  True,
        False, False, False,  True

In [None]:
# One row per distinct (uuid, tag) pair, with a fresh 0..n-1 index.
# NOTE(review): `labels` is not defined in any cell shown here; presumably a
# DataFrame loaded elsewhere. Confirm it exists before this cell runs.
labelsTemp = (
    labels[['uuid', 'tag']]
    .drop_duplicates()
    .reset_index(drop=True)
)

labelsTemp

In [None]:
labelsTrain = labels.loc[labels['uuid'].isin(train_df['uuid'])].reset_index(drop=True)

labelsTest = labels.loc[labels['uuid'].isin(test_df['uuid'])].reset_index(drop=True)

print("Train shape:", labelsTrain.shape)
print("Test shape:", labelsTest.shape)
print("Train tag distribution:\n", labelsTrain['tag'].value_counts(normalize=True))
print("Test tag distribution:\n", labelsTest['tag'].value_counts(normalize=True))

In [None]:
from sklearn.model_selection import train_test_split


# Stratified split: 80% train, 20% test, using 'tag' as the stratification label
train_df, test_df = train_test_split(
    labelsTemp,
    test_size=0.2,
    random_state=42,
    stratify=labelsTemp['tag']
)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Train tag distribution:\n", train_df['tag'].value_counts(normalize=True))
print("Test tag distribution:\n", test_df['tag'].value_counts(normalize=True))