In [29]:
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
import h5py
import warnings
warnings.filterwarnings('ignore')

def cargar_datos(path, is_train=True):
    with h5py.File(path, 'r') as f:
        body_acc_x = f['body_acc_x'][:]
        body_acc_y = f['body_acc_y'][:]
        body_acc_z = f['body_acc_z'][:]
        body_gyro_x = f['body_gyro_x'][:]
        body_gyro_y = f['body_gyro_y'][:]
        body_gyro_z = f['body_gyro_z'][:]
        total_acc_x = f['total_acc_x'][:]
        total_acc_y = f['total_acc_y'][:]
        total_acc_z = f['total_acc_z'][:]
        
        if is_train:
            labels = f['y'][:]
        else:
            labels = None

    time_steps = body_acc_x.shape[1]
    
    if is_train:
        print("Dimensiones de los datos:")
        print(f"Número de muestras: {body_acc_x.shape[0]}")
        print(f"Pasos de tiempo: {time_steps}")
        print(f"Número de etiquetas: {labels.shape[0]}")
        
        labels_repeated = np.repeat(labels, time_steps)
    
    data = pd.DataFrame({
        'body_acc_x': body_acc_x.flatten(),
        'body_acc_y': body_acc_y.flatten(),
        'body_acc_z': body_acc_z.flatten(),
        'body_gyro_x': body_gyro_x.flatten(),
        'body_gyro_y': body_gyro_y.flatten(),
        'body_gyro_z': body_gyro_z.flatten(),
        'total_acc_x': total_acc_x.flatten(),
        'total_acc_y': total_acc_y.flatten(),
        'total_acc_z': total_acc_z.flatten()
    })
    
    if is_train:
        data['label'] = labels_repeated
    
    data['time'] = np.tile(np.arange(time_steps), data.shape[0] // time_steps)
    data['id'] = np.repeat(np.arange(data.shape[0] // time_steps), time_steps)
    
    return data

train_data = cargar_datos('train.h5', is_train=True)

test_data = cargar_datos('test.h5', is_train=False)
fc_parameters = MinimalFCParameters()

def extract_features_in_chunks(data, chunk_size=1000, n_jobs=1):
    all_features = []
    for i in range(0, data['id'].nunique(), chunk_size):
        chunk_ids = range(i, min(i + chunk_size, data['id'].nunique()))
        chunk_data = data[data['id'].isin(chunk_ids)]
        
        features_chunk = extract_features(
            chunk_data,
            column_id='id',
            column_sort='time',
            default_fc_parameters=fc_parameters,
            n_jobs=n_jobs 
        )
        all_features.append(features_chunk)
        print(f"procesado chunk {i//chunk_size + 1} de {(data['id'].nunique() + chunk_size - 1)//chunk_size}")
    
    return pd.concat(all_features)

features_train = extract_features_in_chunks(train_data, n_jobs=1)
features_test = extract_features_in_chunks(test_data, n_jobs=1)
if 'label' in train_data.columns:
    labels_train = train_data['label'].iloc[::128].reset_index(drop=True)
    labels_train.to_pickle('labels_train.pkl')

features_train.to_pickle('features_train.pkl')
features_test.to_pickle('features_test.pkl')


Dimensiones de los datos:
Número de muestras: 7352
Pasos de tiempo: 128
Número de etiquetas: 7352


Feature Extraction: 100%|██████████| 10000/10000 [00:02<00:00, 3878.47it/s]


procesado chunk 1 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:02<00:00, 3731.45it/s]


procesado chunk 2 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:02<00:00, 3524.05it/s]


procesado chunk 3 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:04<00:00, 2272.39it/s]


procesado chunk 4 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:04<00:00, 2261.11it/s]


procesado chunk 5 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:03<00:00, 2745.43it/s]


procesado chunk 6 de 8


Feature Extraction: 100%|██████████| 10000/10000 [00:04<00:00, 2420.74it/s]


procesado chunk 7 de 8


Feature Extraction: 100%|██████████| 3520/3520 [00:01<00:00, 2115.43it/s]


procesado chunk 8 de 8


Feature Extraction: 100%|██████████| 9000/9000 [00:04<00:00, 2184.59it/s]


procesado chunk 1 de 3


Feature Extraction: 100%|██████████| 9000/9000 [00:03<00:00, 2651.72it/s]


procesado chunk 2 de 3


Feature Extraction: 100%|██████████| 8523/8523 [00:03<00:00, 2636.55it/s]


procesado chunk 3 de 3


In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

features_train = pd.read_pickle('features_train.pkl')
features_test = pd.read_pickle('features_test.pkl')
labels_train = pd.read_pickle('labels_train.pkl')

features_test = features_test.replace([np.inf, -np.inf], np.nan)
features_train = features_train.fillna(0)  
features_test = features_test.fillna(0)

features_test = features_test.reindex(columns=features_train.columns, fill_value=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(features_train)
X_test_scaled = scaler.transform(features_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, labels_train)

knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='euclidean',
    n_jobs=-1
)
knn.fit(X_train_resampled, y_train_resampled)

joblib.dump(knn, 'KNN_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
np.save('X_test_scaled.npy', X_test_scaled)

In [33]:
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

knn_model = joblib.load('KNN_model.pkl')
X_test_scaled = np.load('X_test_scaled.npy')

test_predictions = knn_model.predict(X_test_scaled)

submission = pd.DataFrame({
    'id': range(1, len(test_predictions) + 1),
    'activity': test_predictions
})

submission['activity'] = submission['activity'].astype(int)

submission_file = 'prediccionknn.csv'
submission.to_csv(submission_file, index=False, header=['ID', 'Activity'])