# HR Analytics - Data Preprocessing Pipeline



In [1]:
import numpy as np
import sys
import os

# Add src folder to path
sys.path.append('../src')

from data_processing import (
    preprocess_train_dataset, 
    preprocess_test_dataset,
    save_processed_data
)



## Step 1: Preprocess Training Data

Xử lý training data và lưu artifacts (statistics, categories) để sử dụng cho test data

In [2]:
# Step 1 cell: run the training preprocessing pipeline and summarize its output.

# Input/output locations, relative to the notebook's working directory.
train_raw_path = '../data/raw/aug_train.csv'
artifacts_path = '../data/processed/artifacts.json'
train_processed_path = '../data/processed/train_processed.csv'

# Make sure the output folders exist before anything is written to them, so the
# notebook also runs on a fresh checkout where the processed-data folder has not
# been created yet (in case the helpers do not create it themselves).
# exist_ok=True makes this a no-op when the folder is already present.
for out_path in (artifacts_path, train_processed_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:  # dirname is '' for a bare filename; nothing to create then
        os.makedirs(out_dir, exist_ok=True)

print("PREPROCESSING TRAINING DATA")

# save_artifacts=True asks the helper to write its artifacts to artifacts_path;
# the test-preprocessing cell later passes that same path back in.
training_data = preprocess_train_dataset(
    filepath=train_raw_path,
    save_artifacts=True,
    artifacts_path=artifacts_path
)

# Unpack the pieces of the returned bundle that the rest of the notebook uses.
X_train = training_data['X']
y_train = training_data['y']
feature_names = training_data['feature_names']

print(f"\n{'='*60}")
print("TRAINING DATA SUMMARY")
print(f"Feature matrix shape: {X_train.shape}")
print(f"Target vector shape: {y_train.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"\nFeature names ({len(feature_names)} total):")
for i, name in enumerate(feature_names, 1):  # 1-based numbering for display
    print(f"  {i:2d}. {name}")

# Report (but do not alter) missing / non-finite values left after preprocessing.
print("\nData quality check:")
print(f"  NaN values in X: {np.isnan(X_train).sum()}")
print(f"  Inf values in X: {np.isinf(X_train).sum()}")
print(f"  NaN values in y: {np.isnan(y_train).sum()}")

# Class balance of the target: count and percentage per distinct label value.
unique, counts = np.unique(y_train, return_counts=True)
print("\nTarget distribution:")
for val, count in zip(unique, counts):
    pct = 100 * count / len(y_train)
    print(f"  Class {int(val)}: {count:5d} ({pct:.1f}%)")

PREPROCESSING TRAINING DATA
Loading ../data/raw/aug_train.csv...
  19,158 samples
Standardizing numeric features...
Encoding categorical features...
One-hot encoding...
Engineering features...
Building feature matrix...
Artifacts → ../data/processed/artifacts.json
Done! X: (19158, 33), y: (19158,), features: 33

TRAINING DATA SUMMARY
Feature matrix shape: (19158, 33)
Target vector shape: (19158,)
Number of features: 33

Feature names (33 total):
   1. cdi_scaled
   2. hours_scaled
   3. gender
   4. rel_exp
   5. exp_bin
   6. comp_size
   7. last_job
   8. uni_Full time course
   9. uni_Part time course
  10. uni_no_enrollment
  11. edu_Graduate
  12. edu_High School
  13. edu_Masters
  14. edu_Phd
  15. edu_Primary School
  16. maj_Arts
  17. maj_Business Degree
  18. maj_Humanities
  19. maj_No Major
  20. maj_Other
  21. maj_STEM
  22. ctype_Early Stage Startup
  23. ctype_Funded Startup
  24. ctype_NGO
  25. ctype_Other
  26. ctype_Public Sector
  27. ctype_Pvt Ltd
  28. exp_relev

## Step 2: Save Training Data

Lưu processed training data thành CSV file

In [3]:
# Step 2 cell: hand the training matrix, labels and column names to the
# project's save helper, which writes them to train_processed_path.
print("Saving processed training data...")
save_processed_data(
    filepath=train_processed_path,
    feature_names=feature_names,
    X=X_train,
    y=y_train,
)
print("Training data saved successfully!")

Saving processed training data...
Saved ../data/processed/train_processed.csv ((19158, 34))
Training data saved successfully!


## Step 3: Preprocess Test Data

Xử lý test data sử dụng artifacts từ training (mean, std, categories) để đảm bảo consistency

In [4]:
# Step 3 cell: preprocess the test set with the artifacts saved during training,
# summarize the result, and verify it lines up with the training features.

test_raw_path = '../data/raw/aug_test.csv'
test_processed_path = '../data/processed/test_processed.csv'
print("PREPROCESSING TEST DATA")

# artifacts_path is the file the training-preprocessing cell asked the helper to write.
test_bundle = preprocess_test_dataset(
    filepath=test_raw_path,
    artifacts_path=artifacts_path
)

# Unpack the pieces of the returned bundle that the rest of the notebook uses.
X_test = test_bundle['X']
test_feature_names = test_bundle['feature_names']
enrollee_ids = test_bundle['enrollee_ids']

print(f"\n{'='*60}")
print("TEST DATA SUMMARY")
print(f"Feature matrix shape: {X_test.shape}")
print(f"Number of features: {len(test_feature_names)}")
print(f"Enrollee IDs shape: {enrollee_ids.shape}")
print("\nData quality check:")
print(f"  NaN values in X: {np.isnan(X_test).sum()}")
print(f"  Inf values in X: {np.isinf(X_test).sum()}")

# Fail fast on train/test misalignment. These checks run after the summary so
# the diagnostic output above is still visible when one of them trips.
# A mismatch in column names or order would otherwise go unnoticed and feed
# misaligned features to whatever consumes the processed files.
if list(test_feature_names) != list(feature_names):
    raise ValueError(
        "Test feature names do not match training feature names: "
        f"{len(test_feature_names)} test vs {len(feature_names)} train columns"
    )
# Each row of X_test must have exactly one enrollee id to be saved alongside it.
if len(enrollee_ids) != X_test.shape[0]:
    raise ValueError(
        f"Got {len(enrollee_ids)} enrollee ids for {X_test.shape[0]} test rows"
    )



PREPROCESSING TEST DATA
Loading ../data/raw/aug_test.csv...
  2,129 samples
Standardizing with train stats...
Encoding...
One-hot encoding...
Engineering...
Done! X: (2129, 33), expected: 33

TEST DATA SUMMARY
Feature matrix shape: (2129, 33)
Number of features: 33
Enrollee IDs shape: (2129,)

Data quality check:
  NaN values in X: 0
  Inf values in X: 0


## Step 4: Save Test Data

Lưu processed test data (với enrollee_ids để tạo submission file)

In [5]:
# Step 4 cell: write the processed test features through the project's save
# helper. No labels are passed (y=None); the enrollee ids are passed along with
# the feature matrix.
print("Saving processed test data...")
save_processed_data(
    filepath=test_processed_path,
    feature_names=test_feature_names,
    enrollee_ids=enrollee_ids,
    X=X_test,
    y=None,
)
print("  Test data saved successfully!")

Saving processed test data...
Saved ../data/processed/test_processed.csv ((2129, 34))
  Test data saved successfully!
