# 02 - Feature Engineering: Computer Price Prediction

## Simplified and Improved Pipeline

This notebook applies **simplified feature engineering** with:

1. **Data Quality Analysis** - Detect format issues, mixed types, and column groupings
2. **CPU/GPU Parsing** - Extract normalized keys and their components (CPU: brand, family, model code, suffix; GPU: brand, series, model number, integrated flag)
3. **Benchmark Matching** - Exact matching first, then fuzzy matching with scores
4. **Feature Extraction** - 18 engineered features for modeling

---

## 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import warnings

# Make the parent directory importable. The membership check keeps sys.path
# from accumulating duplicate '..' entries every time this cell is re-run.
if '..' not in sys.path:
    sys.path.append('..')

# Drop any cached copy of the features module so the import below re-reads it
# from disk (picks up edits without restarting the kernel).
for mod in ('src.features', 'features'):
    sys.modules.pop(mod, None)

from src.features import (
    cargar_datos, construir_features,
    analyze_format_issues, print_column_groups,
    parse_cpu_name, parse_gpu_name
)

# Notebook-wide display configuration: pandas options first, then plotting theme.
display_options = {
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

sns.set_theme(style='whitegrid')

# NOTE(review): this filter silences every warning category for the rest of
# the session, including deprecation notices from pandas — confirm that is intended.
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

## 2. Load Raw Data

In [None]:
# All raw inputs (and, later, the processed output) live under this directory.
DATA_DIR = Path('../data')

# cargar_datos receives the three CSV paths as plain strings, in this order:
# computers, CPU benchmarks, GPU benchmarks.
raw_paths = [
    str(DATA_DIR / filename)
    for filename in ('db_computers_2025_raw.csv', 'db_cpu_raw.csv', 'db_gpu_raw.csv')
]
df_comp, df_cpu, df_gpu = cargar_datos(*raw_paths)

print("\nDataset shapes:")
for label, frame in (
    ("Computers", df_comp),
    ("CPU benchmarks", df_cpu),
    ("GPU benchmarks", df_gpu),
):
    print(f"  {label}: {frame.shape}")

## 3. Data Quality Analysis

Detect format issues, mixed types, and column groupings.

In [None]:
# Run the format-issue analysis once per dataset, in the same order as before;
# the string is the dataset label handed to analyze_format_issues.
df_comp_issues, df_cpu_issues, df_gpu_issues = [
    analyze_format_issues(frame, label)
    for frame, label in (
        (df_comp, "Computers dataset"),
        (df_cpu, "CPU benchmarks"),
        (df_gpu, "GPU benchmarks"),
    )
]

In [None]:
# Columns of the computers dataset flagged as mixing numeric and text values
print("\n=== Computers: Columns with mixed types ===")
mixed_cols = df_comp_issues.loc[df_comp_issues['mixed_numeric_text'].eq(True)]
if mixed_cols.empty:
    print("No mixed type columns detected")
else:
    display(mixed_cols[['column', 'sample_text_values']].head(10))

# Columns where more than 100 rows were counted as multilabel, largest first
print("\n=== Computers: Columns with multilabel values ===")
multilabel = (
    df_comp_issues
    .loc[df_comp_issues['multilabel_rows'].gt(100)]
    .sort_values('multilabel_rows', ascending=False)
)
if multilabel.empty:
    print("No significant multilabel columns")
else:
    display(multilabel[['column', 'multilabel_rows']].head(10))

In [None]:
# Show column groups by prefix.
# Only the computers dataset is inspected here; the grouping and the printing
# both happen inside print_column_groups (imported from src.features).
print_column_groups(df_comp)

## 4. CPU/GPU Parsing Preview

Test the parsing logic on sample data before running full feature engineering.

In [None]:
# Test CPU parsing on sample data
print("=== CPU Parsing Examples ===")
cpu_names = df_comp['Procesador_Procesador'].dropna()
# Series.sample raises ValueError when asked for more rows than exist, so cap
# the request at the number of available names. With 10 or more names this
# draws exactly the same sample as a fixed n=10 (same random_state).
cpu_samples = cpu_names.sample(min(10, len(cpu_names)), random_state=42)

# Print the raw name next to each parsed component for a quick visual check.
for cpu in cpu_samples:
    parsed = parse_cpu_name(cpu)
    print(f"\nOriginal: {cpu}")
    print(f"  -> Key: {parsed['cpu_normalized_key']}")
    print(f"  -> Brand: {parsed['cpu_brand']}, Family: {parsed['cpu_family']}")
    print(f"  -> Model: {parsed['cpu_model_code']}, Suffix: {parsed['cpu_suffix']}")

In [None]:
# Test GPU parsing on sample data
print("=== GPU Parsing Examples ===")
# NOTE(review): the column name below looks mis-encoded (possibly
# 'Gráfica_Tarjeta gráfica'); it is left untouched because it must match
# df_comp.columns exactly — confirm against the loaded frame.
gpu_names = df_comp['Gr치fica_Tarjeta gr치fica'].dropna()
# Series.sample raises ValueError when asked for more rows than exist, so cap
# the request at the number of available names. With 10 or more names this
# draws exactly the same sample as a fixed n=10 (same random_state).
gpu_samples = gpu_names.sample(min(10, len(gpu_names)), random_state=21)

# Print the raw name next to each parsed component for a quick visual check.
for gpu in gpu_samples:
    parsed = parse_gpu_name(gpu)
    print(f"\nOriginal: {gpu}")
    print(f"  -> Key: {parsed['gpu_normalized_key']}")
    print(f"  -> Brand: {parsed['gpu_brand']}, Series: {parsed['gpu_series']}")
    print(f"  -> Model: {parsed['gpu_model_number']}, Integrated: {parsed['gpu_is_integrated']}")

## 5. Run Feature Engineering

In [None]:
# Announce the run, then build the engineered feature frame from the three raw inputs.
banner = "=" * 80
print(banner)
print("RUNNING FEATURE ENGINEERING")
print(banner)

n_listings = len(df_comp)
print(f"\nProcessing {n_listings:,} computer listings...\n")

df_feat = construir_features(df_comp, df_cpu, df_gpu)

print(f"\nDataframe shape: {df_feat.shape}")

In [None]:
# Engineered columns are the ones whose name starts with an underscore.
engineered = sorted(col for col in df_feat.columns if col.startswith('_'))
print(f"\nTotal engineered features: {len(engineered)}\n")

# One line per feature: how many rows have a value, and the share of all rows.
n_rows = len(df_feat)
for rank, name in enumerate(engineered, start=1):
    filled = df_feat[name].notna().sum()
    coverage = filled / n_rows * 100
    print(f"{rank:2d}. {name:35s}: {filled:5,}/{n_rows:,} ({coverage:5.1f}%)")

## 6. CPU/GPU Matching Analysis

In [None]:
# CPU matching summary
rule = "=" * 60
print(rule)
print("CPU BENCHMARK MATCHING")
print(rule)

# How each row was matched; rows with no strategy recorded are counted too.
print("\nMatch strategy distribution:")
strategy_counts = df_feat['cpu_match_strategy'].value_counts(dropna=False)
print(strategy_counts)

# Coverage by brand: share of rows per brand that have a benchmark score.
print("\nCoverage by CPU brand:")
for brand in ('intel', 'amd', 'apple', 'qualcomm'):
    is_brand = df_feat['cpu_brand'] == brand
    total = is_brand.sum()
    if total == 0:
        # Brand absent from the data: nothing to report.
        continue
    matched = df_feat.loc[is_brand, 'cpu_bench_mark'].notna().sum()
    print(f"  {brand.capitalize():10s}: {matched:4,}/{total:4,} ({matched/total*100:5.1f}%)")

In [None]:
# GPU matching summary
rule = "=" * 60
print(rule)
print("GPU BENCHMARK MATCHING")
print(rule)

# How each row was matched; rows with no strategy recorded are counted too.
print("\nMatch strategy distribution:")
strategy_counts = df_feat['gpu_match_strategy'].value_counts(dropna=False)
print(strategy_counts)

# Show discrete GPU coverage.
# The mask keeps every row whose flag is not True, so rows with a missing
# gpu_is_integrated value are counted as discrete as well.
# NOTE(review): confirm that including missing flags here is intended.
discrete_mask = df_feat['gpu_is_integrated'].ne(True)
total = discrete_mask.sum()
if total > 0:
    matched = df_feat.loc[discrete_mask, 'gpu_bench_mark'].notna().sum()
    print(f"\nDiscrete GPU coverage: {matched:,}/{total:,} ({matched/total*100:.1f}%)")

In [None]:
# Preview the first 15 rows that received a CPU benchmark match (exact or fuzzy).
print("\n=== Sample Matched CPUs ===")
cpu_preview_cols = [
    'Procesador_Procesador',
    'cpu_normalized_key',
    'cpu_bench_name',
    'cpu_bench_mark',
    'cpu_match_strategy',
    'cpu_match_score',
]
cpu_is_matched = df_feat['cpu_match_strategy'].isin(['exact', 'fuzzy'])
matched_cpus = df_feat.loc[cpu_is_matched, cpu_preview_cols].head(15)
display(matched_cpus)

In [None]:
# Preview the first 15 rows that received a GPU benchmark match (exact or fuzzy).
print("\n=== Sample Matched GPUs ===")
gpu_preview_cols = [
    'Gr치fica_Tarjeta gr치fica',
    'gpu_normalized_key',
    'gpu_bench_name',
    'gpu_bench_mark',
    'gpu_match_strategy',
    'gpu_match_score',
]
gpu_is_matched = df_feat['gpu_match_strategy'].isin(['exact', 'fuzzy'])
matched_gpus = df_feat.loc[gpu_is_matched, gpu_preview_cols].head(15)
display(matched_gpus)

## 7. Correlation Analysis

In [None]:
# Correlation of each numeric engineered feature with the price target.
numeric_feats = [
    '_ram_gb', '_ssd_gb', '_cpu_cores', '_gpu_memory_gb',
    '_cpu_mark', '_gpu_mark', '_tamano_pantalla_pulgadas',
    '_resolucion_pixeles', '_tasa_refresco_hz', '_peso_kg',
    '_num_ofertas', '_precio_num'
]

# Keep only the candidates that actually exist in the feature frame.
available = [name for name in numeric_feats if name in df_feat.columns]
price_column = df_feat[available].corr()['_precio_num']
# Remove the trivial self-correlation and rank from most positive to most negative.
corr = price_column.drop('_precio_num').sort_values(ascending=False)

rule = "=" * 60
print(rule)
print("CORRELATION WITH PRICE")
print(rule)
print(f"\n{'Feature':<35s} {'Correlation':>12s} {'Strength':>12s}")
print("-" * 60)

# Undefined (NaN) correlations are left out of the table; strength is judged
# on the absolute value, so strong negative relationships are labelled too.
for feat, c in corr.dropna().items():
    magnitude = abs(c)
    if magnitude >= 0.5:
        strength = "Strong"
    elif magnitude >= 0.3:
        strength = "Moderate"
    else:
        strength = "Weak"
    print(f"{feat:<35s} {c:>12.3f} {strength:>12s}")

In [None]:
# Visualize correlations: one horizontal bar per feature, green for positive
# and red for non-positive (or undefined) correlation with price.
fig, ax = plt.subplots(figsize=(10, 6))
bar_colors = ['green' if value > 0 else 'red' for value in corr]
corr.plot(kind='barh', ax=ax, color=bar_colors)
ax.set_xlabel('Correlation with Price')
ax.set_title('Feature Correlations with Price')

# Reference lines at |r| = 0.3 (orange) and |r| = 0.5 (red). They are mirrored
# around zero so negative correlations can be read against the same magnitude
# thresholds as positive ones.
for threshold, line_color in ((0.3, 'orange'), (0.5, 'red')):
    for position in (threshold, -threshold):
        ax.axvline(position, color=line_color, linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## 8. Feature Distributions

In [None]:
# Plot distributions of key features (only those present in the feature frame).
key_feats = ['_precio_num', '_ram_gb', '_ssd_gb', '_cpu_cores', '_cpu_mark', '_gpu_mark']
key_feats = [f for f in key_feats if f in df_feat.columns]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# zip stops at the shorter sequence, so at most one feature is drawn per panel.
for panel, feat in zip(axes, key_feats):
    data = df_feat[feat].dropna()
    if data.empty:
        # Nothing to draw for this feature: hide the panel instead of leaving
        # an empty set of axes in the figure.
        panel.set_visible(False)
        continue
    median = data.median()
    panel.hist(data, bins=50, edgecolor='black', alpha=0.7)
    panel.set_xlabel(feat)
    panel.set_title(f'Distribution of {feat}')
    panel.axvline(median, color='red', linestyle='--',
                  label=f'Median: {median:.1f}')
    panel.legend()

# Hide the panels left over when fewer features than grid cells are available.
for panel in axes[len(key_feats):]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

## 9. Missing Values Summary

In [None]:
# Missing-value count and percentage for every engineered (underscore-prefixed)
# column, sorted so the sparsest features come first.
eng_feats = [col for col in df_feat.columns if col.startswith('_')]
missing_counts = df_feat[eng_feats].isna().sum()
missing = pd.DataFrame({
    'Missing': missing_counts,
    'Missing %': (missing_counts / len(df_feat) * 100).round(1)
}).sort_values('Missing %', ascending=False)

print("=== Missing Values for Engineered Features ===")
display(missing)

## 10. Save Processed Dataset

In [None]:
# Keep only rows that have a price target; the copy detaches the result from df_feat.
n_before = len(df_feat)
print(f"Original size: {n_before:,} rows")
df_model = df_feat.loc[df_feat['_precio_num'].notna()].copy()
n_after = len(df_model)
print(f"After dropping missing prices: {n_after:,} rows")
print(f"Rows dropped: {n_before - n_after:,}")

# Save processed dataset - parquet is preferred; an ImportError from
# to_parquet triggers the CSV fallback.
try:
    output_path = DATA_DIR / 'db_features.parquet'
    df_model.to_parquet(output_path, index=False)
except ImportError:
    print("\nNote: pyarrow not installed, saving as CSV instead")
    output_path = DATA_DIR / 'db_features.csv'
    df_model.to_csv(output_path, index=False)

# Report whichever file was actually written.
size_mb = output_path.stat().st_size / 1024 / 1024
print(f"\nSaved to: {output_path}")
print(f"File size: {size_mb:.2f} MB")

n_engineered = sum(1 for col in df_model.columns if col.startswith('_'))
print(f"Rows: {len(df_model):,}")
print(f"Columns: {len(df_model.columns)}")
print(f"Engineered features: {n_engineered}")

## Summary

### Feature Engineering Complete!

**Key Improvements:**

1. **Simplified CPU/GPU Parsing** - Extract normalized keys and their components (CPU: brand, family, model code, suffix; GPU: brand, series, model number, integrated flag)
2. **Two-Stage Matching** - Exact match first, then fuzzy match with similarity scores
3. **Match Tracking** - `cpu_match_strategy` and `gpu_match_strategy` columns track how each row was matched
4. **Integrated GPU Handling** - Correctly identifies and skips integrated graphics

**Next Steps:**
- Load processed dataset in modeling notebook
- Build sklearn pipelines with imputation
- Train and evaluate ML models