In [None]:
from typing import (
    Tuple,
    List,
    Dict,
    Any,
    Sequence,
    Union,
    Optional,
)

import sys
import time
import json
import re
from pathlib import Path
from enum import Enum
import shutil

import pandas as pd
import polars as pl
import polars.selectors as cs

# Resolve project paths relative to the notebook's working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')

# Put the project root at the very front of sys.path (dropping one existing
# entry first, if there is one) before importing from the `src` package.
try:
    sys.path.remove(str(PROJECT_ROOT))
except ValueError:
    pass  # the project root was not on sys.path yet
sys.path.insert(0, str(PROJECT_ROOT))

# Project-local modules; imported only after the sys.path change above.
from src.preprocess.preprocess import overview_col, analyze_null_values, eda_proportion
from src.loading import DataLoader
from src.utils import increment_path

# Parquet path handed to the DataLoader
# (previously tried: DATA_DIR / 'silver' / 'maude50.parquet').
output_file = DATA_DIR.joinpath('temp', 'maude_clustered.parquet')
loader = DataLoader(output_file=output_file)

# Adapter name plus the keyword options forwarded to loader.load().
adapter = 'polars'
polars_kwargs = dict(
    use_statistics=True,
    parallel='auto',
    low_memory=False,
    rechunk=False,
    cache=True,
)
maude_lf = loader.load(adapter=adapter, **polars_kwargs)
maude_lf

In [None]:
from cuml import UMAP
import cupy as cp
import matplotlib.pyplot as plt
import numpy as np

# 1. Materialize the two columns needed downstream as eager DataFrames.
embedding_df: pl.DataFrame = maude_lf.select('embeddings').collect()
cluster_df: pl.DataFrame = maude_lf.select('cluster').collect()

In [None]:
# Stack the per-row embedding values into a single NumPy array and pull the
# cluster column out as a NumPy array.
embeddings_array = np.vstack(
    embedding_df.get_column('embeddings').to_list()
)
clusters = cluster_df.get_column('cluster').to_numpy()

# 2. Convert the stacked embeddings to a float32 CuPy array (GPU memory).
embeddings_gpu = cp.asarray(embeddings_array, dtype=cp.float32)

In [None]:
# Configure UMAP (15 output components, cosine metric, fixed seed), fit it on
# the GPU embeddings, and keep the result as a float32 CuPy array.
umap_model = UMAP(
    n_components=15,
    n_neighbors=50,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
    verbose=True,
)
X = cp.asarray(umap_model.fit_transform(embeddings_gpu), dtype=cp.float32)

In [None]:
import cupy as cp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Bring the UMAP output and the cluster labels to host memory. Objects without
# a `.get` attribute (e.g. plain NumPy arrays) are passed through unchanged.
X_np = cp.asnumpy(X) if hasattr(X, "get") else X
clusters_np = cp.asnumpy(clusters) if hasattr(clusters, "get") else clusters

# Project to 2-D for plotting. The seed is fixed because PCA may select a
# randomized SVD solver, and these coordinates are persisted later on, so they
# need to be reproducible across runs.
Z = PCA(n_components=2, random_state=42).fit_transform(X_np)

df_temp = pd.DataFrame({
    "PC1": Z[:, 0],
    "PC2": Z[:, 1],
    "Cluster": clusters_np,
})

# Explicit figure/axes so the scatter plot and its title target the same axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df_temp, x="PC1", y="PC2", hue="Cluster", s=6, legend="full", ax=ax)
ax.set_title("HDBSCAN (k<=18) result")
plt.show()

In [None]:
# Add the PCA coordinates to maude_lf as PC1 / PC2 (it stays a LazyFrame).
# Each array is wrapped in an explicit pl.Series so it is always treated as a
# column with one value per row, regardless of how pl.lit interprets raw
# array/sequence input.
# NOTE(review): assumes Z is in the same row order as maude_lf — confirm.
maude_lf = maude_lf.with_columns([
    pl.lit(pl.Series('PC1', Z[:, 0])).alias('PC1'),
    pl.lit(pl.Series('PC2', Z[:, 1])).alias('PC2'),
])
maude_lf.collect_schema().names()

In [None]:
# Sink maude_lf to a zstd-compressed (level 3) Parquet file under DATA_DIR/gold,
# keeping row order and writing statistics; mkdir=True is passed for the
# target directory.
output_path = DATA_DIR.joinpath('gold', 'maude_clustered_viz.parquet')
maude_lf.sink_parquet(
    output_path,
    mkdir=True,
    maintain_order=True,
    statistics=True,
    compression='zstd',
    compression_level=3,
)

In [None]:
# # 3. Tune UMAP parameters
# umap_2d = UMAP(n_components=2, random_state=42)

# # 4. Transform on the GPU
# embedding_2d_gpu = umap_2d.fit_transform(embeddings_gpu)

# # 5. For visualization
# embedding_2d = cp.asnumpy(embedding_2d_gpu)

# plt.figure(figsize=(12, 8))
# scatter = plt.scatter(
#     embedding_2d[:, 0], 
#     embedding_2d[:, 1], 
#     c=clusters, 
#     cmap='tab20', 
#     alpha=0.6,
#     s=5
# )
# # plt.colorbar(scatter)
# plt.title('Cluster')
# plt.xlabel('UMAP 1')
# plt.ylabel('UMAP 2')
# plt.tight_layout()
# plt.show()

# # Free memory
# del embeddings_gpu, embedding_2d_gpu