# Point Upsampling

## IDW point selection

- 각 포인트의 1789m 거리 내에 1m 간격의 포인트 생성
- 생성된 포인트에 대해 neighbors idw
- neighbors가 가장 작은 값 순서대로 선택

In [1]:
import pandas as pd
from tqdm import tqdm
from scipy.spatial import distance
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
import utils.eda_utils as eda_utils 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/241121_Rockhead_Seoul_train(before_aumentation).csv')

In [3]:
def idw(df, x_col, y_col, value_col, new_points, p=2):
    """
    Pandas DataFrame을 이용한 IDW 보간 함수

    Args:
        df: DataFrame containing x, y coordinates and values.
        x_col, y_col, value_col: Column names for x, y coordinates and values in the DataFrame.
        new_points: Array of points to interpolate (M, D), where M is the number of new points.
        p: Power parameter for the weighting function.

    Returns:
        Interpolated values at new points (M).
    """

    points = df[[x_col, y_col]].values
    values = df[value_col].values

    # 새로운 점과 기존 점 사이의 거리 계산
    dist = distance.cdist(new_points, points, 'euclidean')

    # 거리의 역수를 p 제곱하여 가중치 계산
    weights = 1 / (dist + 1e-10)**p

    # 가중 평균 계산
    z = np.sum(weights * values, axis=1) / np.sum(weights, axis=1)

    return z

In [5]:
df_samples = pd.DataFrame(columns=['x', 'y', 'neighbors'])

radius = 1789
mid_neighbor = 50

location = df.loc[:,['x','y']].to_numpy()
tree = KDTree(location, leaf_size=15000)

for i, row in tqdm(df.iterrows(), total=len(df)):
    indices = tree.query_radius([row[['x','y']].to_numpy()], r=radius)

    if len(indices) >= mid_neighbor:
        continue
    else:
        df_group = df.iloc[indices[0]]
        x_min, x_max = row.x - radius, row.x + radius
        y_min, y_max = row.y - radius, row.y + radius

        # 원 안에 그리드 생성
        x = np.arange(x_min, x_max, 1)
        y = np.arange(y_min, y_max, 1)
        X, Y = np.meshgrid(x, y)
        points = np.column_stack((X.ravel(), Y.ravel()))

        distances = np.sqrt((points[:, 0] - row.x)**2 + (points[:, 1] - row.y)**2)
        mask = distances <= radius
        points = points[mask]

        # neighbors 기반 idw
        z = idw(df_group, 'x', 'y', 'neighbors', points, p=1)

        df_new = pd.DataFrame({'x': points[:,0], 'y': points[:,1], 'neighbors': z, 'sample': True})

        # 필터링된 데이터에서 neighbors 값이 작은 행 추출
        df_new = df_new.sort_values(by='neighbors')
        top = df_new.head(100-len(indices))
        df_samples = pd.concat([df_samples, top], ignore_index=True)

  df_samples = pd.concat([df_samples, top], ignore_index=True)
  1%|          | 75/13590 [24:40<74:07:39, 19.75s/it] 


MemoryError: Unable to allocate 11.3 GiB for an array with shape (10054687, 151) and data type float64

In [None]:
# 컨투어 플롯 그리기
plt.figure(figsize=(10, 6))
plt.contourf(df_samples.x, df_samples.y, df_samples.neighbors, cmap='viridis')
plt.colorbar()
#plt.scatter(df['x'], df['y'], c='red', marker='x')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('IDW Interpolation')
plt.show()

In [None]:
df_idw = df[['x', 'y', 'z']]
df_idw['sample'] = False

df_idw = pd.concat([df_idw, df_samples], ignore_index=True)

eda_utils.scatter_relation(df_idw, 'x', 'y', 5, 'x, y, neighbors','sample')