### 데이터 전처리

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from tqdm import tqdm


# 데이터 로드
df = pd.read_csv('./데이터/체류거점_2021.csv')

scaler = StandardScaler()
df[['latitude', 'longitude']] = scaler.fit_transform(df[['latitude', 'longitude']])

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda:1


### Sequence-to-Sequence 모델 정의

In [7]:
import torch
import torch.nn as nn

class Seq2SeqEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(Seq2SeqEncoder, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        return hidden[-1]
    
class Seq2SeqModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(Seq2SeqModel, self).__init__()
        self.encoder = Seq2SeqEncoder(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# 모델 인스턴스 생성
input_dim = 2
hidden_dim = 128
num_layers = 2
output_dim = 2

model = Seq2SeqModel(input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers, output_dim=output_dim).to(device=device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

### 데이터 준비 및 모델 훈련

In [8]:
from torch.utils.data import Dataset, DataLoader

class MovementDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length
        
    def __len__(self):
        return len(self.data) - self.seq_length
    
    def __getitem__(self, idx):
        seq = self.data.iloc[idx:idx+self.seq_length][['latitude', 'longitude']].values
        return torch.tensor(seq, dtype=torch.float32)
    
seq_length = 10
dataset = MovementDataset(data=df, seq_length=seq_length)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

num_epochs = 20
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for seq in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs})"):
        seq = seq.to(device)
        outputs = model(seq)
        loss = criterion(outputs, seq[:, -1, :]) # 마지막 위치 예측
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    avg_loss = epoch_loss / len(dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}") 

Epoch 1/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.50it/s]


Epoch [1/20], Loss: 0.0001


Epoch 2/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.50it/s]


Epoch [2/20], Loss: 0.0000


Epoch 3/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.52it/s]


Epoch [3/20], Loss: 0.0000


Epoch 4/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.50it/s]


Epoch [4/20], Loss: 0.0000


Epoch 5/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.47it/s]


Epoch [5/20], Loss: 0.0000


Epoch 6/20): 100%|██████████| 1485/1485 [02:22<00:00, 10.46it/s]


Epoch [6/20], Loss: 0.0000


Epoch 7/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.52it/s]


Epoch [7/20], Loss: 0.0000


Epoch 8/20): 100%|██████████| 1485/1485 [02:19<00:00, 10.68it/s]


Epoch [8/20], Loss: 0.0000


Epoch 9/20): 100%|██████████| 1485/1485 [02:18<00:00, 10.74it/s]


Epoch [9/20], Loss: 0.0000


Epoch 10/20): 100%|██████████| 1485/1485 [02:18<00:00, 10.69it/s]


Epoch [10/20], Loss: 0.0000


Epoch 11/20): 100%|██████████| 1485/1485 [02:20<00:00, 10.56it/s]


Epoch [11/20], Loss: 0.0000


Epoch 12/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.48it/s]


Epoch [12/20], Loss: 0.0000


Epoch 13/20): 100%|██████████| 1485/1485 [02:19<00:00, 10.68it/s]


Epoch [13/20], Loss: 0.0000


Epoch 14/20): 100%|██████████| 1485/1485 [02:18<00:00, 10.72it/s]


Epoch [14/20], Loss: 0.0000


Epoch 15/20): 100%|██████████| 1485/1485 [02:19<00:00, 10.68it/s]


Epoch [15/20], Loss: 0.0000


Epoch 16/20): 100%|██████████| 1485/1485 [02:22<00:00, 10.43it/s]


Epoch [16/20], Loss: 0.0000


Epoch 17/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.48it/s]


Epoch [17/20], Loss: 0.0000


Epoch 18/20): 100%|██████████| 1485/1485 [02:19<00:00, 10.61it/s]


Epoch [18/20], Loss: 0.0000


Epoch 19/20): 100%|██████████| 1485/1485 [02:20<00:00, 10.53it/s]


Epoch [19/20], Loss: 0.0000


Epoch 20/20): 100%|██████████| 1485/1485 [02:21<00:00, 10.52it/s]

Epoch [20/20], Loss: 0.0000





### 특징 추출 및 클러스터링

In [None]:
# 특징 추출
all_sequences = torch.tensor(df[['latitude', 'longitude']].values, dtype=torch.float32).unsqueeze(0).to(device)
with torch.no_grad():
    encoded_features = model.encoder(all_sequences).cpu().numpy()  # 결과를 CPU로 이동

# K-Means 클러스터링
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
clusters = kmeans.fit_predict(encoded_features)

# 클러스터링 결과를 데이터프레임에 추가
df['cluster'] = clusters


# 클러스터링 결과 시각화
import matplotlib.pyplot as plt

plt.scatter(df['latitude'], df['longitude'], c=df['cluster'])
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Clustering of OID Movements')
plt.show()

ValueError: k must be less than or equal to the number of training points

In [15]:
encoded_features

array([[ 0.03429781, -0.13419507, -0.2583534 ,  0.08721063, -0.04501933,
        -0.2191285 , -0.13385521,  0.02588288, -0.43780097, -0.14164743,
         0.14058147, -0.02022504,  0.1288713 ,  0.07232095,  0.15696567,
        -0.03247162,  0.13529624, -0.33933643,  0.07189347,  0.09665288,
        -0.07490114,  0.03164821, -0.03804766,  0.04680096,  0.05350653,
        -0.04530615, -0.39768896, -0.03649569, -0.01001937,  0.01772391,
         0.02021107, -0.05366134, -0.20402595, -0.08778554,  0.07564089,
         0.07524466, -0.19027628, -0.16299124, -0.13266239, -0.16484879,
         0.02522239,  0.06861281,  0.08593588, -0.13285963, -0.0046808 ,
        -0.00220503, -0.08188999, -0.18233521, -0.09100848,  0.16123821,
        -0.01152903, -0.01052126, -0.00588909, -0.16297941,  0.07052331,
        -0.41137096, -0.17036392,  0.05327912,  0.30898204,  0.11842325,
         0.11918961, -0.0087897 , -0.02577187, -0.16360332,  0.06295823,
         0.06133551, -0.38431403,  0.04063575, -0.3