In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import time 

torch.manual_seed(21)  # fixed seed so runs are reproducible
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.cuda.get_device_name() raises on machines without CUDA, so only query it
# when a GPU was actually selected above.
print(torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU (no CUDA device available)')

NVIDIA GeForce RTX 4090


In [3]:
# Directory holding the pre-generated measurement arrays for this grid scenario.
DATA_DIR = '/home/gmy/workspace/llm-se/V2/data_1-MV-urban--0-sw/use0'

p_ij_rtm = np.load(f'{DATA_DIR}/p_ij_with_rtm.npy')
# NOTE(review): the original cell loaded 'p_ij_with_rtm.npy' a second time here, so the
# "Q" channel was an exact copy of P. 'q_ij_with_rtm.npy' is assumed to be the sibling
# file name -- confirm it exists in DATA_DIR.
q_ij_rtm = np.load(f'{DATA_DIR}/q_ij_with_rtm.npy')
PQ_input_data = np.load(f'{DATA_DIR}/PQ_input_data.npy')
v_m_labels = np.load(f'{DATA_DIR}/v_m_labels.npy')

# Stack the two line-measurement arrays along a new trailing axis.
line_input_data = np.stack((p_ij_rtm, q_ij_rtm), axis=-1)

print('PQ_input_data.shape', PQ_input_data.shape)
print('p_ij_rtm.shape', p_ij_rtm.shape)
print('q_ij_rtm.shape', q_ij_rtm.shape)
print('v_m_labels.shape', v_m_labels.shape)
print('line_input_data.shape', line_input_data.shape)


PQ_input_data.shape (35120, 144, 16, 2)
p_ij_rtm.shape (35120, 36)
q_ij_rtm.shape (35120, 36)
v_m_labels.shape (35120, 144)
line_input_data.shape (35120, 36, 2)


In [4]:
from sklearn.model_selection import train_test_split  

# Short aliases used by the following cells:
#   x1 -> PQ_input_data, x2 -> line_input_data, y1 -> v_m_labels
x1, x2, y1 = PQ_input_data, line_input_data, v_m_labels

In [None]:
# Flatten the second axis of x1 so that every (sample, node) pair becomes one example.
# The sizes are read from the arrays instead of being hard-coded (previously 144 and
# (16, 2)), so a change in the data layout cannot silently misalign the three arrays.
num_nodes = x1.shape[1]
new_shape = (-1, *x1.shape[2:])
x1_reshaped = x1.reshape(new_shape)

# Repeat each sample's line measurements once per node so that row i of x2_reshaped
# belongs to the same original sample as row i of x1_reshaped.
x2_expanded = np.expand_dims(x2, axis=1)
x2_expanded = np.tile(x2_expanded, (1, num_nodes, 1, 1))

print('x2_expanded.shape', x2_expanded.shape)

x2_reshaped = x2_expanded.reshape(-1, *x2.shape[1:])

# One scalar label per flattened example.
y_m_reshaped = y1.reshape(-1, 1)

assert x1_reshaped.shape[0] == x2_reshaped.shape[0] == y_m_reshaped.shape[0], \
    'flattened inputs and labels must contain the same number of examples'

print('x1_reshaped.shape', x1_reshaped.shape)
print('x2_reshaped.shape', x2_reshaped.shape)
print('y_m_reshaped.shape', y_m_reshaped.shape)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, EarlyStoppingCallback, get_scheduler

# NOTE(review): this path is rooted at "/" unlike the other data paths in the notebook --
# confirm it points at the intended dataset file.
DATASET_JSON = "/data_1-MV-urban--0-sw/gpt2/dataset.json"

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

ds = Dataset.from_json(DATASET_JSON)



In [None]:
from torch.utils.data import Dataset, DataLoader  

def process_func(example):
    """Tokenize ``example["instruction"]`` and cap the result at 128 token ids.

    Uses the module-level ``tokenizer``. Returns a dict with a single
    ``"input_ids"`` entry; sequences longer than the cap are truncated from
    the right, shorter ones are returned unchanged (no padding happens here).
    """
    MAX_LENGTH = 128
    token_ids = tokenizer(example["instruction"])["input_ids"]
    kept = token_ids if len(token_ids) <= MAX_LENGTH else token_ids[:MAX_LENGTH]
    return {"input_ids": kept}

# Tokenize every record and keep only the token-id column.
tokenized_ds = ds.map(process_func)
input_ids_subset = tokenized_ds["input_ids"]

# Split sizes for the train / test sets: the first 80% of the flattened examples
# are used for training, the remainder for testing.
num_examples = x1_reshaped.shape[0]
train_size = int(num_examples * 0.8)
test_size = num_examples - train_size

class CustomDataset(Dataset):
    """Map-style dataset bundling two numeric inputs, token ids and labels.

    ``x1``, ``x2`` and ``y_m`` are converted to float tensors. Every entry of
    ``input_ids`` is brought to exactly ``max_length`` ids: shorter sequences
    are right-padded with ``tokenizer.pad_token_id``, longer ones are cut.
    The dataset length is the length of ``x1``.
    """

    def __init__(self, x1, x2, input_ids, y_m, tokenizer, max_length=128):
        self.x1 = torch.FloatTensor(x1)
        self.x2 = torch.FloatTensor(x2)
        self.y_m = torch.FloatTensor(y_m)
        self.max_length = max_length

        def _fit_to_length(seq):
            # Pad on the right when too short, otherwise truncate to max_length.
            missing = max_length - len(seq)
            if missing > 0:
                return list(seq) + [tokenizer.pad_token_id] * missing
            return list(seq[:max_length])

        self.input_ids = torch.LongTensor([_fit_to_length(seq) for seq in input_ids])
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.x1)

    def __getitem__(self, idx):
        # Keys match what the training loop reads from each batch.
        sample = {}
        sample['x1'] = self.x1[idx]
        sample['x2'] = self.x2[idx]
        sample['input_ids'] = self.input_ids[idx]
        sample['labels'] = self.y_m[idx]
        return sample

# train_size was computed from the flattened (per-node) arrays, so the datasets must be
# built from those same arrays. Slicing the un-reshaped x1 / x2 / y1 with it put every
# sample into the training set, left the test set empty, and handed the model inputs
# with an extra node axis.

# Training set
train_dataset = CustomDataset(
    x1_reshaped[:train_size],
    x2_reshaped[:train_size],
    input_ids_subset[:train_size],
    y_m_reshaped[:train_size],
    tokenizer
)

# Test set
test_dataset = CustomDataset(
    x1_reshaped[train_size:],
    x2_reshaped[train_size:],
    input_ids_subset[train_size:],
    y_m_reshaped[train_size:],
    tokenizer
)

In [None]:
import torch  
import torch.nn as nn  
from torch.utils.data import DataLoader  
from transformers import AdamW, get_linear_schedule_with_warmup  
from tqdm import tqdm  

class PatchTSTEncoder(nn.Module):
    """Patch-based encoder for a multichannel sequence.

    A strided ``Conv1d`` turns the sequence into patch embeddings, a learned
    positional embedding is added, and one Transformer encoder layer attends
    over the patches.

    Args:
        input_dim: number of channels per time step.
        hidden_size: embedding width; must be divisible by the 8 attention heads.
        patch_len: number of time steps covered by one patch (conv kernel size).
        stride: step between consecutive patches (conv stride).
    """

    def __init__(self, input_dim, hidden_size, patch_len, stride):
        super().__init__()
        self.patch_len = patch_len
        self.stride = stride

        # Patch embedding: one conv window == one patch
        self.patch_embedding = nn.Conv1d(
            in_channels=input_dim,
            out_channels=hidden_size,
            kernel_size=patch_len,
            stride=stride
        )

        # Position encoding for patches
        self.pos_embedding = nn.Parameter(torch.randn(1, 1000, hidden_size))  # Max length of 1000 patches

        # Transformer layers for patch processing.
        # batch_first=True is required: forward() feeds [batch, num_patches, hidden].
        # With the default (False) the layer reads dim 0 as the sequence axis and
        # attends across the batch instead of across patches.
        # The layer is kept as an attribute so existing state_dict keys stay valid;
        # nn.TransformerEncoder works on its own copy of it.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=8,
            dim_feedforward=hidden_size * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)

    def forward(self, x):
        """Encode ``x`` of shape [batch_size, seq_len, input_dim].

        Returns:
            Tensor of shape [batch_size, num_patches, hidden_size].

        Raises:
            ValueError: if the input yields more patches than the positional
                embedding table holds.
        """
        x = x.transpose(1, 2)  # [batch_size, input_dim, seq_len]

        # Create patches
        patches = self.patch_embedding(x)  # [batch_size, hidden_size, num_patches]
        patches = patches.transpose(1, 2)  # [batch_size, num_patches, hidden_size]

        # Add position encoding
        num_patches = patches.shape[1]
        max_patches = self.pos_embedding.shape[1]
        if num_patches > max_patches:
            raise ValueError(
                f"input produces {num_patches} patches but at most {max_patches} are supported"
            )
        patches = patches + self.pos_embedding[:, :num_patches, :]

        # Apply transformer
        patches = self.transformer_encoder(patches)

        return patches

class ModifiedGPT2Model(nn.Module):
    """GPT-2 backbone fused with two numeric branches for scalar regression.

    The token sequence runs through the embeddings, blocks and final layer norm
    taken from ``original_model``. ``x1`` is encoded by a ``PatchTSTEncoder``,
    ``x2`` by a linear projection followed by a small conv block. The last-token
    text feature and the mean-pooled features of both numeric branches are
    concatenated, projected back to ``hidden_size`` and passed to an MLP head
    that outputs one value per example.

    Args:
        original_model: model exposing ``config.hidden_size`` and
            ``transformer.{wte, wpe, h, ln_f}``.
        time_series_dim: channels per time step of ``x1``.
        realtime_dim: size of the last axis of ``x2``.
    """

    def __init__(self, original_model, time_series_dim=2, realtime_dim=2):
        super(ModifiedGPT2Model, self).__init__()
        self.original_model = original_model
        self.hidden_size = original_model.config.hidden_size

        self.wte = original_model.transformer.wte
        self.wpe = original_model.transformer.wpe
        self.h_blocks = original_model.transformer.h
        self.ln_f = original_model.transformer.ln_f

        # Set parameters frozen/trainable status:
        # only the position embeddings of the backbone stay trainable.
        for param in self.wpe.parameters():
            param.requires_grad = True
        for param in self.wte.parameters():
            param.requires_grad = False
        for param in self.h_blocks.parameters():
            param.requires_grad = False
        for param in self.ln_f.parameters():
            param.requires_grad = False

        self.patch_len = 4
        self.stride = 2
        self.time_series_encoder = PatchTSTEncoder(
            input_dim=time_series_dim,
            hidden_size=self.hidden_size,
            patch_len=self.patch_len,
            stride=self.stride
        )

        self.realtime_linear = nn.Linear(realtime_dim, self.hidden_size)
        self.realtime_down = nn.Sequential(
            nn.Conv1d(self.hidden_size, self.hidden_size, 3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(self.hidden_size)
        )

        # forward() concatenates THREE hidden_size-wide feature vectors (text,
        # time series, realtime), so the projection input is 3 * hidden_size.
        # The previous 2 * hidden_size failed with a shape mismatch.
        self.alignment_layer = nn.Sequential(
            nn.Linear(self.hidden_size * 3, self.hidden_size),
            nn.ReLU(),
            nn.LayerNorm(self.hidden_size)
        )

        self.regression_head = nn.Sequential(
            nn.Linear(self.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

    def forward(self, input_ids, x1, x2, attention_mask=None):
        """Predict one scalar per example.

        Args:
            input_ids: token ids, [batch_size, seq_len].
            x1: input of the patch encoder, [batch_size, steps, time_series_dim].
            x2: input of the realtime branch, [batch_size, n, realtime_dim].
            attention_mask: accepted for interface compatibility; not used, so
                padded positions are processed like real tokens.

        Returns:
            Tensor of shape [batch_size, 1].
        """
        inputs_embeds = self.wte(input_ids)
        position_ids = torch.arange(0, input_ids.size(1),
                                    dtype=torch.long, device=input_ids.device)
        position_embeds = self.wpe(position_ids).unsqueeze(0)
        text_hidden = inputs_embeds + position_embeds

        for block in self.h_blocks:
            block_out = block(text_hidden)
            # Blocks normally return a tuple whose first item is the hidden state;
            # accept a bare tensor too so "[0]" never slices the batch axis by mistake.
            text_hidden = block_out[0] if isinstance(block_out, (tuple, list)) else block_out
        text_features = self.ln_f(text_hidden)  # [batch_size, seq_len, hidden_size]

        time_features = self.time_series_encoder(x1)  # [batch_size, num_patches, hidden_size]
        realtime_features = self.realtime_linear(x2)
        realtime_features = realtime_features.transpose(1, 2)  # channels first for Conv1d
        realtime_features = self.realtime_down(realtime_features)
        realtime_features = realtime_features.transpose(1, 2)

        # Only the last token position feeds the head, and the fusion layer acts on
        # each position independently, so fuse that single position instead of
        # broadcasting the pooled features over the whole sequence.
        combined_features = torch.cat([
            text_features[:, -1, :],
            time_features.mean(dim=1),
            realtime_features.mean(dim=1)
        ], dim=-1)  # [batch_size, 3 * hidden_size]

        final_features = self.alignment_layer(combined_features)
        regression_output = self.regression_head(final_features)

        return regression_output

In [None]:
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch
import numpy as np
from tqdm import tqdm
from transformers import AdamW

class Trainer:
    """Training loop with periodic validation, checkpointing and early stopping.

    Args:
        model: module called as ``model(input_ids, x1, x2)``.
        train_dataset / test_dataset: datasets whose items are dicts with the
            keys ``'input_ids'``, ``'x1'``, ``'x2'`` and ``'labels'``.
        tokenizer: stored on the instance; not used by the loop itself.
        config: dict with ``batch_size``, ``learning_rate``, ``weight_decay``,
            ``epochs``, ``model_save_path``, ``patience`` and ``min_delta``.
            Optional keys: ``max_grad_norm`` (gradient clipping threshold),
            ``eval_every`` (steps between validations, default 50) and
            ``eval_sample_ratio`` (validation subset fraction, default 0.1).
    """

    def __init__(self, model, train_dataset, test_dataset, tokenizer, config):
        self.model = model

        print('number of params: %d'%(sum(temp.numel() for temp in model.parameters() if temp.requires_grad)))
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=config['batch_size'],
            shuffle=True
        )
        self.test_loader = DataLoader(
            test_dataset,
            batch_size=config['batch_size']
        )
        self.tokenizer = tokenizer
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

        self.criterion = nn.MSELoss()

        # PyTorch's AdamW replaces the deprecated transformers.AdamW; eps=1e-6 keeps
        # the numerical default of the implementation that was used before.
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
            eps=1e-6
        )

        self.best_val_loss = float('inf')
        self.patience_counter = 0
        self.best_model_state = None
        self.step_counter = 0
        self.early_stop = False
        self.training_losses = []  # (step, loss) pairs, one per optimisation step

    def _save_checkpoint(self, state, path):
        """Write ``state`` to ``path``, creating the parent directory if needed."""
        import os

        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        torch.save(state, path)

    def evaluate(self, sample_ratio=1.0, seed=55):
        """Compute the mean batch loss and the MAE on the test set.

        Args:
            sample_ratio: fraction of the test set to use; values below 1.0
                select a subset that is fully determined by ``seed``.
            seed: seed of the private generator used to pick the subset.

        Returns:
            ``(avg_val_loss, mae)``.

        Raises:
            ValueError: if the test dataset is empty.

        The model is left in eval mode. Global RNGs are not reseeded: the subset
        comes from a local generator, so calling this in the middle of training
        no longer resets the random stream used by training.
        """
        self.model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        num_test = len(self.test_dataset)
        if num_test == 0:
            raise ValueError("cannot evaluate: the test dataset is empty")

        if sample_ratio < 1.0:
            # Always keep at least one example so the averages below are defined.
            subset_size = max(1, int(num_test * sample_ratio))
            generator = torch.Generator()
            generator.manual_seed(seed)
            indices = torch.randperm(num_test, generator=generator)[:subset_size].tolist()
            sampler = SubsetRandomSampler(indices, generator=generator)
            temp_loader = DataLoader(
                self.test_dataset,
                batch_size=self.config['batch_size'],
                sampler=sampler
            )
        else:
            temp_loader = self.test_loader

        with torch.no_grad():
            for batch in tqdm(temp_loader, desc='Evaluating'):
                input_ids = batch['input_ids'].to(self.device)
                x1 = batch['x1'].to(self.device)
                x2 = batch['x2'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(input_ids, x1, x2)
                loss = self.criterion(outputs, labels)
                total_val_loss += loss.item()

                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(temp_loader)
        mae = np.mean(np.abs(np.array(all_preds) - np.array(all_labels)))

        return avg_val_loss, mae

    def train_epoch(self):
        """Run one pass over the training loader.

        Every ``eval_every`` steps the model is validated on a subset of the
        test set; improvements are checkpointed and ``patience`` consecutive
        non-improvements set ``self.early_stop`` and end the epoch.

        Returns:
            Mean training loss over the batches actually processed.
        """
        import copy

        self.model.train()
        total_train_loss = 0.0
        batches_seen = 0

        max_grad_norm = self.config.get('max_grad_norm')
        eval_every = self.config.get('eval_every', 50)
        eval_ratio = self.config.get('eval_sample_ratio', 0.1)

        progress_bar = tqdm(self.train_loader, desc='Training')

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(self.device)
            x1 = batch['x1'].to(self.device)
            x2 = batch['x2'].to(self.device)
            labels = batch['labels'].to(self.device)

            outputs = self.model(input_ids, x1, x2)
            loss = self.criterion(outputs, labels)

            self.optimizer.zero_grad()
            loss.backward()
            if max_grad_norm is not None:
                # The config defines max_grad_norm; apply it before the update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
            self.optimizer.step()

            current_loss = loss.item()
            total_train_loss += current_loss
            batches_seen += 1
            progress_bar.set_postfix({'train_loss': current_loss})

            self.step_counter += 1
            self.training_losses.append((self.step_counter, current_loss))

            if self.step_counter % eval_every == 0:
                val_loss, mae = self.evaluate(sample_ratio=eval_ratio, seed=55)
                # evaluate() switches to eval mode; go back to training mode so
                # dropout / batch-norm behave correctly for the remaining steps.
                self.model.train()
                print(f"\nStep {self.step_counter} - Validation (10% subset):")
                print(f"Val Loss: {val_loss:.8f} | MAE: {mae:.8f}")

                if val_loss < self.best_val_loss - self.config['min_delta']:
                    self.best_val_loss = val_loss
                    self.patience_counter = 0
                    # state_dict() returns references to the live tensors; copy them,
                    # otherwise later updates overwrite the "best" snapshot.
                    self.best_model_state = copy.deepcopy(self.model.state_dict())
                    print(f"Found better model with validation loss: {self.best_val_loss:.8f}")
                    self._save_checkpoint(self.best_model_state, self.config['model_save_path'])
                else:
                    self.patience_counter += 1
                    print(f"Validation loss did not improve. Patience counter: {self.patience_counter}/{self.config['patience']}")

                if self.patience_counter >= self.config['patience']:
                    print(f"\nEarly stopping triggered at step {self.step_counter}!")
                    print(f"Best validation loss achieved: {self.best_val_loss:.8f}")
                    self.early_stop = True
                    break

        # Average over the batches that ran; an early stop ends the epoch early.
        return total_train_loss / max(batches_seen, 1)

    def train(self):
        """Train for up to ``config['epochs']`` epochs, then restore the best snapshot.

        If a best snapshot was recorded it is loaded back into the model and
        also written to ``./modified_gpt2_model.pth``.
        """
        print("Starting training...")
        print(f"Training on device: {self.device}")

        for epoch in range(self.config['epochs']):
            avg_train_loss = self.train_epoch()

            print(f"\nEpoch [{epoch+1}/{self.config['epochs']}]")
            print(f"Train Loss: {avg_train_loss:.8f}")

            if self.early_stop:
                break

        if self.best_model_state is not None:
            self.model.load_state_dict(self.best_model_state)
            print("Loaded best model state from checkpoint")
            torch.save(self.model.state_dict(), './modified_gpt2_model.pth')
            print("Best model saved")

    def get_training_losses(self):
        """Return the recorded ``(step, loss)`` pairs."""
        return self.training_losses
    
def reinitialize_model_parameters(model):
    """Re-initialise the weights of ``model`` in place and return it.

    Every ``Linear`` and ``Embedding`` weight in the module tree is redrawn from
    N(0, 0.02), ``Linear`` biases are zeroed, and ``LayerNorm`` modules are reset
    to weight 1 / bias 0. If the model has a ``wpe`` attribute, its weight is
    redrawn once more afterwards. This applies to all submodules, including any
    that hold pretrained weights.
    """
    init = torch.nn.init

    def _init_weights(module):
        if isinstance(module, torch.nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
            return
        if not isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
            return
        init.normal_(module.weight, mean=0.0, std=0.02)
        has_bias = isinstance(module, torch.nn.Linear) and module.bias is not None
        if has_bias:
            init.zeros_(module.bias)

    # Apply the initialisation to every submodule
    model.apply(_init_weights)

    if hasattr(model, 'wpe'):
        init.normal_(model.wpe.weight, mean=0.0, std=0.02)

    print("reinitialize succeed")

    return model

# Usage example: build the config, wrap the pretrained GPT-2 and start training.
config = dict(
    batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    epochs=300,
    max_grad_norm=1.0,
    model_save_path='/home/gmy/workspace/llm-se/V2/gpt2/checkpoints/best_model_mm.pt',
    patience=5,   # consecutive non-improving validations before stopping
    min_delta=0,  # minimum decrease in validation loss that counts as an improvement
)

from transformers import AutoModelForCausalLM
original_model = AutoModelForCausalLM.from_pretrained('/home/gmy/workspace/LLMs_local_dir/gpt2')
model = ModifiedGPT2Model(original_model)
# Optional: start from re-initialised weights instead of the loaded ones.
#model = reinitialize_model_parameters(model)
trainer = Trainer(model, train_dataset, test_dataset, tokenizer, config)

trainer.train()
