# In [None]:
import torch
import numpy as np
from torch.optim import Adam
from gym import spaces
import gym
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOTrainer, PPOConfig
from datasets import Dataset
from tqdm import tqdm

class TraderEnv(gym.Env):
    """Single-asset trading environment driven by a fixed table of market rows.

    Each row of ``data`` is one time step. The whole row is returned as the
    observation and its last column is used as the asset price.

    Actions (``spaces.Discrete(3)``):
        0 -- buy as many whole shares as the cash balance can pay for,
        2 -- sell the entire position,
        anything else (1 by convention) -- hold.

    The reward of a step is the change in portfolio value (cash plus shares
    valued at the row's price) between the row the step started on and the
    row it moved to.

    The environment follows the classic gym protocol: ``reset()`` returns an
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, data, initial_balance=10000, max_steps=200):
        """Store the market table and put the portfolio in its initial state.

        Args:
            data: 2-D array indexed as ``data[row, column]``; the last column
                is the price.
            initial_balance: Cash available at the start of every episode.
            max_steps: Step count at which an episode is reported as done.
        """
        super().__init__()
        self.data = data
        self.initial_balance = initial_balance
        self.max_steps = max_steps
        self.current_step = 0
        self.balance = self.initial_balance
        self.asset = 0
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(data.shape[1],), dtype=np.float32
        )
        self.reset()

    def reset(self):
        """Restart the episode at row 0 and return that row as float32."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.asset = 0
        return self.data[self.current_step].astype(np.float32)

    def step(self, action):
        """Apply ``action`` at the current row, then advance one row.

        Args:
            action: 0 to buy, 2 to sell; any other value holds.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            new row as float32, ``reward`` is the change in portfolio value as
            a Python float, ``done`` is True once the last row or ``max_steps``
            has been reached, and ``info`` is an empty dict.
        """
        last_row = len(self.data) - 1
        # Clamp the index so that a call made after the data is exhausted
        # re-reads the final row instead of raising IndexError.
        row = min(self.current_step, last_row)

        # Convert to a Python float so the cash/asset bookkeeping is done in
        # double precision rather than inheriting the table's NumPy dtype.
        price = float(self.data[row, -1])
        prev_value = self.balance + self.asset * price

        # A zero price would divide by zero and a negative one would "buy" a
        # negative number of shares, so buying requires a positive price.
        if action == 0 and price > 0 and self.balance >= price:
            shares = int(self.balance // price)
            self.asset += shares
            self.balance -= shares * price
        elif action == 2 and self.asset > 0:
            self.balance += self.asset * price
            self.asset = 0

        self.current_step = min(row + 1, last_row)
        done = (self.current_step >= last_row) or (self.current_step >= self.max_steps)

        new_price = float(self.data[self.current_step, -1])
        new_value = self.balance + self.asset * new_price
        reward = float(new_value - prev_value)
        return self.data[self.current_step].astype(np.float32), reward, done, {}

class PPOTrader:
    """Language-model trading agent plus the companion models used for PPO.

    Construction loads a tokenizer, a policy model, a reference copy of the
    same checkpoint switched to eval mode, a value-head model, and a
    placeholder reward model. ``predict`` samples a short continuation of a
    text prompt built from an observation.
    """

    def __init__(self, model_name="Qwen/Qwen2.5-0.5B-Instruct", device="cuda:0"):
        """Load every model from ``model_name`` in float16 onto ``device``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: Device string used for ``device_map`` and tensor moves.
        """
        self.device = device

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # No dedicated padding token: reuse end-of-sequence for padding.
            tokenizer.pad_token = tokenizer.eos_token
        self.tokenizer = tokenizer

        # Identical loading options for all three checkpoints.
        load_kwargs = {"torch_dtype": torch.float16, "device_map": device}
        self.policy_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
        reference = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
        reference.eval()
        self.ref_model = reference
        self.value_model = AutoModelForCausalLMWithValueHead.from_pretrained(
            model_name, **load_kwargs
        )

        class SimpleRewardModel(torch.nn.Module):
            """Placeholder scorer: every sequence in the batch gets 1.0."""

            def forward(self, input_ids, attention_mask=None):
                batch_size = input_ids.shape[0]
                return torch.ones((batch_size, 1), device=input_ids.device)

        self.reward_model = SimpleRewardModel().to(device)

    def generate_prompt(self, obs):
        """Return the text prompt that embeds ``obs`` and asks for a decision."""
        return (
            "Financial Trading Decision:\n"
            f"Market Data: {obs}\n"
            "Possible Actions: [Buy, Hold, Sell]\n"
            "Decision:"
        )

    @torch.no_grad()
    def predict(self, obs):
        """Sample up to 16 new tokens for ``obs`` and return the decision text.

        The full decoded sequence (prompt included) is cut at the last
        ``"Decision:"`` marker; the stripped remainder is returned. If the
        marker is absent the whole decoded text is returned stripped.
        """
        encoded = self.tokenizer(self.generate_prompt(obs), return_tensors="pt").to(self.device)
        generated = self.policy_model.generate(
            **encoded, max_new_tokens=16, temperature=0.7, do_sample=True
        )
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        _, _, answer = decoded.rpartition("Decision:")
        return answer.strip()

def map_action(a_str):
    """Translate a free-text decision into a discrete action id.

    The text is scanned case-insensitively for the first whole word that is
    one of ``buy``, ``hold`` or ``sell``; punctuation and surrounding text are
    ignored, so outputs such as ``"Buy."`` or ``"sell\\nbecause ..."`` are
    recognised instead of silently falling back to Hold.

    Args:
        a_str: Decision text, typically the model's sampled continuation.

    Returns:
        0 for buy, 1 for hold, 2 for sell. Returns 1 (hold) when no keyword
        is found or when ``a_str`` is not a string.
    """
    action_ids = {"buy": 0, "hold": 1, "sell": 2}
    if not isinstance(a_str, str):
        return 1
    # Replace every non-letter with a space so that "Buy." or "[Sell]" split
    # into clean words.
    normalized = "".join(ch if ch.isalpha() else " " for ch in a_str.lower())
    return next(
        (action_ids[word] for word in normalized.split() if word in action_ids),
        1,
    )

def train_with_ppotrainer(agent, env, total_epochs=3, steps_per_epoch=200):
    """Collect environment rollouts with ``agent`` and call ``trainer.train()``.

    For each epoch the environment is reset, ``steps_per_epoch`` decisions are
    sampled from the agent and applied (resetting whenever the environment
    reports done), the resulting query/response/reward rows are wrapped in a
    ``Dataset`` assigned to ``trainer.train_dataset``, and ``trainer.train()``
    is invoked. The summed reward of the epoch is printed.

    Args:
        agent: Object exposing ``predict``, ``generate_prompt``, ``tokenizer``,
            ``policy_model``, ``ref_model``, ``reward_model``, ``value_model``.
        env: Environment with classic gym ``reset()`` / ``step()`` methods.
        total_epochs: Number of collect-then-train rounds.
        steps_per_epoch: Environment steps collected per round.
    """
    # NOTE(review): these PPOConfig keywords and the PPOTrainer keywords below
    # appear to come from different TRL releases -- confirm against the
    # installed version.
    ppo_settings = {
        "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
        "learning_rate": 5e-5,
        "batch_size": 32,
        "mini_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "optimize_cuda_cache": True,
        "num_train_epochs": 1,
        "kl_coef": 0.05,
        "seed": 42,
        "output_dir": "./ppo_output",
    }
    config = PPOConfig(**ppo_settings)

    # The trainer is built with a one-row placeholder; the dataset is swapped
    # for real rollouts before each train() call.
    dummy_dataset = Dataset.from_list([{"query": "", "response": "", "reward": 0.0}])
    trainer = PPOTrainer(
        args=config,
        processing_class=agent.tokenizer,
        model=agent.policy_model,
        ref_model=agent.ref_model,
        reward_model=agent.reward_model,
        value_model=agent.value_model,
        train_dataset=dummy_dataset,
    )

    for epoch in range(total_epochs):
        total_reward = 0
        rollout = []
        obs = env.reset()
        progress = tqdm(range(steps_per_epoch), desc=f"Epoch {epoch+1}/{total_epochs}")
        for _ in progress:
            decision = agent.predict(obs)
            next_obs, step_reward, finished, _info = env.step(map_action(decision))
            total_reward += step_reward
            # The stored query is the prompt for the observation acted upon.
            rollout.append(
                {
                    "query": agent.generate_prompt(obs),
                    "response": decision,
                    "reward": step_reward,
                }
            )
            obs = env.reset() if finished else next_obs

        # NOTE(review): assigning train_dataset after construction may not
        # rebuild the trainer's data pipeline -- verify that the collected
        # rewards actually reach the optimisation step.
        trainer.train_dataset = Dataset.from_list(rollout)
        trainer.train()
        print(f"[Epoch {epoch+1}/{total_epochs}] total reward = {total_reward:.2f}")

if __name__ == "__main__":
    # Synthetic market table: 1000 rows of 5 random float32 columns.
    data = np.random.rand(1000, 5).astype(np.float32)
    # TraderEnv reads the last column as the price; scale it up by 100.
    data[:, -1] *= 100

    env = TraderEnv(data)
    agent = PPOTrader(model_name="Qwen/Qwen2.5-0.5B-Instruct", device="cuda:0")
    train_with_ppotrainer(agent, env, total_epochs=3, steps_per_epoch=200)
