# Notebook cell (exported from Jupyter; originally "In [None]:")
import numpy as np
import pandas as pd

class WeeklyTradingEnvNoGym:
    """
    Single-step trading environment over consecutive fixed-length windows.

    - Each episode covers `episode_length` daily rows (5 by default).
    - The agent rebalances once, at the first day of the window.
    - Reward is the change in portfolio value between the first and the
      last day of the window.
    - Successive episodes use successive, non-overlapping windows; cash and
      position are restored to their initial values on every `reset()`.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        initial_amount: float = 1e6,
        episode_length: int = 5,
        news_lookback_days: int = 5,
        tech_indicator_list=None
    ):
        """
        Args:
            df: Must include 'close' (float) daily price, optional 'news' (str).
            initial_amount: Starting capital (float).
            episode_length: # of daily rows per episode (5 for 'weekly').
            news_lookback_days: # of prior days of news to aggregate for the text prompt.
            tech_indicator_list: list of columns for numeric indicators (optional).
        """
        # Positional (0..n-1) index so that .iloc offsets and the pointer agree.
        self.df = df.reset_index(drop=True)
        self.num_days = len(self.df)
        self.initial_amount = initial_amount
        self.episode_length = episode_length
        self.news_lookback_days = news_lookback_days
        self.tech_indicator_list = tech_indicator_list or []

        # Pointers
        self.current_idx = 0  # Start day for the current episode
        self.done = False

        # Portfolio state
        self.cash = initial_amount
        self.position = 0
        self.start_portfolio_value = initial_amount

    def _get_aggregated_news(self, day_idx: int) -> str:
        """
        Join the last `news_lookback_days` news entries ending at `day_idx`.

        Returns an empty string when the frame has no 'news' column, and
        skips missing entries (None / NaN) so they do not show up in the
        prompt as the literal text "nan".
        """
        if "news" not in self.df.columns:
            return ""
        start_idx = max(0, day_idx - self.news_lookback_days + 1)
        window = self.df["news"].iloc[start_idx:day_idx + 1]
        return " ".join(str(s) for s in window if pd.notna(s))

    def _portfolio_value(self, price) -> float:
        """Mark the current cash + position to market at `price`."""
        return self.cash + self.position * price

    def reset(self):
        """
        Start a new episode at the current window.

        Returns:
            (state, news): `state` is a float32 array
            [cash, day0_price, indicators...]; `news` is the aggregated
            news text for the first day of the window.

        Raises:
            StopIteration: when fewer than `episode_length` rows remain.
        """
        # The window occupies rows [current_idx, current_idx + episode_length),
        # so it fits as long as its exclusive end does not pass num_days.
        if self.current_idx + self.episode_length > self.num_days:
            self.done = True
            raise StopIteration("No more data to continue the environment.")

        self.done = False
        self.cash = self.initial_amount
        self.position = 0
        self.start_portfolio_value = self.initial_amount

        day0_data = self.df.iloc[self.current_idx]
        day0_price = day0_data["close"]

        # Numeric part of the observation; absent indicator columns read as 0.0.
        obs_numeric = [self.cash, day0_price]
        for col in self.tech_indicator_list:
            obs_numeric.append(day0_data.get(col, 0.0))

        news_text = self._get_aggregated_news(self.current_idx)

        return np.array(obs_numeric, dtype=np.float32), news_text

    def step(self, action):
        """
        Rebalance to `action` shares at the first day's close and value the
        portfolio at the last day's close of the window.

        `action` is used as-is: it is not checked against available cash,
        so negative or leveraged positions are possible.

        Returns:
            (next_state, next_news, reward, done, info) where `next_state`
            and `next_news` are None, `reward` is the portfolio value change
            over the window, and `done` is always True.

        Raises:
            RuntimeError: if the episode is already finished.
        """
        if self.done:
            raise RuntimeError("Episode is done. Call reset() for a new episode.")

        # Day 0 price
        day0_price = self.df.iloc[self.current_idx]["close"]
        current_val = self._portfolio_value(day0_price)

        # Rebalance: whatever is not held in shares stays in cash.
        self.position = action
        self.cash = current_val - self.position * day0_price

        # Last row of this window, clamped in case step() is called without
        # a preceding successful reset() near the end of the data.
        final_idx = min(self.current_idx + self.episode_length - 1, self.num_days - 1)
        self.done = True  # Single-step episode

        final_price = self.df.iloc[final_idx]["close"]
        final_val = self._portfolio_value(final_price)
        reward = final_val - self.start_portfolio_value

        # Advance pointer for next call to reset()
        self.current_idx += self.episode_length

        next_state = None  # single-step episode: there is no successor state
        next_news = None
        info = {}

        return next_state, next_news, reward, self.done, info


# Install dependencies (if not installed):
# !pip install trl transformers

import torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

# Example: a small Qwen 2.5 checkpoint from the Hugging Face Hub.
# NOTE(review): the previous value "Qwen/Qwen-2.5" does not appear to be a
# published repo id; any causal LM repo can be substituted here.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# 1. Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # PPO batches are padded; fall back to EOS when no pad token is defined.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_NAME)

# 2. Configure PPO
# `optimize_cuda_usage` is not a PPOConfig field and has been dropped; the
# device-cache optimisation it seems to refer to is off by default.
ppo_config = PPOConfig(
    model_name=MODEL_NAME,
    learning_rate=1e-5,
    batch_size=1,      # In this example, we do 1 sample/episode at a time
    mini_batch_size=1, # must divide batch_size
    log_with=None,     # or "wandb"
)

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
    # To supply your own frozen reference model for the KL penalty, pass it
    # as `ref_model=...`; when omitted the trainer builds one from `model`.
)

# Example environment usage: 100 days of synthetic prices and news.
df = pd.DataFrame({
    'close': np.random.uniform(90, 110, size=100),
    'news': ["Macro event " + str(i) for i in range(100)]
})
env = WeeklyTradingEnvNoGym(df, initial_amount=1e6, episode_length=5)

def obs_and_news_to_prompt(obs: np.ndarray, news: str) -> str:
    """
    Render the numeric observation and the news text as one LLM prompt.

    Each observation value is printed with two decimals, followed by the
    news text and a fixed instruction asking for three comma-separated
    numbers (shares, recommendation score, risk score).
    """
    formatted_values = ", ".join(map("{:.2f}".format, obs))
    context = f"Observation: {formatted_values}\n" + f"News: {news}\n\n"
    instructions = "".join(
        [
            "As a financial expert, output three numbers (comma-separated):\n",
            "(1) number of shares to hold,\n",
            "(2) recommendation score 1-5,\n",
            "(3) risk score 1-5.",
        ]
    )
    return context + instructions

def parse_response(response_text: str):
    """
    Best-effort parse of "shares, rec_score, risk_score" from model output.

    The text is split on commas and the first three fields are read as
    (int, float, float); the share count is truncated toward zero.
    Anything that cannot be parsed yields the neutral default (0, 3.0, 3.0).
    """
    fallback = (0, 3.0, 3.0)
    try:
        fields = [piece.strip() for piece in response_text.strip().split(",")]
        shares = int(float(fields[0]))
        recommendation = float(fields[1])
        risk = float(fields[2])
    except Exception:
        # Deliberately broad: any malformed output maps to the default.
        return fallback
    return shares, recommendation, risk

def map_rec_risk_to_action_mod(action, rec_score, risk_score):
    """
    Scale the raw share count by the recommendation and risk scores.

    A recommendation of at least 4.0 / 4.5 boosts the action by 10% / 20%;
    a risk score of at least 4.0 / 4.5 cuts it by 10% / 20%. The two
    multipliers combine and the result is rounded to an integer.
    """
    if rec_score >= 4.5:
        boost = 1.2
    elif rec_score >= 4.0:
        boost = 1.1
    else:
        boost = 1.0

    if risk_score >= 4.5:
        damp = 0.8
    elif risk_score >= 4.0:
        damp = 0.9
    else:
        damp = 1.0

    scaled = action * (boost * damp)
    return int(round(scaled))


# Collect one (query, response, reward) triple per episode and run a PPO
# update on it right away (the trainer is configured with batch_size=1).
NUM_EPISODES = 10
all_prompts = []
all_responses = []
all_rewards = []

episode_count = 0

# The value-head wrapper keeps the underlying transformers model in
# `pretrained_model`; use that model's device for inputs and rewards.
device = model.pretrained_model.device

while episode_count < NUM_EPISODES:
    try:
        obs, news = env.reset()
    except StopIteration:
        print("No more data, training done.")
        break

    # Build prompt
    prompt = obs_and_news_to_prompt(obs, news)

    # Tokenize for the model
    query_tensors = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate a response. `max_new_tokens` bounds only the completion; a
    # total-length cap (`max_length`) can be used up by the prompt alone.
    # The expected answer is three short numbers, so a small budget suffices.
    output_ids = ppo_trainer.model.generate(
        input_ids=query_tensors,
        max_new_tokens=32,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )

    # For a causal LM, generate() returns prompt + completion. Keep only the
    # completion: otherwise the prompt text is what gets parsed (and always
    # falls back to the default action) and PPO is trained on prompt tokens.
    response_tensor = output_ids[0, query_tensors.shape[1]:]
    response_text = tokenizer.decode(response_tensor, skip_special_tokens=True)

    # Parse the raw action from response
    raw_action, rec_score, risk_score = parse_response(response_text)
    final_action = map_rec_risk_to_action_mod(raw_action, rec_score, risk_score)

    # Step environment
    next_obs, next_news, reward, done, info = env.step(final_action)

    # Store for PPO: one scalar reward tensor per sample.
    all_prompts.append(query_tensors[0])
    all_responses.append(response_tensor)
    all_rewards.append(torch.tensor(float(reward), dtype=torch.float, device=device))

    # One-step environment => done should be True after the step
    if done:
        # Update policy with PPO step on (prompt, response, reward).
        # One episode -> one PPO step, matching batch_size=1 in the config.
        ppo_trainer.step(
            queries=all_prompts,
            responses=all_responses,
            rewards=all_rewards
        )
        # Clear buffers
        all_prompts = []
        all_responses = []
        all_rewards = []
        episode_count += 1

print("Training complete.")


import torch

def cvar_transform_rewards(rewards: torch.Tensor, alpha: float = 0.2):
    """
    Cap rewards at a low-tail cutoff so that only the worst outcomes differ.

    The cutoff is the element at index int(len(rewards) * alpha) of the
    ascending-sorted rewards (clamped to a valid index). Rewards at or
    below the cutoff are kept; larger rewards are replaced by the cutoff.
    This removes the incentive to chase upside and leaves the learning
    signal concentrated on the tail of bad episodes.

    Args:
        rewards: 1-D tensor of per-episode rewards.
        alpha: fraction of the batch treated as the "worst" tail.

    Returns:
        A new tensor with the same shape, dtype and device as `rewards`;
        the input is not modified. An empty input yields an empty output.
    """
    if rewards.numel() == 0:
        # Nothing to rank; indexing the sorted tensor would fail.
        return rewards.clone()

    sorted_rewards, _ = torch.sort(rewards)
    cutoff_index = int(len(sorted_rewards) * alpha)
    cutoff_index = max(0, min(cutoff_index, len(sorted_rewards) - 1))

    # Keep the cutoff as a 0-d tensor taken from the data itself: it stays on
    # the same device and keeps the exact dtype (no float32 round trip).
    cutoff = sorted_rewards[cutoff_index]
    return torch.minimum(rewards, cutoff)

# Batched variant: collect several episodes, apply the CVaR transform across
# the collected rewards, then run PPO on the transformed rewards.
NUM_EPISODES = 20
BATCH_SIZE = 5
episode_count = 0

prompts_batch = []
responses_batch = []
rewards_batch = []

# The value-head wrapper keeps the underlying transformers model in
# `pretrained_model`; use that model's device for inputs and rewards.
device = model.pretrained_model.device

while episode_count < NUM_EPISODES:
    try:
        obs, news = env.reset()
        data_exhausted = False
    except StopIteration:
        print("No more data.")
        data_exhausted = True

    if not data_exhausted:
        prompt = obs_and_news_to_prompt(obs, news)
        query_tensors = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # `max_new_tokens` bounds only the completion; a total-length cap
        # (`max_length`) can be used up by the prompt alone.
        output_ids = ppo_trainer.model.generate(
            input_ids=query_tensors,
            max_new_tokens=32,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

        # generate() returns prompt + completion for a causal LM; parse and
        # train on the completion only.
        response_tensor = output_ids[0, query_tensors.shape[1]:]
        response_text = tokenizer.decode(response_tensor, skip_special_tokens=True)

        # parse action
        raw_action, rec_score, risk_score = parse_response(response_text)
        final_action = map_rec_risk_to_action_mod(raw_action, rec_score, risk_score)

        # step env (single-step episode => done is always True)
        _, _, reward, done, _ = env.step(final_action)

        # store
        prompts_batch.append(query_tensors[0])
        responses_batch.append(response_tensor)
        rewards_batch.append(float(reward))  # keep as float until the transform

        episode_count += 1

    # Flush on a full batch, on the final episode, or when the data ran out,
    # so a trailing partial batch is still used for training.
    batch_complete = episode_count % BATCH_SIZE == 0 or episode_count == NUM_EPISODES
    if rewards_batch and (batch_complete or data_exhausted):
        rewards_tensor = torch.tensor(rewards_batch, dtype=torch.float, device=device)

        # CVaR transform over the whole collected batch
        cvar_rewards = cvar_transform_rewards(rewards_tensor, alpha=0.2)

        # The trainer's step() takes lists of tensors whose length equals the
        # configured batch size (1 for `ppo_trainer`), so feed the transformed
        # samples one at a time, each with a scalar reward tensor.
        for query, response, shaped_reward in zip(prompts_batch, responses_batch, cvar_rewards):
            ppo_trainer.step(
                queries=[query],
                responses=[response],
                rewards=[shaped_reward]
            )

        # clear for next batch
        prompts_batch = []
        responses_batch = []
        rewards_batch = []

    if data_exhausted:
        break

