# In [None]:
import os
import numpy as np
import pandas as pd

from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead


class StockTradingEnvCustom:
    """Single-asset trading environment that walks row by row through a DataFrame.

    Every row must provide a "close" price; a "news" entry is optional and
    falls back to "". The numerical state is ``[cash, close, *indicators]``,
    where the indicators are the columns named in ``tech_indicator_list``.
    Trades are filled at the current row's close; no transaction cost is
    charged and neither cash nor position is prevented from going negative.
    """

    def __init__(self, df, initial_amount=1e6, hmax=100, tech_indicator_list=None):
        """
        Args:
            df: price data, one timestep per row (re-indexed from 0 internally).
            initial_amount: starting cash balance.
            hmax: maximum number of shares bought or sold in a single step.
            tech_indicator_list: optional column names appended to the state.
        """
        self.df = df.reset_index(drop=True)
        self.hmax = hmax
        self.initial_amount = initial_amount
        self.tech_indicator_list = tech_indicator_list or []
        # reset() initialises every piece of per-episode state, so the
        # constructor and reset() can never drift apart.
        self.reset()

    def _load_current_row(self):
        """Cache the row at ``current_step`` and return the matching state list."""
        self.data = self.df.iloc[self.current_step]
        self.price = self.data["close"]
        self.news = self.data.get("news", "")
        return [self.cash, self.price] + [
            self.data[tech] for tech in self.tech_indicator_list
        ]

    def reset(self):
        """Restart the episode at the first row.

        Returns:
          - initial numerical state,
          - initial news snippet.
        """
        self.current_step = 0
        self.done = False
        self.cash = self.initial_amount
        self.position = 0
        # Value of cash + holdings after the most recent step; the reward in
        # step() is measured against this.
        self.portfolio_value = self.initial_amount
        self.state = self._load_current_row()
        return self.state, self.news

    def step(self, action):
        """
        Execute the (modulated) action:
          - action: integer representing shares to buy (positive) or sell (negative).
        Updates cash and position, then moves to the next timestep.
        Returns:
          - new numerical state (None once the data is exhausted),
          - new news snippet ("" once the data is exhausted),
          - reward (change in portfolio value since the previous step),
          - done flag,
          - empty info dict.
        """
        # Clip action to allowable range
        action = int(np.clip(action, -self.hmax, self.hmax))
        self.cash -= self.price * action
        self.position += action

        self.current_step += 1
        if self.current_step >= len(self.df):
            # Past the last row: keep the last known price for valuation.
            self.done = True
            new_state, new_news = None, ""
        else:
            new_state = self._load_current_row()
            new_news = self.news
            self.state = new_state

        # A trade at the current price leaves the portfolio value unchanged,
        # so the per-step reward is purely the mark-to-market move of the
        # position between the previous row and this one. Summing the rewards
        # of an episode therefore yields the total profit or loss.
        portfolio_value = self.cash + self.position * self.price
        reward = portfolio_value - self.portfolio_value
        self.portfolio_value = portfolio_value

        return new_state, new_news, reward, self.done, {}


def obs_and_news_to_prompt(obs, news):
    """
    Render a numerical observation and a news snippet as a single LLM prompt.

    Each observation value is formatted with two decimals and the values are
    joined by ", ". The prompt then asks the model to answer with three
    comma-separated numbers:
      (action, recommendation score, risk score)
    """
    formatted_values = ", ".join(format(value, ".2f") for value in obs)
    header = f"Observation: {formatted_values}. News: {news} "
    instruction = (
        "As a financial expert, please output three numbers separated by commas: "
        "the trading action (integer), a stock recommendation score (1-5), "
        "and a risk assessment score (1-5)."
    )
    return header + instruction


def parse_response_refined(response_text):
    """
    Parse a model reply of the form "action, recommendation, risk".

    Expect at least three comma-separated numbers (extra fields are ignored):
      - Raw action (truncated towards zero to an integer)
      - Recommendation score (float)
      - Risk score (float)
    Returns defaults (0, 3.0, 3.0) if the text is not a string, has fewer
    than three fields, contains a non-numeric field, or contains a
    non-finite value (nan/inf).
    """
    defaults = (0, 3.0, 3.0)
    try:
        parts = response_text.strip().split(",")
        if len(parts) < 3:
            raise ValueError("Insufficient output values.")
        # float() tolerates surrounding whitespace, so no per-field strip.
        action = int(float(parts[0]))  # nan -> ValueError, inf -> OverflowError
        rec_score = float(parts[1])
        risk_score = float(parts[2])
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError/TypeError cover non-string input (e.g. None, bytes).
        return defaults

    # "nan" and "inf" convert cleanly with float() but are not usable scores;
    # treat them like any other unparseable reply instead of letting them
    # flow into the factor mapping.
    if not (np.isfinite(rec_score) and np.isfinite(risk_score)):
        return defaults
    return action, rec_score, risk_score


def map_rec_score_to_factor(rec_score):
    """
    Translate the recommendation score (Sf) into an action modulation factor.

    The score is bucketed by the highest threshold it reaches:
      >=5 -> 1.10, >=4 -> 1.05, >=3 -> 1.00, >=2 -> 0.95, otherwise 0.90.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    thresholds = ((5, 1.10), (4, 1.05), (3, 1.00), (2, 0.95))
    for lower_bound, factor in thresholds:
        if rec_score >= lower_bound:
            return factor
    return 0.90


def map_risk_score_to_factor(risk_score):
    """
    Translate the risk score (Rf) into a reward adjustment factor.

    The score is bucketed by the highest threshold it reaches:
      >=5 -> 1.10, >=4 -> 1.05, >=3 -> 1.00, >=2 -> 0.95, otherwise 0.90.
    """
    # Ordered from the highest threshold down; next() picks the first bucket
    # the score reaches and falls back to 0.90 when it reaches none.
    buckets = ((5, 1.10), (4, 1.05), (3, 1.00), (2, 0.95))
    return next(
        (factor for floor, factor in buckets if risk_score >= floor),
        0.90,
    )


def load_trading_data(path):
    """
    Read the trading dataset from a CSV source and return it as a DataFrame.

    The file is passed straight to ``pandas.read_csv`` with default options;
    no columns are validated here. Downstream code expects:
      - a "close" column,
      - a "news" column,
      - (optionally) technical indicator columns.
    """
    return pd.read_csv(path)


def train_trl_refined(env, model, tokenizer, ppo_trainer, num_epochs=10):
    """
    Roll the policy out on ``env`` and update it with PPO after every episode.

    For each timestep the observation and news are turned into a prompt, the
    model generates a reply, the reply is parsed into (action, recommendation
    score, risk score), the action is scaled by the recommendation factor and
    executed, and the environment reward is scaled by the risk factor.

    After the episode the collected transitions are passed to
    ``ppo_trainer.step`` in consecutive chunks of
    ``ppo_trainer.config.batch_size``, because the trainer requires exactly
    that many samples per call. A trailing chunk smaller than the batch size
    is not used for training.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        model: causal LM with value head used for generation.
        tokenizer: tokenizer matching ``model``.
        ppo_trainer: trainer providing ``config.batch_size`` and
            ``step(queries, responses, scores)``.
        num_epochs: number of full passes (episodes) over the environment.

    Returns:
        The same ``model`` object that was passed in (updated in place).
    """
    # Imported locally because the module's import block does not provide
    # torch; it is always installed alongside trl/transformers.
    import torch

    batch_size = ppo_trainer.config.batch_size
    # Generation inputs must live on the same device as the model weights.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        obs, news = env.reset()
        done = False
        epoch_reward = 0
        query_tensors = []
        response_tensors = []
        reward_tensors = []
        stats = None

        while not done:
            prompt = obs_and_news_to_prompt(obs, news)

            # Tokenize and generate a response from Qwen2.5
            query = tokenizer(prompt, return_tensors="pt").to(device)
            prompt_len = query["input_ids"].shape[1]
            with torch.no_grad():
                # max_new_tokens bounds only the reply; max_length would also
                # count the (already long) prompt and leave no room to answer.
                output_ids = model.generate(**query, max_new_tokens=50)
            # A decoder-only model returns prompt + continuation; keep only
            # the continuation so the parser and PPO see the actual reply.
            response_ids = output_ids[0, prompt_len:]
            response_text = tokenizer.decode(response_ids, skip_special_tokens=True)

            # Parse the response to get raw action, recommendation score, and risk score
            raw_action, rec_score, risk_score = parse_response_refined(response_text)
            sf = map_rec_score_to_factor(rec_score)
            modulated_action = int(round(sf * raw_action))

            # Execute action in environment
            next_obs, next_news, reward, done, _ = env.step(modulated_action)
            rf = map_risk_score_to_factor(risk_score)
            reward_adj = rf * reward

            query_tensors.append(query["input_ids"][0])
            response_tensors.append(response_ids)
            reward_tensors.append(torch.tensor(float(reward_adj)))
            epoch_reward += reward_adj

            obs, news = next_obs, next_news

        # PPO update: the trainer takes lists of 1-D token tensors and scalar
        # reward tensors, exactly `batch_size` of each per call.
        for start in range(0, len(query_tensors) - batch_size + 1, batch_size):
            end = start + batch_size
            stats = ppo_trainer.step(
                query_tensors[start:end],
                response_tensors[start:end],
                reward_tensors[start:end],
            )

        # `stats` holds the statistics of the last PPO call of this epoch
        # (None when the episode was shorter than one batch).
        print(
            f"Epoch {epoch+1}/{num_epochs} - Total Adjusted Reward: {epoch_reward:.2f}, Stats: {stats}"
        )
    return model


def main():
    """Load the data, build the environment and PPO trainer, train, and save.

    Reads ``train_data_2013_2018.csv`` from the working directory, fine-tunes
    the model for 10 episodes with ``train_trl_refined`` and writes the model
    and tokenizer to ``trained_trl_qwen2.5_news_trading``.
    """
    data_path = "train_data_2013_2018.csv"
    df = load_trading_data(data_path)

    env = StockTradingEnvCustom(df)

    # NOTE(review): "Qwen-2.5" looks like a placeholder rather than a published
    # checkpoint id; it must resolve to a local directory or a valid Hub repo
    # for from_pretrained to succeed — confirm the intended checkpoint.
    model_name = "Qwen-2.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)

    save_dir = "trained_trl_qwen2.5_news_trading"

    # The rest of this file uses the step-based PPOTrainer API
    # (PPOTrainer(config, model, tokenizer) + ppo_trainer.step), so the config
    # uses that API's field names:
    #   - ppo_epochs (optimisation passes per batch) replaces num_train_epochs,
    #     which that PPOConfig does not define — presumably the intent; confirm.
    #   - mini_batch_size is set explicitly because batch_size must be a
    #     multiple of it and its default is larger than 1.
    #   - the tensorboard tracker needs a logging directory.
    ppo_config = PPOConfig(
        model_name=model_name,
        ppo_epochs=3,
        batch_size=1,
        mini_batch_size=1,
        learning_rate=1e-5,
        log_with="tensorboard",
        project_kwargs={"logging_dir": os.path.join(save_dir, "logs")},
    )

    ppo_trainer = PPOTrainer(ppo_config, model, tokenizer)

    num_epochs = 10
    trained_model = train_trl_refined(
        env, model, tokenizer, ppo_trainer, num_epochs=num_epochs
    )

    # Save the trained model together with its tokenizer so the directory can
    # be loaded again on its own.
    os.makedirs(save_dir, exist_ok=True)
    trained_model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print("Training finished and model saved in", save_dir)


# Run the training pipeline only when the file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()
