In [None]:
import pandas as pd

import numpy as np
import pandas as pd
df = pd.read_csv('/content/final_imputed_data.csv')
df1 = pd.read_csv('/content/df_u.csv')

# A row is identified by its (Company, Year) pair. Collect the pairs present in
# each file; nothing here enforces uniqueness, duplicates simply collapse in the set.
df_keys = set(zip(df['Company'], df['Year']))
df1_keys = set(zip(df1['Company'], df1['Year']))

# Find the combinations in df1 but not in df
missing_keys = df1_keys - df_keys

# Filter df1 rows that are missing in df. Testing the zipped key columns directly
# avoids building one Series per row, which is what DataFrame.apply(axis=1) does.
is_missing = pd.Series(
    [key in missing_keys for key in zip(df1['Company'], df1['Year'])],
    index=df1.index,
    dtype=bool,
)
missing_rows = df1[is_missing]

# Concatenate to create the final dataframe
df_final = pd.concat([df, missing_rows], ignore_index=True)
target_cols = ['Target 1', 'Target 2', 'Target 3']

# Non-numeric target entries become NaN so the notna() filters downstream work.
for col in target_cols:
    df_final[col] = pd.to_numeric(df_final[col], errors='coerce')

# Drop pre-2022 rows that have neither Target 2 nor Target 3.
df_final = df_final.drop(
    df_final[
        (df_final['Year'] < 2022) &
        (df_final[['Target 2', 'Target 3']].isna().all(axis=1))
    ].index
)
from sklearn.preprocessing import LabelEncoder

# Integer id per company, used later to group rows into per-company sequences.
le = LabelEncoder()
df_final['Company_encoded'] = le.fit_transform(df_final['Company'])

In [None]:


import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# ---------- Device Setup ----------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ---------- Configuration ----------
feature_cols = ["Feature" + str(idx) for idx in range(1, 29)]  # Feature1 .. Feature28
batch_size   = 1    # one company sequence per batch
epochs       = 51   # upper bound on epochs per company (see train_model_fn)
patience     = 8    # default patience of EarlyStopping
TRAIN_SPLIT_RATIO = 0.8

# ---------- DataFrames for Targets ----------
# One frame per target, restricted to rows where that target is present.
# Targets 2 and 3 additionally carry the lower-numbered targets as inputs.
df_target1 = df_final.loc[
    df_final['Target 1'].notna(),
    feature_cols + ['Target 1', 'Year', 'Company_encoded', 'Sector'],
].copy()
df_target2 = df_final.loc[
    df_final['Target 2'].notna(),
    feature_cols + ['Target 2', 'Year', 'Company_encoded', 'Sector', 'Target 1'],
].copy()
df_target3 = df_final.loc[
    df_final['Target 3'].notna(),
    feature_cols + ['Target 3', 'Year', 'Company_encoded', 'Sector', 'Target 1', 'Target 2'],
].copy()

# ---------- Sector Embeddings ----------
def get_sector_embeddings(df):
    """Attach a sentence-embedding vector for each row's sector.

    Missing sectors are treated as the literal label 'Unknown'. Each distinct
    label is encoded once with the 'all-MiniLM-L6-v2' SentenceTransformer and
    the float32 vector is stored per row in a new 'Sector_Emb' column.

    Args:
        df: DataFrame with a 'Sector' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Sector_Emb' added.
    """
    st = SentenceTransformer('all-MiniLM-L6-v2')
    # Fill NaN once and reuse it for both the vocabulary and the lookup.
    # Mapping the unfilled column would leave NaN in 'Sector_Emb' for rows
    # without a sector, which later breaks np.stack in prepare_sequence.
    sector_labels = df['Sector'].fillna('Unknown')
    sectors = sector_labels.unique()
    embs = st.encode(sectors, show_progress_bar=True).astype(np.float32)
    mapping = dict(zip(sectors, embs))
    df['Sector_Emb'] = sector_labels.map(mapping)
    return df

# ---------- Attention Pooling ----------
class AttentionPooling(nn.Module):
    """Collapse the time axis with a learned softmax-weighted sum.

    Each time step is scored against a learned query vector after a tanh
    projection; the scores are normalised over time and used to average the
    unprojected inputs.

    Args:
        dim: Size of the last (feature) dimension of the input.
    """
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Parameter(torch.randn(dim))
        self.lin = nn.Linear(dim, dim)

    def forward(self, x):       # x: (B, T, D)
        """Return the attention-weighted sum over dim 1, shape (B, D)."""
        p = torch.tanh(self.lin(x))                  # (B,T,D)
        q = self.q.view(1, 1, -1)                    # (1,1,D)
        scores = (p * q).sum(-1, keepdim=True)       # (B,T,1)
        weights = torch.softmax(scores, dim=1)       # (B,T,1), sums to 1 over T
        return (x * weights).sum(dim=1)              # (B,D)

# ---------- Convolutional Stem + SE Block ----------
class SqueezeExcite(nn.Module):
    """Channel gating: rescale every channel by a sigmoid gate in (0, 1).

    The gate is computed from the per-channel mean over the last axis, passed
    through a two-layer bottleneck (Linear -> ReLU -> Linear -> sigmoid).

    Args:
        channels: Number of channels C of the (B, C, T) input.
        reduction: Bottleneck divisor; the hidden width is channels // reduction,
            but never less than 1 so small channel counts keep a usable layer.
    """
    def __init__(self, channels, reduction=8):
        super().__init__()
        hidden = max(1, channels // reduction)
        self.fc1 = nn.Linear(channels, hidden)
        self.fc2 = nn.Linear(hidden, channels)

    def forward(self, x):  # x: (B, C, T)
        """Return x scaled per channel; the shape is unchanged."""
        s = x.mean(-1)               # (B, C) squeeze over the last axis
        s = torch.relu(self.fc1(s))
        s = torch.sigmoid(self.fc2(s))
        return x * s.unsqueeze(-1)   # (B,C,T)

class ConvStem(nn.Module):
    """Stack of Conv1d -> BatchNorm1d -> ReLU -> SqueezeExcite -> Dropout(0.2) stages.

    Args:
        in_channels: Feature size F of the (B, T, F) input.
        out_channels: Channel count produced by every stage.
        kernel_size: Conv1d kernel size. Padding is kernel_size // 2, which
            keeps the sequence length unchanged for odd kernel sizes.
        depth: Number of stacked stages.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, depth=3):
        super().__init__()
        layers = []
        for i in range(depth):
            # Only the first stage sees the raw feature width.
            inc = in_channels if i == 0 else out_channels
            layers += [
                nn.Conv1d(inc, out_channels, kernel_size, padding=kernel_size // 2),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(),
                SqueezeExcite(out_channels),
                nn.Dropout(0.2)
            ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):    # x: (B, T, F)
        """Apply the stages along the time axis and return a (B, T, C) tensor."""
        x = x.transpose(1, 2)  # (B, F, T): Conv1d expects channels before time
        return self.net(x).transpose(1, 2)  # (B, T, C)

# ---------- Improved Hybrid Model ----------
class HybridModel(nn.Module):
    """Conv stem + Transformer encoder + meta cross-attention regressor.

    Pipeline (see forward): ConvStem over the features, projection to d_model,
    addition of year/sector/positional terms, Transformer encoding,
    cross-attention with the year+sector projection as query, attention
    pooling over time and an MLP head producing one value per sequence.

    Args:
        feat_dim: Size of the per-step feature vector.
        emb_dim: Size of the per-step sector embedding.
        d_model: Width of the Transformer / attention layers.
        heads: Number of attention heads.
        layers: Number of Transformer encoder layers.
    """
    def __init__(self, feat_dim, emb_dim, d_model=128, heads=4, layers=2):
        super().__init__()
        # Convolutional stem
        self.stem = ConvStem(feat_dim, out_channels=64, depth=2)
        # Feature projection to d_model
        self.feat_proj = nn.Linear(64, d_model)
        # Year embedding
        self.year_proj = nn.Linear(1, d_model)
        # Sector projection
        self.sec_proj  = nn.Linear(emb_dim, d_model)
        # Learned positional encoding for up to 100 time steps
        self.pos_enc   = nn.Parameter(torch.randn(1, 100, d_model))
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=heads,
            dim_feedforward=256,
            dropout=0.2,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=layers)
        # Cross-attention between meta and features
        self.cross_attn  = nn.MultiheadAttention(d_model, heads, batch_first=True, dropout=0.2)
        self.norm        = nn.LayerNorm(d_model)
        # Attention pooling + MLP head
        self.pool        = AttentionPooling(d_model)
        self.mlp         = nn.Sequential(
            nn.Linear(d_model, d_model//2),
            nn.LayerNorm(d_model//2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(d_model//2, 1)
        )
        # Weight initialization: applies to every nn.Linear in the module tree,
        # including those inside the sub-modules built above.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=math.sqrt(2.0))
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x, year, sector, extra=None):
        """Predict one value per sequence.

        Args:
            x: (B, T, F) features.
            year: (B, T, 1) year values.
            sector: (B, T, emb_dim) sector embeddings.
            extra: Unused; kept for call compatibility.

        Returns:
            Tensor of shape (B, 1).

        Raises:
            ValueError: If T exceeds the number of learned positions.
        """
        # x: (B,T,F); year: (B,T,1); sector: (B,T,emb_dim)
        B, T, _ = x.shape
        max_len = self.pos_enc.size(1)
        if T > max_len:
            # Slicing would silently return fewer than T positions and the
            # addition below would fail with an opaque broadcast error.
            raise ValueError(
                f"sequence length {T} exceeds the {max_len} learned positions"
            )
        # Conv stem
        h = self.stem(x)                  # (B,T,64)
        h = self.feat_proj(h)             # (B,T,d_model)
        # Meta
        ye = self.year_proj(year)         # (B,T,d_model)
        se = self.sec_proj(sector)        # (B,T,d_model)
        # Positional + sum
        h = h + ye + se + self.pos_enc[:, :T, :]
        # Transformer encoding
        h = self.transformer(h)           # (B,T,d_model)
        # Cross-attention: meta as query, encoded sequence as key and value
        meta = ye + se                    # (B,T,d_model)
        attn_out, _ = self.cross_attn(meta, h, h)
        h = self.norm(h + attn_out)
        # Pool and output
        pooled = self.pool(h)             # (B,d_model)
        return self.mlp(pooled)           # (B,1)

# ---------- Sequence Preparation ----------
def prepare_sequence(df, target_col):
    """Turn one company's rows into a single-sample DataLoader.

    Rows are ordered by 'Year'. The inputs are the feature columns, extended
    with 'Target 1' when predicting 'Target 2' and with 'Target 1' and
    'Target 2' when predicting 'Target 3'. The label is the target value of
    the last (latest) row.

    Returns:
        DataLoader over one (X, year, sector, y) sample with shapes
        (1, T, F), (1, T, 1), (1, T, E) and (1, 1).
    """
    ordered = df.sort_values('Year')

    # Collect the input column groups, each cast to float32 on its own.
    parts = [ordered[feature_cols].values.astype(np.float32)]
    if target_col == 'Target 2':
        parts.append(ordered['Target 1'].values.reshape(-1, 1).astype(np.float32))
    elif target_col == 'Target 3':
        parts.append(ordered[['Target 1', 'Target 2']].values.astype(np.float32))
    features = np.concatenate(parts, axis=1) if len(parts) > 1 else parts[0]

    years_arr = ordered['Year'].values.astype(np.float32).reshape(1, -1, 1)
    sector_arr = np.stack(ordered['Sector_Emb'].values).astype(np.float32)
    last_target = ordered[target_col].values.astype(np.float32)[-1]

    X = torch.tensor(features, device=device).unsqueeze(0)
    year = torch.tensor(years_arr, device=device)
    sec = torch.tensor(sector_arr, device=device).unsqueeze(0)
    y = torch.tensor(last_target, device=device).view(1, 1)

    dataset = TensorDataset(X, year, sec, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

# ---------- Early Stopping ----------
class EarlyStopping:
    """Track the best validation loss, checkpoint on improvement, flag when to stop.

    The instance is called once per epoch as ``stopper(val_loss, model)``.
    A loss that beats the best one so far by more than `delta` saves
    ``model.state_dict()`` to `path` and resets the counter. Otherwise the
    counter is incremented and `stop` becomes True once it reaches `patience`.

    Args:
        patience: Consecutive non-improving calls tolerated before stopping.
        delta: Minimum improvement that counts.
        path: File the best state dict is written to.
    """
    def __init__(self, patience=patience, delta=1e-4, path='chkpt.pt'):
        self.patience, self.delta, self.path = patience, delta, path
        self.best, self.counter, self.stop = None, 0, False

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update `stop`."""
        score = -val_loss  # higher is better
        if self.best is None or score > self.best + self.delta:
            self.best = score
            torch.save(model.state_dict(), self.path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.stop = True

# ---------- Training Function ----------
def train_model_fn(model, train_loader, val_loader):
    """Train `model` with AdamW and MSE loss, early-stopping on validation RMSE.

    Runs at most `epochs` epochs. After each one the validation RMSE drives
    both the ReduceLROnPlateau scheduler and the EarlyStopping checkpoint.
    The best checkpoint is loaded back before returning.

    Returns:
        (model, history) where history['tr'] and history['va'] hold the
        per-epoch train and validation RMSE.
    """
    def _rmse(batch_mse):
        # Root of the mean per-batch MSE; NaN when the loader yielded nothing.
        return math.sqrt(np.mean(batch_mse)) if batch_mse else float('nan')

    optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=3, min_lr=1e-6
    )
    loss_fn = nn.MSELoss()
    stopper = EarlyStopping()
    history = {'tr': [], 'va': []}

    for _ in range(epochs):
        # ---- training pass ----
        model.train()
        train_mse = []
        for feats, years, sectors, target in train_loader:
            optimizer.zero_grad()
            batch_loss = loss_fn(model(feats, years, sectors), target)
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_mse.append(batch_loss.item())
        tr_rmse = _rmse(train_mse)

        # ---- validation pass ----
        model.eval()
        with torch.no_grad():
            val_mse = [
                loss_fn(model(feats, years, sectors), target).item()
                for feats, years, sectors, target in val_loader
            ]
        va_rmse = _rmse(val_mse)

        scheduler.step(va_rmse)
        history['tr'].append(tr_rmse)
        history['va'].append(va_rmse)

        stopper(va_rmse, model)
        if stopper.stop:
            break

    # Restore the weights saved at the best validation score.
    model.load_state_dict(torch.load(stopper.path))
    return model, history

# ---------- Train Target 3 (example) ----------
'''df3 = get_sector_embeddings(df_target3)
companies3 = df3['Company_encoded'].unique()
tot_tr = tot_va = tot_ep = cnt = skip = 0
pbar = tqdm(companies3, desc="Training Target 3")

for cid in pbar:
    sub = df3[df3['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())
    split = int(TRAIN_SPLIT_RATIO * len(years))
    tr_df = sub[sub['Year'].isin(years[:split])]
    va_df = sub[sub['Year'].isin(years[split:])]

    if len(tr_df) < 3 or len(va_df) < 2:
        skip += 1
        pbar.update(1)
        continue

    tl = prepare_sequence(tr_df, 'Target 3')
    vl = prepare_sequence(va_df, 'Target 3')
    model3 = HybridModel(
        feat_dim=tl.dataset.tensors[0].shape[-1],
        emb_dim=tl.dataset.tensors[2].shape[-1],
        d_model=128, heads=4, layers=2
    ).to(device)

    model3, hist3 = train_model_fn(model3, tl, vl)
    tr_rm, va_rm, ep = hist3['tr'][-1], hist3['va'][-1], len(hist3['va'])

    tot_tr += tr_rm; tot_va += va_rm; tot_ep += ep; cnt += 1
    pbar.set_postfix({
        'avg_tr': f"{tot_tr/cnt:.4f}",
        'avg_va': f"{tot_va/cnt:.4f}",
        'avg_ep': f"{tot_ep/cnt:.1f}"
    })
    pbar.update(1)

pbar.close()
print(f"Target 3 ▶ avg_train={tot_tr/cnt:.4f}, avg_val={tot_va/cnt:.4f}, skipped={skip}")'''
# ---------- Train Target 1 ----------
# One model is trained per company on a chronological train/validation split.
df1 = get_sector_embeddings(df_target1)
companies1 = df1['Company_encoded'].unique()
tot_tr1 = tot_va1 = tot_ep1 = cnt1 = skip1 = 0
pbar1 = tqdm(companies1, desc="Training Target 1")
for cid in pbar1:
    sub = df1[df1['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())
    split = int(TRAIN_SPLIT_RATIO * len(years))
    tr_df = sub[sub['Year'].isin(years[:split])]
    va_df = sub[sub['Year'].isin(years[split:])]
    # Skip companies with too little history for either side of the split.
    if len(tr_df) < 3 or len(va_df) < 2:
        skip1 += 1
        continue
    tl = prepare_sequence(tr_df, 'Target 1')
    vl = prepare_sequence(va_df, 'Target 1')
    # Use the model defined in this cell. BalancedModel is only defined in a
    # later cell, so referencing it here fails on Restart & Run All.
    model1 = HybridModel(
        feat_dim=tl.dataset.tensors[0].shape[-1],
        emb_dim=tl.dataset.tensors[2].shape[-1],
        d_model=128, heads=4, layers=2
    ).to(device)
    model1, hist1 = train_model_fn(model1, tl, vl)
    tr_rm1, va_rm1, ep1 = hist1['tr'][-1], hist1['va'][-1], len(hist1['va'])
    tot_tr1 += tr_rm1; tot_va1 += va_rm1; tot_ep1 += ep1; cnt1 += 1
    pbar1.set_postfix(
        avg_tr=f"{tot_tr1/cnt1:.4f}", avg_va=f"{tot_va1/cnt1:.4f}", avg_ep=f"{tot_ep1/cnt1:.1f}"
    )
pbar1.close()
if cnt1:
    print(f"Target 1 ▶ avg_train={tot_tr1/cnt1:.4f}, avg_val={tot_va1/cnt1:.4f}, skipped={skip1}")
else:
    # Every company was skipped: there are no averages to report.
    print(f"Target 1 ▶ no company had enough history to train, skipped={skip1}")

In [None]:


import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# ---------- Device Setup ----------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ---------- Configuration ----------
feature_cols = ["Feature" + str(idx) for idx in range(1, 29)]  # Feature1 .. Feature28
batch_size = 1    # one company sequence per batch
epochs = 51       # upper bound on epochs per company (see train_model_fn)
patience = 8      # Keeping original patience

# ---------- Set Default Tensor Type ----------
# New floating-point tensors are created as float32 unless a dtype is given.
torch.set_default_dtype(torch.float32)

# ---------- DataFrames for Targets ----------
# One frame per target, restricted to rows where that target is present.
# Targets 2 and 3 additionally carry the lower-numbered targets as inputs.
df_target1 = df_final.loc[
    df_final['Target 1'].notna(),
    feature_cols + ['Target 1', 'Year', 'Company_encoded', 'Sector'],
].copy()
df_target2 = df_final.loc[
    df_final['Target 2'].notna(),
    feature_cols + ['Target 2', 'Year', 'Company_encoded', 'Sector', 'Target 1'],
].copy()
df_target3 = df_final.loc[
    df_final['Target 3'].notna(),
    feature_cols + ['Target 3', 'Year', 'Company_encoded', 'Sector', 'Target 1', 'Target 2'],
].copy()

# ---------- Sector Embeddings ----------
def get_sector_embeddings(df):
    """Attach a sentence-embedding vector for each row's sector.

    Missing sectors are treated as the literal label 'Unknown'. Each distinct
    label is encoded once with the 'all-MiniLM-L6-v2' SentenceTransformer and
    the float32 vector is stored per row in a new 'Sector_Emb' column.

    Args:
        df: DataFrame with a 'Sector' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Sector_Emb' added.
    """
    st = SentenceTransformer('all-MiniLM-L6-v2')
    # Fill NaN once and reuse it for both the vocabulary and the lookup.
    # Mapping the unfilled column would leave NaN in 'Sector_Emb' for rows
    # without a sector, which later breaks np.stack in prepare_sequence.
    sector_labels = df['Sector'].fillna('Unknown')
    sectors = sector_labels.unique()
    embs = st.encode(sectors, show_progress_bar=True)
    # Ensure embeddings are float32
    embs = embs.astype(np.float32)
    mapping = dict(zip(sectors, embs))
    df['Sector_Emb'] = sector_labels.map(mapping)
    return df

# ---------- Model Architecture (Balanced) ----------
class Time2Vec(nn.Module):
    """Learned time encoding: one linear term plus dim-1 sinusoidal terms.

    Args:
        dim: Size of the output's last dimension.
    """
    def __init__(self, dim):
        super().__init__()
        # Linear component
        self.w0 = nn.Parameter(torch.randn(1, dtype=torch.float32))
        self.b0 = nn.Parameter(torch.randn(1, dtype=torch.float32))
        # Periodic components
        self.w  = nn.Parameter(torch.randn(dim-1, dtype=torch.float32))
        self.b  = nn.Parameter(torch.randn(dim-1, dtype=torch.float32))

    def forward(self, t):
        """Encode `t` and return a tensor whose last dimension is `dim`.

        `t` must have a trailing dimension of size 1 (prepare_sequence builds
        the year tensor as (1, T, 1)) so that it broadcasts against the
        parameter vectors.
        """
        # Ensure input is float32
        t = t.to(torch.float32)
        v0 = self.w0 * t + self.b0            # (..., 1)
        v  = torch.sin(self.w * t + self.b)   # (..., dim-1)
        return torch.cat([v0, v], dim=-1)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a (B, T, d_model) input.

    Even columns hold sin(pos * div) and odd columns hold cos(pos * div).
    The table is registered as a buffer, so it follows the module across
    devices but is not trained.

    Args:
        d_model: Size of the last dimension of the input.
        max_len: Number of positions precomputed.
    """
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) *
                        -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even columns;
        # trim the cosine terms so the assignment shapes match.
        pe[:, 1::2] = torch.cos(pos * div)[:, : d_model // 2]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return x plus the encodings of its first x.size(1) positions."""
        # Ensure input is float32
        x = x.to(torch.float32)
        return x + self.pe[:, :x.size(1)]

class TransformerBranch(nn.Module):
    """Transformer encoder over the feature sequence with meta cross-attention.

    Args:
        input_dim: Size of the per-step feature vector.
        meta_dim: Size of the per-step meta vector (the sector embedding).
        d_model: Model width.
        heads: Number of attention heads.
        layers: Number of encoder layers.
    """
    def __init__(self, input_dim, meta_dim, d_model=96, heads=6, layers=2):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.time2vec   = Time2Vec(d_model)
        self.pos_enc    = PositionalEncoding(d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model, heads, dim_feedforward=192,
            dropout=0.15,
            batch_first=True
        )
        self.encoder    = nn.TransformerEncoder(enc_layer, layers)
        self.meta_proj  = nn.Linear(meta_dim, d_model)
        self.cross_attn = nn.MultiheadAttention(d_model, heads, batch_first=True)
        self.pool       = nn.AdaptiveAvgPool1d(1)

        # Apply weight initialization to every sub-module
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise nn.Linear weights (gain 0.8) and zero their biases."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight, gain=0.8)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, x, time, meta):
        """Encode one batch of sequences and return a (B, d_model) summary.

        Args:
            x: (B, T, input_dim) features.
            time: Time values with a trailing dimension of 1, e.g. (B, T, 1).
            meta: (B, T, meta_dim) meta vectors.
        """
        # Ensure all inputs are float32
        x = x.to(torch.float32)
        time = time.to(torch.float32)
        meta = meta.to(torch.float32)

        h = self.input_proj(x)
        h = h + self.time2vec(time)
        h = self.pos_enc(h)
        h = self.encoder(h)
        # Meta vectors query the encoded sequence (key = value = h).
        m = self.meta_proj(meta)
        attn_out, _ = self.cross_attn(m, h, h)
        seq = h + attn_out
        seq = seq.transpose(1, 2)  # (B, d_model, T)
        return self.pool(seq).squeeze(-1)  # mean over time

class SimplifiedTCN(nn.Module):
    """Two dilated Conv1d layers followed by LayerNorm and mean pooling over time.

    Args:
        input_dim: Size of the per-step feature vector.
        d_model: Number of convolution channels and size of the output vector.
    """
    def __init__(self, input_dim, d_model=96):
        super().__init__()
        # NOTE(review): with kernel 3 these paddings are larger than "same"
        # padding, so the time axis grows from T to T+2 and then to T+6, and
        # the padded positions take part in the final average. Confirm this is
        # intended (a causal TCN would trim the extra steps).
        self.conv1 = nn.Conv1d(input_dim, d_model, 3, padding=2, dilation=1)
        self.conv2 = nn.Conv1d(d_model, d_model, 3, padding=4, dilation=2)
        self.norm  = nn.LayerNorm(d_model)
        self.act   = nn.GELU()
        self.pool  = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.15)

    def forward(self, x):
        """Map a (B, T, input_dim) sequence to a (B, d_model) vector."""
        # Ensure input is float32
        x = x.to(torch.float32)
        h = x.transpose(1, 2)                       # channels before time for Conv1d
        h = self.dropout(self.act(self.conv1(h)))
        h = self.act(self.conv2(h))
        h = h.transpose(1, 2)                       # channels last for LayerNorm
        h = self.norm(h)
        return self.pool(h.transpose(1, 2)).squeeze(-1)

class BalancedModel(nn.Module):
    """Two-branch regressor: TransformerBranch and SimplifiedTCN fused by an MLP.

    Args:
        input_dim: Size of the per-step feature vector.
        meta_dim: Size of the per-step meta vector (the sector embedding).
        d_model: Width of each branch's output vector.
    """
    def __init__(self, input_dim, meta_dim, d_model=96):
        super().__init__()
        self.trans = TransformerBranch(input_dim, meta_dim, d_model)
        self.tcn   = SimplifiedTCN(input_dim, d_model)
        self.fc    = nn.Sequential(
            nn.Linear(2*d_model, d_model),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(d_model, 1)
        )

    def forward(self, x, time, meta):
        """Return one prediction per sequence, shape (B, 1)."""
        h1 = self.trans(x, time, meta)   # (B, d_model)
        h2 = self.tcn(x)                 # (B, d_model)
        return self.fc(torch.cat([h1, h2], dim=-1))

# ---------- Sequence Preparation ----------
def prepare_sequence(df, target_col):
    """Turn one company's rows into a single-sample DataLoader.

    Rows are ordered by 'Year'. The inputs are the feature columns, extended
    with 'Target 1' when predicting 'Target 2' and with 'Target 1' and
    'Target 2' when predicting 'Target 3'. The label is the target value of
    the last (latest) row.

    Returns:
        DataLoader over one (X, year, sector, y) sample with shapes
        (1, T, F), (1, T, 1), (1, T, E) and (1, 1).
    """
    def _f32_tensor(array):
        # Every tensor in the sample is float32 and lives on `device`.
        return torch.tensor(array, dtype=torch.float32, device=device)

    ordered = df.sort_values('Year')

    # Feature matrix, optionally widened with the lower-numbered targets.
    features = ordered[feature_cols].values.astype(np.float32)
    if target_col == 'Target 2':
        prior = ordered['Target 1'].values.reshape(-1, 1).astype(np.float32)
        features = np.hstack([features, prior])
    if target_col == 'Target 3':
        prior = ordered[['Target 1', 'Target 2']].values.astype(np.float32)
        features = np.hstack([features, prior])

    years_arr = ordered['Year'].values.astype(np.float32).reshape(1, -1, 1)
    sector_arr = np.stack(ordered['Sector_Emb'].values).astype(np.float32)
    # Target: last year
    label_arr = ordered[target_col].values.astype(np.float32)[-1:].reshape(1, 1)

    X = _f32_tensor(features).unsqueeze(0)
    year = _f32_tensor(years_arr)
    sector = _f32_tensor(sector_arr).unsqueeze(0)
    y = _f32_tensor(label_arr)

    dataset = TensorDataset(X, year, sector, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

# ---------- Early Stopping ----------
class EarlyStopping:
    """Track the best validation loss, checkpoint on improvement, flag when to stop.

    The instance is called once per epoch as ``stopper(val_loss, model)``.
    A loss that beats the best one so far by more than `delta` saves
    ``model.state_dict()`` to `path` and resets the counter. Otherwise the
    counter is incremented and `stop` becomes True once it reaches `patience`.

    Args:
        patience: Consecutive non-improving calls tolerated before stopping.
        delta: Minimum improvement that counts.
        path: File the best state dict is written to.
    """
    def __init__(self, patience=patience, delta=0.0005, path='chkpt.pt'):
        self.patience = patience
        self.delta    = delta
        self.path     = path
        self.best     = None
        self.counter  = 0
        self.stop     = False

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update `stop`."""
        score = -val_loss  # higher is better
        if self.best is None or score > self.best + self.delta:
            self.best = score
            torch.save(model.state_dict(), self.path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.stop = True

# ---------- Training Function ----------
def train_model_fn(model, train_loader, val_loader):
    """Train `model` with Adam and MSE loss, early-stopping on validation RMSE.

    Runs at most `epochs` epochs. After each one the validation RMSE drives
    both the ReduceLROnPlateau scheduler and the EarlyStopping checkpoint.
    The best checkpoint is loaded back before returning.

    Returns:
        (model, history) where history['tr'] and history['va'] hold the
        per-epoch train and validation RMSE.
    """
    def _as_float32(*tensors):
        # Ensure all data is float32
        return tuple(t.to(torch.float32) for t in tensors)

    def _rmse(batch_mse):
        # Root of the mean per-batch MSE; NaN when the loader yielded nothing.
        return math.sqrt(np.mean(batch_mse)) if batch_mse else float('nan')

    optimizer = optim.Adam(model.parameters(), lr=8e-4, weight_decay=5e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7, patience=4)
    loss_fn = nn.MSELoss()
    stopper = EarlyStopping()
    history = {'tr': [], 'va': []}

    for _ in range(epochs):
        # ---- training pass ----
        model.train()
        train_mse = []
        for batch in train_loader:
            feats, years, sectors, target = _as_float32(*batch)

            optimizer.zero_grad()
            batch_loss = loss_fn(model(feats, years, sectors), target)
            batch_loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_mse.append(batch_loss.item())
        tr_rmse = _rmse(train_mse)

        # ---- validation pass ----
        model.eval()
        val_mse = []
        with torch.no_grad():
            for batch in val_loader:
                feats, years, sectors, target = _as_float32(*batch)
                val_mse.append(loss_fn(model(feats, years, sectors), target).item())
        va_rmse = _rmse(val_mse)

        scheduler.step(va_rmse)

        history['tr'].append(tr_rmse)
        history['va'].append(va_rmse)
        stopper(va_rmse, model)
        if stopper.stop:
            break

    # Restore the weights saved at the best validation score.
    model.load_state_dict(torch.load(stopper.path))
    return model, history

# ---------- Training Target 1 ----------
'''df1 = get_sector_embeddings(df_target1)
companies1 = df1['Company_encoded'].unique()
tot_tr1 = tot_va1 = tot_ep1 = cnt1 = skip1 = 0
pbar1 = tqdm(companies1, desc="Training Target 1")
for cid in pbar1:
    sub = df1[df1['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Updated splitting logic with time-based holdout
    if len(years) < 3:  # For very small datasets
        skip1 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    tl = prepare_sequence(tr_df, 'Target 1')
    vl = prepare_sequence(va_df, 'Target 1')
    model1 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model1, hist1 = train_model_fn(model1, tl, vl)
    tr_rm1, va_rm1, ep1 = hist1['tr'][-1], hist1['va'][-1], len(hist1['va'])
    tot_tr1 += tr_rm1; tot_va1 += va_rm1; tot_ep1 += ep1; cnt1 += 1
    pbar1.set_postfix(
        avg_tr=f"{tot_tr1/cnt1:.4f}", avg_va=f"{tot_va1/cnt1:.4f}", avg_ep=f"{tot_ep1/cnt1:.1f}"
    )
pbar1.close()
print(f"Target 1 ▶ avg_train={tot_tr1/cnt1:.4f}, avg_val={tot_va1/cnt1:.4f}, skipped={skip1}")
'''
# ---------- Training Target 2 ----------
# One model is trained per company on a chronological train/validation split.
df2 = get_sector_embeddings(df_target2)
companies2 = df2['Company_encoded'].unique()
tot_tr2 = tot_va2 = tot_ep2 = cnt2 = skip2 = 0
pbar2 = tqdm(companies2, desc="Training Target 2")
for cid in pbar2:
    sub = df2[df2['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Updated splitting logic with time-based holdout
    if len(years) < 3:  # For very small datasets
        skip2 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    tl = prepare_sequence(tr_df, 'Target 2')
    vl = prepare_sequence(va_df, 'Target 2')
    model2 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model2, hist2 = train_model_fn(model2, tl, vl)
    tr_rm2, va_rm2, ep2 = hist2['tr'][-1], hist2['va'][-1], len(hist2['va'])
    tot_tr2 += tr_rm2; tot_va2 += va_rm2; tot_ep2 += ep2; cnt2 += 1
    pbar2.set_postfix(
        avg_tr=f"{tot_tr2/cnt2:.4f}", avg_va=f"{tot_va2/cnt2:.4f}", avg_ep=f"{tot_ep2/cnt2:.1f}"
    )
pbar2.close()
if cnt2:
    print(f"Target 2 ▶ avg_train={tot_tr2/cnt2:.4f}, avg_val={tot_va2/cnt2:.4f}, skipped={skip2}")
    # NOTE(review): model2 is re-created for every company, so this persists
    # only the model of the last company that was trained — confirm intended.
    torch.save({
        'model_state_dict': model2.state_dict(),
        'input_dim': tl.dataset.tensors[0].shape[-1],
        'meta_dim': tl.dataset.tensors[2].shape[-1],
    }, 'model2_trained.pt')
else:
    # Every company was skipped: there are no averages to report and no
    # model2 / tl from this run to save.
    print(f"Target 2 ▶ no company had enough history to train, skipped={skip2}")

# ---------- Training Target 3 ----------
'''df3 = get_sector_embeddings(df_target3)
companies3 = df3['Company_encoded'].unique()
tot_tr3 = tot_va3 = tot_ep3 = cnt3 = skip3 = 0
pbar3 = tqdm(companies3, desc="Training Target 3")
for cid in pbar3:
    sub = df3[df3['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Updated splitting logic with time-based holdout
    if len(years) < 3:  # For very small datasets
        skip3 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    tl = prepare_sequence(tr_df, 'Target 3')
    vl = prepare_sequence(va_df, 'Target 3')
    model3 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model3, hist3= train_model_fn(model3, tl, vl)
    tr_rm3, va_rm3, ep3 = hist3['tr'][-1], hist3['va'][-1], len(hist3['va'])
    tot_tr3 += tr_rm3; tot_va3 += va_rm3; tot_ep3 += ep3; cnt3 += 1
    pbar3.set_postfix(
        avg_tr=f"{tot_tr3/cnt3:.4f}", avg_va=f"{tot_va3/cnt3:.4f}", avg_ep=f"{tot_ep3/cnt3:.1f}"
    )
pbar3.close()
print(f"Target 3 ▶ avg_train={tot_tr3/cnt3:.4f}, avg_val={tot_va3/cnt3:.4f}, skipped={skip3}")'''

In [None]:
# ---------- Model 3 ----------

import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# ---------- Device Setup ----------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ---------- Configuration ----------
feature_cols = ["Feature" + str(idx) for idx in range(1, 29)]  # Feature1 .. Feature28
batch_size = 1    # one company sequence per batch
epochs = 51       # upper bound on epochs per company
patience = 8      # Keeping original patience

# ---------- Set Default Tensor Type ----------
# New floating-point tensors are created as float32 unless a dtype is given.
torch.set_default_dtype(torch.float32)

# ---------- DataFrames for Targets ----------
# One frame per target, restricted to rows where that target is present.
# Targets 2 and 3 additionally carry the lower-numbered targets as inputs.
df_target1 = df_final.loc[
    df_final['Target 1'].notna(),
    feature_cols + ['Target 1', 'Year', 'Company_encoded', 'Sector'],
].copy()
df_target2 = df_final.loc[
    df_final['Target 2'].notna(),
    feature_cols + ['Target 2', 'Year', 'Company_encoded', 'Sector', 'Target 1'],
].copy()
df_target3 = df_final.loc[
    df_final['Target 3'].notna(),
    feature_cols + ['Target 3', 'Year', 'Company_encoded', 'Sector', 'Target 1', 'Target 2'],
].copy()

# ---------- Sector Embeddings ----------
def get_sector_embeddings(df):
    """Attach a sentence-embedding vector for each row's sector.

    Missing sectors are treated as the literal label 'Unknown'. Each distinct
    label is encoded once with the 'all-MiniLM-L6-v2' SentenceTransformer and
    the float32 vector is stored per row in a new 'Sector_Emb' column.

    Args:
        df: DataFrame with a 'Sector' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Sector_Emb' added.
    """
    st = SentenceTransformer('all-MiniLM-L6-v2')
    # Fill NaN once and reuse it for both the vocabulary and the lookup.
    # Mapping the unfilled column would leave NaN in 'Sector_Emb' for rows
    # without a sector, which later breaks np.stack in prepare_sequence.
    sector_labels = df['Sector'].fillna('Unknown')
    sectors = sector_labels.unique()
    embs = st.encode(sectors, show_progress_bar=True)
    # Ensure embeddings are float32
    embs = embs.astype(np.float32)
    mapping = dict(zip(sectors, embs))
    df['Sector_Emb'] = sector_labels.map(mapping)
    return df

# ---------- Model Architecture (Balanced) ----------
class Time2Vec(nn.Module):
    """Learned time encoding: one linear term plus dim-1 sinusoidal terms.

    Args:
        dim: Size of the output's last dimension.
    """
    def __init__(self, dim):
        super().__init__()
        # Linear component
        self.w0 = nn.Parameter(torch.randn(1, dtype=torch.float32))
        self.b0 = nn.Parameter(torch.randn(1, dtype=torch.float32))
        # Periodic components
        self.w  = nn.Parameter(torch.randn(dim-1, dtype=torch.float32))
        self.b  = nn.Parameter(torch.randn(dim-1, dtype=torch.float32))

    def forward(self, t):
        """Encode `t` and return a tensor whose last dimension is `dim`.

        `t` must have a trailing dimension of size 1 (prepare_sequence builds
        the year tensor as (1, T, 1)) so that it broadcasts against the
        parameter vectors.
        """
        # Ensure input is float32
        t = t.to(torch.float32)
        v0 = self.w0 * t + self.b0            # (..., 1)
        v  = torch.sin(self.w * t + self.b)   # (..., dim-1)
        return torch.cat([v0, v], dim=-1)

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a (B, T, d_model) input.

    Even columns hold sin(pos * div) and odd columns hold cos(pos * div).
    The table is registered as a buffer, so it follows the module across
    devices but is not trained.

    Args:
        d_model: Size of the last dimension of the input.
        max_len: Number of positions precomputed.
    """
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) *
                        -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even columns;
        # trim the cosine terms so the assignment shapes match.
        pe[:, 1::2] = torch.cos(pos * div)[:, : d_model // 2]
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return x plus the encodings of its first x.size(1) positions."""
        # Ensure input is float32
        x = x.to(torch.float32)
        return x + self.pe[:, :x.size(1)]

class TransformerBranch(nn.Module):
    """Transformer encoder over the feature sequence with meta cross-attention.

    Args:
        input_dim: Size of the per-step feature vector.
        meta_dim: Size of the per-step meta vector (the sector embedding).
        d_model: Model width.
        heads: Number of attention heads.
        layers: Number of encoder layers.
        dropout: Dropout rate used in the encoder layer, the cross-attention
            and on the attention output before the residual connection.
    """
    def __init__(self, input_dim, meta_dim, d_model=64, heads=4, layers=1, dropout=0.25):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.time2vec   = Time2Vec(d_model)
        self.pos_enc    = PositionalEncoding(d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model, heads,
            dim_feedforward=128,
            dropout=dropout,
            batch_first=True
        )
        self.encoder    = nn.TransformerEncoder(enc_layer, layers)
        self.meta_proj  = nn.Linear(meta_dim, d_model)
        self.cross_attn = nn.MultiheadAttention(
            d_model, heads, dropout=dropout, batch_first=True
        )
        self.norm1      = nn.LayerNorm(d_model)
        self.drop1      = nn.Dropout(dropout)
        self.pool       = nn.AdaptiveAvgPool1d(1)

        # Apply weight initialization to every sub-module
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise nn.Linear weights (gain 0.8) and zero their biases."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight, gain=0.8)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, x, time, meta):
        """Encode one batch of sequences and return a (B, d_model) summary.

        Args:
            x: (B, T, input_dim) features.
            time: Time values with a trailing dimension of 1, e.g. (B, T, 1).
            meta: (B, T, meta_dim) meta vectors.
        """
        x = x.to(torch.float32)
        time = time.to(torch.float32)
        meta = meta.to(torch.float32)

        h = self.input_proj(x) + self.time2vec(time)
        h = self.pos_enc(h)
        h = self.encoder(h)

        # Meta vectors query the encoded sequence (key = value = h).
        m = self.meta_proj(meta)
        attn_out, _ = self.cross_attn(m, h, h)
        seq = self.norm1(h + self.drop1(attn_out))

        seq = seq.transpose(1, 2)           # (B, d_model, T)
        return self.pool(seq).squeeze(-1)   # mean over time

# Define the BalancedModel class that was missing in the original code
class BalancedModel(nn.Module):
    """TransformerBranch followed by a single linear output layer.

    Args:
        input_dim: Size of the per-step feature vector.
        meta_dim: Size of the per-step meta vector (the sector embedding).
        d_model: Width of the transformer branch and of its output vector.
    """
    def __init__(self, input_dim, meta_dim, d_model=64):
        super().__init__()
        self.transformer = TransformerBranch(input_dim, meta_dim, d_model=d_model)
        self.output = nn.Linear(d_model, 1)

    def forward(self, x, time, meta):
        """Return one prediction per sequence, shape (B, 1)."""
        features = self.transformer(x, time, meta)   # (B, d_model)
        return self.output(features)

# ---------- Sequence Preparation ----------
def prepare_sequence(df, target_col):
    """Turn one frame of rows into a single-sequence ``DataLoader``.

    Rows are ordered by ``Year``. The feature matrix is built from
    ``feature_cols``; for ``'Target 2'`` the ``'Target 1'`` column is appended
    and for ``'Target 3'`` both ``'Target 1'`` and ``'Target 2'`` are appended.
    The label is the ``target_col`` value of the last (latest-year) row.

    Args:
        df: Frame holding ``feature_cols``, ``Year``, ``Sector_Emb``,
            ``target_col`` and any earlier targets needed as inputs.
        target_col: Name of the column to predict.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of
        ``(features, years, sector_embeddings, target)``, each with a leading
        axis of size 1, created on ``device`` and never shuffled.
    """
    ordered = df.sort_values('Year')

    def as_tensor(array):
        # Every tensor shares the same dtype and device.
        return torch.tensor(array, dtype=torch.float32, device=device)

    # Earlier targets become additional input columns for later targets.
    extra_inputs = {
        'Target 2': ['Target 1'],
        'Target 3': ['Target 1', 'Target 2'],
    }.get(target_col, [])

    blocks = [ordered[feature_cols].values.astype(np.float32)]
    if extra_inputs:
        blocks.append(ordered[extra_inputs].values.astype(np.float32))
    features = as_tensor(np.hstack(blocks)).unsqueeze(0)

    # Calendar year of every step, used as the time input.
    years = as_tensor(ordered['Year'].values.astype(np.float32).reshape(1, -1, 1))

    # One sector embedding vector per row, stacked into a matrix.
    sector_matrix = np.stack(ordered['Sector_Emb'].values).astype(np.float32)
    sector = as_tensor(sector_matrix).unsqueeze(0)

    # Only the final row's target value is used as the label.
    last_target = ordered[target_col].values.astype(np.float32)[-1:].reshape(1, 1)
    label = as_tensor(last_target)

    dataset = TensorDataset(features, years, sector, label)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

# ---------- Early Stopping ----------
class EarlyStopping:
    """Track validation loss, checkpoint the best model and flag when to stop.

    Call the instance once per epoch with the validation loss and the model.
    An improvement saves ``model.state_dict()`` to ``path`` and resets the
    counter; anything else increments the counter, and ``stop`` becomes True
    once the counter reaches ``patience``.

    Args:
        patience: Consecutive non-improving calls tolerated before ``stop`` is
            set. Defaults to the module-level ``patience`` setting.
        delta: Amount by which the loss must drop below the best seen so far
            to count as an improvement.
        path: File the best ``state_dict`` is written to.
    """

    # Both special methods need double underscores: ``__init__`` so the
    # attributes exist after construction, ``__call__`` so ``stopper(loss, model)``
    # works.
    def __init__(self, patience=patience, delta=0.0005, path='chkpt.pt'):
        self.patience = patience
        self.delta    = delta
        self.path     = path
        self.best     = None   # best score (negated loss) seen so far
        self.counter  = 0      # consecutive calls without improvement
        self.stop     = False

    def __call__(self, val_loss, model):
        # Negate the loss so that "higher is better" in the comparison below.
        score = -val_loss
        if self.best is None or score > self.best + self.delta:
            self.best = score
            torch.save(model.state_dict(), self.path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.stop = True

# ---------- Training Function ----------
def train_model_fn(model, train_loader, val_loader):
    """Fit ``model`` with Adam, LR-on-plateau scheduling and early stopping.

    Each epoch runs one pass over ``train_loader`` (MSE loss, gradient norm
    clipped to 1.0) and one gradient-free pass over ``val_loader``. The
    per-epoch metric is the square root of the mean batch MSE, or NaN when
    the loader yielded no batches. The validation metric drives both the
    scheduler and the ``EarlyStopping`` helper, and training ends after at
    most ``epochs`` epochs.

    Returns:
        ``(model, history)`` where the model has been reloaded from the
        early-stopping checkpoint and ``history`` holds the per-epoch metrics
        under the keys ``'tr'`` and ``'va'``.
    """
    adam = optim.Adam(model.parameters(), lr=8e-4, weight_decay=5e-5)
    lr_sched = optim.lr_scheduler.ReduceLROnPlateau(adam, factor=0.7, patience=4)
    mse = nn.MSELoss()
    early = EarlyStopping()
    train_curve, val_curve = [], []

    def as_float32(batch):
        # Ensure every tensor of the batch is float32.
        return tuple(tensor.to(torch.float32) for tensor in batch)

    def root_of_mean(losses):
        return math.sqrt(np.mean(losses)) if losses else float('nan')

    for _ in range(epochs):
        # ---- optimisation pass ----
        model.train()
        batch_losses = []
        for batch in train_loader:
            feats, years, sector, target = as_float32(batch)

            adam.zero_grad()
            batch_loss = mse(model(feats, years, sector), target)
            batch_loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            adam.step()
            batch_losses.append(batch_loss.item())
        train_curve.append(root_of_mean(batch_losses))

        # ---- validation pass ----
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                feats, years, sector, target = as_float32(batch)
                val_losses.append(mse(model(feats, years, sector), target).item())
        val_metric = root_of_mean(val_losses)
        val_curve.append(val_metric)

        lr_sched.step(val_metric)
        early(val_metric, model)
        if early.stop:
            break

    # Restore the weights written by the early-stopping helper.
    model.load_state_dict(torch.load(early.path))
    return model, {'tr': train_curve, 'va': val_curve}

# ---------- Training Target 1 with Time-Based Holdout Variation ----------
# NOTE: everything below is wrapped in a triple-quoted string, so Python
# evaluates it as an unused string expression and none of it runs.
# It mirrors the active Target 3 loop with 'Target 1' as the target column.
# If re-enabled, it rebinds `df1`, which the first cell uses for the frame
# read from df_u.csv.
'''df1 = get_sector_embeddings(df_target1)
companies1 = df1['Company_encoded'].unique()
tot_tr1 = tot_va1 = tot_ep1 = cnt1 = skip1 = 0
pbar1 = tqdm(companies1, desc="Training Target 1")
for cid in pbar1:
    sub = df1[df1['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Apply the time-based holdout variation
    if len(years) < 3:  # For very small datasets
        skip1 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    # Ensure minimum sizes for both training and validation sets
    if len(tr_df) < 2 or len(va_df) < 1:
        skip1 += 1
        continue

    tl = prepare_sequence(tr_df, 'Target 1')
    vl = prepare_sequence(va_df, 'Target 1')
    model1 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model1, hist1 = train_model_fn(model1, tl, vl)
    tr_rm1, va_rm1, ep1 = hist1['tr'][-1], hist1['va'][-1], len(hist1['va'])
    tot_tr1 += tr_rm1; tot_va1 += va_rm1; tot_ep1 += ep1; cnt1 += 1
    pbar1.set_postfix(
        avg_tr=f"{tot_tr1/cnt1:.4f}", avg_va=f"{tot_va1/cnt1:.4f}", avg_ep=f"{tot_ep1/cnt1:.1f}"
    )
pbar1.close()
print(f"Target 1 ▶ avg_train={tot_tr1/cnt1:.4f}, avg_val={tot_va1/cnt1:.4f}, skipped={skip1}")'''

# ---------- Training Target 2 with Time-Based Holdout Variation ----------
# NOTE: everything below is wrapped in a triple-quoted string, so Python
# evaluates it as an unused string expression and none of it runs.
# It mirrors the active Target 3 loop with 'Target 2' as the target column.
'''df2 = get_sector_embeddings(df_target2)
companies2 = df2['Company_encoded'].unique()
tot_tr2 = tot_va2 = tot_ep2 = cnt2 = skip2 = 0
pbar2 = tqdm(companies2, desc="Training Target 2")
for cid in pbar2:
    sub = df2[df2['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Apply the time-based holdout variation
    if len(years) < 3:  # For very small datasets
        skip2 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    # Ensure minimum sizes for both training and validation sets
    if len(tr_df) < 2 or len(va_df) < 1:
        skip2 += 1
        continue

    tl = prepare_sequence(tr_df, 'Target 2')
    vl = prepare_sequence(va_df, 'Target 2')
    model2 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model2, hist2 = train_model_fn(model2, tl, vl)
    tr_rm2, va_rm2, ep2 = hist2['tr'][-1], hist2['va'][-1], len(hist2['va'])
    tot_tr2 += tr_rm2; tot_va2 += va_rm2; tot_ep2 += ep2; cnt2 += 1
    pbar2.set_postfix(
        avg_tr=f"{tot_tr2/cnt2:.4f}", avg_va=f"{tot_va2/cnt2:.4f}", avg_ep=f"{tot_ep2/cnt2:.1f}"
    )
pbar2.close()
print(f"Target 2 ▶ avg_train={tot_tr2/cnt2:.4f}, avg_val={tot_va2/cnt2:.4f}, skipped={skip2}")'''

# ---------- Training Target 3 with Time-Based Holdout Variation ----------
# A fresh BalancedModel is trained for every company on that company's own
# rows. Running averages of the last-epoch train/val metric and of the epoch
# count are shown on the progress bar and summarised after the loop.
df3 = get_sector_embeddings(df_target3)
companies3 = df3['Company_encoded'].unique()
tot_tr3 = tot_va3 = tot_ep3 = cnt3 = skip3 = 0
model3 = None  # stays None if every company ends up skipped
pbar3 = tqdm(companies3, desc="Training Target 3")
for cid in pbar3:
    sub = df3[df3['Company_encoded'] == cid]
    years = sorted(sub['Year'].unique())

    # Apply the time-based holdout variation
    if len(years) < 3:  # For very small datasets
        skip3 += 1
        continue
    elif len(years) <= 5:  # For small datasets
        tr_df = sub[sub['Year'] != years[-1]]  # All but last year
        va_df = sub[sub['Year'] == years[-1]]  # Just last year
    else:  # Normal split for larger datasets
        split = int(0.8 * len(years))
        tr_df = sub[sub['Year'].isin(years[:split])]
        va_df = sub[sub['Year'].isin(years[split:])]

    # Ensure minimum sizes for both training and validation sets
    if len(tr_df) < 2 or len(va_df) < 1:
        skip3 += 1
        continue

    tl = prepare_sequence(tr_df, 'Target 3')
    vl = prepare_sequence(va_df, 'Target 3')
    # Input sizes are read back from the prepared tensors so the model always
    # matches the number of feature and sector-embedding columns.
    model3 = BalancedModel(
        input_dim=tl.dataset.tensors[0].shape[-1],
        meta_dim=tl.dataset.tensors[2].shape[-1]
    ).to(device)
    model3, hist3 = train_model_fn(model3, tl, vl)
    # The history entries are those of the last epoch that ran, which is not
    # necessarily the epoch whose checkpoint train_model_fn reloaded.
    tr_rm3, va_rm3, ep3 = hist3['tr'][-1], hist3['va'][-1], len(hist3['va'])
    tot_tr3 += tr_rm3; tot_va3 += va_rm3; tot_ep3 += ep3; cnt3 += 1
    pbar3.set_postfix(
        avg_tr=f"{tot_tr3/cnt3:.4f}", avg_va=f"{tot_va3/cnt3:.4f}", avg_ep=f"{tot_ep3/cnt3:.1f}"
    )
pbar3.close()

save_path = "model3.pth"
if cnt3 == 0 or model3 is None:
    # No company was trained: the averages would divide by zero and there is
    # no model to save.
    print(f"Target 3 ▶ no company had enough data to train, skipped={skip3}")
else:
    print(f"Target 3 ▶ avg_train={tot_tr3/cnt3:.4f}, avg_val={tot_va3/cnt3:.4f}, skipped={skip3}")
    # NOTE(review): model3 is rebound on every iteration, so only the model of
    # the last trained company is written here. Confirm that is intended.
    torch.save(model3.state_dict(), save_path)
    print(f"Saved Target 3 model weights to {save_path}")