In [None]:
import pandas as pd

# Load the full cleaned dataset from Parquet.
file_path = 'final_cleaned.parquet'
df = pd.read_parquet(file_path)
# Show the available column names.
print(df.columns)
# Extract a small portion of the dataset (the first 100 rows).
sample_df = df.head(100)  # Adjust the number of rows as needed

# Save the sample to CSV; index=False keeps the row index out of the file.
sample_file_path = 'sample_dataset.csv'
sample_df.to_csv(sample_file_path, index=False)

print(f'Sample dataset saved as {sample_file_path}')


In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
from lightgbm import LGBMRegressor

# Remove multicollinearity using VIF
def remove_high_vif_features(X, threshold=5.0):
    """Drop the numeric columns of ``X`` whose variance inflation factor is too high.

    The VIF of every numeric column is computed once against all the other
    numeric columns; every column whose VIF is strictly greater than
    ``threshold`` is removed in a single pass.

    Args:
        X: DataFrame of candidate features.
        threshold: VIF value above which a column is dropped.

    Returns:
        A copy of ``X`` without the high-VIF columns. Non-numeric columns are
        never scored and therefore always kept.
    """
    numeric_part = X.select_dtypes(include=[np.number])
    design = numeric_part.values

    scores = pd.DataFrame({'feature': numeric_part.columns})
    scores['VIF'] = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]

    too_collinear = scores.loc[scores['VIF'] > threshold, 'feature']
    return X.drop(columns=too_collinear)

# Apply PCA for dimensionality reduction
def apply_pca(X_train, X_val, n_components=0.95):
    """Fit PCA on the training features and project both splits with it.

    Args:
        X_train: Training features; the PCA is fitted on these only.
        X_val: Validation features; transformed with the fitted PCA.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(train, val)`` of DataFrames built from the projected arrays,
        so both have default integer column labels and a default index.
    """
    reducer = PCA(n_components=n_components)
    train_projected = pd.DataFrame(reducer.fit_transform(X_train))
    val_projected = pd.DataFrame(reducer.transform(X_val))
    return train_projected, val_projected

# Aggressive feature selection using correlation and RFE
def select_important_features(X, y, num_features):
    """Filter out near-duplicate features, then keep ``num_features`` of them via RFE.

    Step 1 drops every column whose absolute correlation with an earlier
    column exceeds 0.9 (only the strict upper triangle of the correlation
    matrix is inspected). Step 2 runs recursive feature elimination with a
    ``LinearRegression`` estimator on what is left.

    Args:
        X: DataFrame of features.
        y: Target values aligned row-by-row with ``X``.
        num_features: Passed to ``RFE`` as ``n_features_to_select``.

    Returns:
        DataFrame wrapping the RFE output array; column labels and index are
        the default integer ranges, not the original names.
    """
    abs_corr = X.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    redundant = []
    for name in pairwise.columns:
        if any(pairwise[name] > 0.9):
            redundant.append(name)
    decorrelated = X.drop(columns=redundant)

    selector = RFE(LinearRegression(), n_features_to_select=num_features)
    return pd.DataFrame(selector.fit_transform(decorrelated, y))

# Simplified Transformer model definition
class StockTransformer(nn.Module):
    """Small Transformer-encoder regressor producing one value per sample.

    The network is a two-layer ``nn.TransformerEncoder`` (``batch_first=True``)
    whose output is averaged over dimension 1, passed through dropout
    (p=0.3) and projected to a single output by a linear layer.

    Args:
        input_dim: Feature width; used as the encoder's ``d_model``.
        nhead: Requested number of attention heads. If ``input_dim`` is not
            divisible by it, a single head is used instead.
    """

    def __init__(self, input_dim, nhead=2):
        super().__init__()
        # Fall back to one head when the feature width is not divisible by nhead.
        self.nhead = nhead if input_dim % nhead == 0 else 1

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=self.nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Encode ``x``, average over dimension 1 and return the linear projection."""
        encoded = self.transformer_encoder(x)
        pooled = encoded.mean(dim=1)  # collapse the sequence dimension
        return self.fc(self.dropout(pooled))

# Train Transformer
def train_transformer(X_train, y_train, X_val, y_val, input_dim, device='cpu',
                      epochs=20000, learning_rate=0.001, model=None):
    """Train a Transformer regressor with full-batch Adam and MSE loss.

    Every epoch performs one optimisation step on the whole training set and
    then evaluates the loss on the validation set; both losses are printed.

    Args:
        X_train, X_val: Feature containers exposing ``.values`` (e.g. DataFrames).
        y_train, y_val: Target containers exposing ``.values`` (e.g. Series).
        input_dim: Feature width used to build a ``StockTransformer`` when no
            ``model`` is supplied.
        device: Torch device the model and tensors are placed on.
        epochs: Number of full-batch training steps. Defaults to 20000, the
            value that used to be hard-coded here.
        learning_rate: Adam learning rate. Defaults to the former constant 0.001.
        model: Optional existing ``nn.Module`` to continue training. It must
            accept a ``(batch, 1, features)`` tensor and return ``(batch, 1)``.

    Returns:
        The trained model (the same object as ``model`` when one was given).
    """
    if model is None:
        model = StockTransformer(input_dim)
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Each row becomes a length-1 sequence: (batch, features) -> (batch, 1, features).
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

        # Validation pass with dropout disabled and no gradient tracking.
        model.eval()
        with torch.no_grad():
            val_output = model(X_val_tensor)
            val_loss = criterion(val_output, y_val_tensor)
        print(f'Epoch {epoch + 1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    return model

# Preprocessing function with KNN Imputation and scaling
def preprocess_data(X):
    """Return a fully numeric, finite, imputed copy of ``X``.

    Steps: keep numeric columns, turn +/-inf into NaN, clip values to
    [-1e6, 1e6], drop columns that contain no observed value at all, then
    fill the remaining gaps with ``KNNImputer(n_neighbors=5)``.

    Args:
        X: DataFrame of raw features. It is not modified.

    Returns:
        DataFrame of imputed values with the surviving column names and a
        fresh default integer index (the imputer returns a plain array).
    """
    # Select only numeric columns
    numeric = X.select_dtypes(include=[np.number])

    # Replace infinity values with NaN (non-inplace: never mutate a derived frame)
    numeric = numeric.replace([np.inf, -np.inf], np.nan)

    # Cap extreme large values (based on domain knowledge)
    numeric = numeric.clip(lower=-1e6, upper=1e6)

    # KNNImputer discards columns without any observed value by default, which
    # would make the output narrower than the column list and break the
    # DataFrame construction below. Drop them explicitly first.
    numeric = numeric.dropna(axis=1, how='all')

    # Use KNN imputation for handling missing values
    imputer = KNNImputer(n_neighbors=5)
    return pd.DataFrame(imputer.fit_transform(numeric), columns=numeric.columns)

# Process batches for LightGBM and Transformer for two companies
def process_two_companies(data, target_column, batch_size=2, num_features=10, device='cpu'):
    """Fit LightGBM and the Transformer on the first two companies and print RMSEs.

    For every batch of companies the features are preprocessed, standardised,
    VIF-filtered and reduced with correlation + RFE selection. A LightGBM
    regressor and a Transformer are then trained on the reduced features and
    blended 0.9 / 0.1. All three RMSEs are printed.

    Note: the models are scored on the same rows they were trained on (the
    training set is also passed as the validation set), so the printed RMSEs
    are in-sample figures.

    Args:
        data: DataFrame containing ``Company_ID``, ``Date``, ``target_column``
            and the feature columns.
        target_column: Name of the column to predict.
        batch_size: Number of companies handled per loop iteration.
        num_features: Number of features kept by RFE.
        device: Torch device used for the Transformer.
    """
    # Select any two companies for processing
    unique_companies = data['Company_ID'].unique()[:2]

    for i in range(0, len(unique_companies), batch_size):
        batch_companies = unique_companies[i:i + batch_size]
        batch_data = data[data['Company_ID'].isin(batch_companies)]

        X_train = batch_data.drop(columns=[target_column, 'Company_ID', 'Date'])
        y_train = batch_data[target_column]

        # Preprocess the data (numeric selection, inf handling, imputation)
        X_train = preprocess_data(X_train)

        # Scaling the data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        # Remove multicollinearity using VIF
        X_train_vif_reduced = remove_high_vif_features(pd.DataFrame(X_train_scaled, columns=X_train.columns))

        # Aggressive feature selection (correlation and RFE)
        X_train_reduced = select_important_features(X_train_vif_reduced, y_train, num_features)

        # Train LightGBM model with reduced features
        lgb_model = LGBMRegressor(n_estimators=100)
        lgb_model.fit(X_train_reduced, y_train)
        y_pred_lgb = lgb_model.predict(X_train_reduced)

        # Train Transformer model
        input_dim = X_train_reduced.shape[1]
        transformer_model = train_transformer(X_train_reduced, y_train, X_train_reduced, y_train, input_dim, device=device)

        # Get predictions for Transformer with dropout off and no autograd graph.
        X_train_tensor = torch.tensor(X_train_reduced.values, dtype=torch.float32).unsqueeze(1).to(device)
        transformer_model.eval()
        with torch.no_grad():
            # reshape(-1) keeps a 1-D array even when the batch has a single row.
            y_pred_transformer = transformer_model(X_train_tensor).cpu().numpy().reshape(-1)

        # Ensemble prediction
        ensemble_pred = y_pred_lgb * 0.9 + y_pred_transformer * 0.1

        # RMSE as sqrt(MSE): the `squared=False` shortcut is gone from recent
        # scikit-learn releases, while this form works on every version.
        print(f'RMSE for LightGBM: {np.sqrt(mean_squared_error(y_train, y_pred_lgb))}')
        print(f'RMSE for Transformer: {np.sqrt(mean_squared_error(y_train, y_pred_transformer))}')
        print(f'RMSE for Ensemble: {np.sqrt(mean_squared_error(y_train, ensemble_pred))}')

# Main function
def train_model_on_small_dataset(file_path, target_column='Close_x', batch_size=200, num_features=100, device='cpu'):
    """Load a Parquet dataset and run the two-company training pipeline on it.

    Args:
        file_path: Path of the Parquet file to read.
        target_column: Column to predict.
        batch_size: Number of companies per batch, forwarded to the pipeline.
        num_features: Number of features kept by RFE, forwarded to the pipeline.
        device: Torch device forwarded to the pipeline (previously the
            pipeline's ``device`` option could not be set from here).
    """
    df = pd.read_parquet(file_path)
    process_two_companies(df, target_column, batch_size, num_features, device=device)

# Run the pipeline on 'final_cleaned.parquet' (expected in the working
# directory), asking the feature selection step to keep 100 features.
file_path = 'final_cleaned.parquet'
train_model_on_small_dataset(file_path, num_features=100)


In [None]:
## Experiment log (Transformer epochs, learning rate, num_features, batch_size)
## previous: 200 epochs, lr 0.01
## curr-previous: 3000 epochs, lr 0.001, num_features 10, batch_size 200 -> RMSE for LightGBM: 8.597191087745076, RMSE for Transformer: 17.6809587444915, RMSE for Ensemble: 8.558077688024794
## curr-previous: 20000 epochs, lr 0.001, num_features 100, batch_size 200

In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
from lightgbm import LGBMRegressor

# Remove multicollinearity using VIF
def remove_high_vif_features(X, threshold=5.0):
    """Remove numeric columns of ``X`` whose VIF is strictly above ``threshold``.

    All VIFs are computed once on the full numeric matrix and the offending
    columns are dropped together. Non-numeric columns are left untouched.

    Args:
        X: DataFrame of candidate features.
        threshold: Maximum tolerated variance inflation factor.

    Returns:
        A copy of ``X`` without the high-VIF columns.
    """
    numeric_part = X.select_dtypes(include=[np.number])
    design = numeric_part.values

    scores = pd.DataFrame({'feature': numeric_part.columns})
    scores['VIF'] = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]

    too_collinear = scores.loc[scores['VIF'] > threshold, 'feature']
    return X.drop(columns=too_collinear)

# Apply PCA for dimensionality reduction
def apply_pca(X_train, X_val, n_components=0.95):
    """Project both splits with a PCA fitted on the training split only.

    Args:
        X_train: Training features used to fit the PCA.
        X_val: Validation features, transformed with the fitted PCA.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(train, val)`` of DataFrames with default integer labels.
    """
    reducer = PCA(n_components=n_components)
    train_projected = pd.DataFrame(reducer.fit_transform(X_train))
    val_projected = pd.DataFrame(reducer.transform(X_val))
    return train_projected, val_projected

# Aggressive feature selection using correlation and RFE
def select_important_features(X, y, num_features):
    """Drop highly correlated columns, then keep ``num_features`` columns via RFE.

    A column is dropped when its absolute correlation with any earlier column
    is above 0.9. Recursive feature elimination with ``LinearRegression`` is
    then applied to the remaining columns.

    Args:
        X: DataFrame of features.
        y: Target values aligned row-by-row with ``X``.
        num_features: Passed to ``RFE`` as ``n_features_to_select``.

    Returns:
        DataFrame wrapping the RFE output array (default integer column
        labels and index; original names are not preserved).
    """
    abs_corr = X.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)

    redundant = []
    for name in pairwise.columns:
        if any(pairwise[name] > 0.9):
            redundant.append(name)
    decorrelated = X.drop(columns=redundant)

    selector = RFE(LinearRegression(), n_features_to_select=num_features)
    return pd.DataFrame(selector.fit_transform(decorrelated, y))

# Simplified Transformer model definition
class StockTransformer(nn.Module):
    """Two-layer Transformer encoder with a single-output regression head.

    Args:
        input_dim: Feature width; used as the encoder's ``d_model``.
        nhead: Requested number of attention heads; replaced by 1 when
            ``input_dim`` is not divisible by it.
    """

    def __init__(self, input_dim, nhead=2):
        super().__init__()
        # Fall back to one head when the feature width is not divisible by nhead.
        self.nhead = nhead if input_dim % nhead == 0 else 1

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=self.nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Encode ``x``, average over dimension 1, apply dropout and project to one value."""
        encoded = self.transformer_encoder(x)
        pooled = encoded.mean(dim=1)
        return self.fc(self.dropout(pooled))

# Train Transformer
def train_transformer(X_train, y_train, X_val, y_val, input_dim, epochs, learning_rate, model=None, device='cpu'):
    """Train (or continue training) a Transformer regressor with full-batch Adam.

    Every epoch performs one optimisation step on the whole training set and
    then evaluates the MSE on the validation set; both losses are printed.

    Args:
        X_train, X_val: Feature containers exposing ``.values``.
        y_train, y_val: Target containers exposing ``.values``.
        input_dim: Feature width used to build a ``StockTransformer`` when no
            ``model`` is supplied.
        epochs: Number of full-batch training steps.
        learning_rate: Adam learning rate.
        model: Optional existing model to continue training.
        device: Torch device for the model and the tensors.

    Returns:
        The trained model (the same object as ``model`` when one was given).
    """
    if model is None:
        model = StockTransformer(input_dim)
    # Also move a caller-supplied (e.g. freshly loaded) model, otherwise its
    # weights and the tensors below can end up on different devices.
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Each row becomes a length-1 sequence: (batch, features) -> (batch, 1, features).
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_output = model(X_val_tensor)
            val_loss = criterion(val_output, y_val_tensor)
        print(f'Epoch {epoch + 1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    return model

# Preprocessing function with KNN Imputation and scaling
def preprocess_data(X):
    """Return a fully numeric, finite, imputed copy of ``X``.

    Steps: keep numeric columns, turn +/-inf into NaN, clip values to
    [-1e6, 1e6], drop columns that contain no observed value at all, then
    fill the remaining gaps with ``KNNImputer(n_neighbors=5)``.

    Args:
        X: DataFrame of raw features. It is not modified.

    Returns:
        DataFrame of imputed values with the surviving column names and a
        fresh default integer index (the imputer returns a plain array).
    """
    numeric = X.select_dtypes(include=[np.number])
    # Non-inplace replace: never mutate a frame derived from the caller's data.
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    numeric = numeric.clip(lower=-1e6, upper=1e6)
    # KNNImputer discards columns without any observed value by default, which
    # would make the output narrower than the column list and break the
    # DataFrame construction below. Drop them explicitly first.
    numeric = numeric.dropna(axis=1, how='all')
    imputer = KNNImputer(n_neighbors=5)
    return pd.DataFrame(imputer.fit_transform(numeric), columns=numeric.columns)

# Save intermediate model
def save_model(model, filename):
    """Pickle ``model`` into the file ``filename`` and print a confirmation.

    Args:
        model: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)
    print(f'Model saved as {filename}')

# Load saved model
def load_model(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by this notebook or another trusted source.

    Args:
        filename: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    print(f'Model {filename} loaded successfully.')
    return restored

# Process batches for LightGBM and Transformer for two companies
def process_two_companies(data, target_column, batch_size=2, num_features=10, device='cpu'):
    """Interactively train LightGBM and/or the Transformer on the first two companies.

    For each batch of companies the features are preprocessed, standardised,
    VIF-filtered and reduced with correlation + RFE selection. The user is
    then prompted (via ``input``) for which models to train, their
    hyper-parameters, and whether to save/load them. Answering anything but
    ``y`` to the final prompt stops the loop; ``y`` moves on to the next batch.

    Note: models are fitted and scored on the same rows, so the reported
    losses and RMSE are in-sample figures.

    Args:
        data: DataFrame containing ``Company_ID``, ``Date``, ``target_column``
            and the feature columns.
        target_column: Name of the column to predict.
        batch_size: Number of companies handled per loop iteration.
        num_features: Number of features kept by RFE.
        device: Torch device used for the Transformer.
    """
    unique_companies = data['Company_ID'].unique()[:2]

    for i in range(0, len(unique_companies), batch_size):
        batch_companies = unique_companies[i:i + batch_size]
        batch_data = data[data['Company_ID'].isin(batch_companies)]

        X_train = batch_data.drop(columns=[target_column, 'Company_ID', 'Date'])
        y_train = batch_data[target_column]

        X_train = preprocess_data(X_train)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        X_train_vif_reduced = remove_high_vif_features(pd.DataFrame(X_train_scaled, columns=X_train.columns))
        X_train_reduced = select_important_features(X_train_vif_reduced, y_train, num_features)

        train_lgbm = input("Do you want to train LightGBM? (y/n): ").lower() == 'y'
        if train_lgbm:
            num_estimators = int(input("Enter the number of estimators for LightGBM: "))
            learning_rate = float(input("Enter the learning rate for LightGBM: "))
            max_depth = int(input("Enter the max depth for LightGBM: "))
            num_leaves = int(input("Enter the number of leaves for LightGBM: "))
            feature_fraction = float(input("Enter the feature fraction for LightGBM (0.0 to 1.0): "))
            bagging_fraction = float(input("Enter the bagging fraction for LightGBM (0.0 to 1.0): "))
            bagging_freq = int(input("Enter the bagging frequency for LightGBM: "))
            lambda_l1 = float(input("Enter the L1 regularization for LightGBM: "))
            lambda_l2 = float(input("Enter the L2 regularization for LightGBM: "))

            lgb_model = LGBMRegressor(
                n_estimators=num_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                num_leaves=num_leaves,
                feature_fraction=feature_fraction,
                bagging_fraction=bagging_fraction,
                bagging_freq=bagging_freq,
                lambda_l1=lambda_l1,
                lambda_l2=lambda_l2
            )

            lgb_model.fit(X_train_reduced, y_train)
            y_pred_lgb = lgb_model.predict(X_train_reduced)
            # RMSE as sqrt(MSE): the `squared=False` shortcut is gone from
            # recent scikit-learn releases, while this form works everywhere.
            print(f'RMSE for LightGBM: {np.sqrt(mean_squared_error(y_train, y_pred_lgb))}')

            save_lgbm = input("Do you want to save this LightGBM model? (y/n): ").lower() == 'y'
            if save_lgbm:
                filename = input("Enter the filename to save LightGBM model: ")
                save_model(lgb_model, filename)

        # The flag must NOT be called `train_transformer`: assigning that name
        # here would make it a local variable for the whole function and hide
        # the module-level train_transformer() that is called below.
        train_transformer_model = input("Do you want to train the Transformer? (y/n): ").lower() == 'y'
        if train_transformer_model:
            epochs = int(input("Enter the number of epochs for Transformer: "))
            transformer_lr = float(input("Enter the learning rate for Transformer: "))

            transformer_model = None
            load_transformer = input("Do you want to load a saved Transformer model? (y/n): ").lower() == 'y'
            if load_transformer:
                filename = input("Enter the filename of the saved Transformer model: ")
                transformer_model = load_model(filename)

            transformer_model = train_transformer(
                X_train_reduced, y_train,
                X_train_reduced, y_train,
                X_train_reduced.shape[1],
                epochs, transformer_lr,
                model=transformer_model, device=device
            )

            save_transformer = input("Do you want to save this Transformer model? (y/n): ").lower() == 'y'
            if save_transformer:
                filename = input("Enter the filename to save Transformer model: ")
                save_model(transformer_model, filename)

        continue_training = input("Do you want to continue training the models? (y/n): ").lower() == 'y'
        if not continue_training:
            break

# Main function
def train_model_on_small_dataset(file_path, target_column='Close_x', batch_size=2, num_features=10, device='cpu'):
    """Load a Parquet dataset and run the interactive two-company pipeline on it.

    Args:
        file_path: Path of the Parquet file to read.
        target_column: Column to predict.
        batch_size: Number of companies per batch, forwarded to the pipeline.
        num_features: Number of features kept by RFE, forwarded to the pipeline.
        device: Torch device forwarded to the pipeline (previously the
            pipeline's ``device`` option could not be set from here).
    """
    df = pd.read_parquet(file_path)
    process_two_companies(df, target_column, batch_size, num_features, device=device)

# Run the interactive pipeline on 'final_cleaned.parquet' (expected in the
# working directory), asking the feature selection step to keep 10 features.
file_path = 'final_cleaned.parquet'
train_model_on_small_dataset(file_path, num_features=10)


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import joblib

# Simplified Transformer model definition
class StockTransformer(nn.Module):
    """Two-layer Transformer encoder with a single-output regression head.

    Args:
        input_dim: Feature width; used as the encoder's ``d_model``.
        nhead: Requested number of attention heads; replaced by 1 when
            ``input_dim`` is not divisible by it.
    """

    def __init__(self, input_dim, nhead=2):
        super().__init__()
        # Fall back to one head when the feature width is not divisible by nhead.
        self.nhead = nhead if input_dim % nhead == 0 else 1

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=self.nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Encode ``x``, average over dimension 1, apply dropout and project to one value."""
        encoded = self.transformer_encoder(x)
        pooled = encoded.mean(dim=1)
        return self.fc(self.dropout(pooled))

# Function to train the transformer model
def train_transformer(X_train, y_train, X_val, y_val, input_dim, epochs, learning_rate, model=None, device='cpu'):
    """Train (or continue training) a Transformer regressor with full-batch Adam.

    Every epoch performs one optimisation step on the whole training set and
    then evaluates the MSE on the validation set; both losses are printed.

    Args:
        X_train, X_val: Feature containers exposing ``.values``.
        y_train, y_val: Target containers exposing ``.values``.
        input_dim: Feature width used to build a ``StockTransformer`` when no
            ``model`` is supplied.
        epochs: Number of full-batch training steps.
        learning_rate: Adam learning rate.
        model: Optional existing model to continue training.
        device: Torch device for the model and the tensors.

    Returns:
        The trained model (the same object as ``model`` when one was given).
    """
    if model is None:
        model = StockTransformer(input_dim)
    # Also move a caller-supplied (e.g. freshly loaded) model, otherwise its
    # weights and the tensors below can end up on different devices.
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Each row becomes a length-1 sequence: (batch, features) -> (batch, 1, features).
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_output = model(X_val_tensor)
            val_loss = criterion(val_output, y_val_tensor)
        print(f'Epoch {epoch + 1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    return model

# Function to save transformer models
def save_model(model, filename):
    """Write the ``state_dict`` of ``model`` to ``filename`` with ``torch.save``.

    Args:
        model: Module whose parameters and buffers should be stored.
        filename: Destination path.
    """
    weights = model.state_dict()
    torch.save(weights, filename)

# Function to load transformer models
def load_model(filename, input_dim, device='cpu'):
    """Rebuild a ``StockTransformer`` and restore its weights from ``filename``.

    Args:
        filename: Path of a ``state_dict`` file written with ``torch.save``.
        input_dim: Feature width the saved model was built with.
        device: Device the weights are mapped to and the model is placed on.

    Returns:
        The restored model, located on ``device``.
    """
    model = StockTransformer(input_dim)
    model.load_state_dict(torch.load(filename, map_location=device))
    # map_location only affects the loaded tensors; load_state_dict copies
    # them into the (CPU) module, so the module itself must be moved as well.
    return model.to(device)

# Feature selection function
def select_important_features(X, y, num_features):
    """Keep ``num_features`` columns of ``X`` chosen by recursive feature elimination.

    Args:
        X: Feature matrix.
        y: Target values aligned row-by-row with ``X``.
        num_features: Passed to ``RFE`` as ``n_features_to_select``.

    Returns:
        DataFrame wrapping the RFE output array (default integer column
        labels and index).
    """
    selector = RFE(LinearRegression(), n_features_to_select=num_features)
    kept_values = selector.fit_transform(X, y)
    return pd.DataFrame(kept_values)

# LightGBM Training and Save Model Function
def train_lightgbm(X_train_reduced, y_train, num_estimators, learning_rate, max_depth, num_leaves, feature_fraction, bagging_fraction, bagging_freq, lambda_l1, lambda_l2):
    """Fit a LightGBM regressor, print its training RMSE and optionally save it.

    After fitting, the user is asked (via ``input``) whether the model should
    be saved; if so it is written with ``joblib`` to ``<name>.pkl``.

    Args:
        X_train_reduced: Training features.
        y_train: Training targets.
        num_estimators: Forwarded as ``n_estimators``.
        learning_rate, max_depth, num_leaves, feature_fraction,
        bagging_fraction, bagging_freq, lambda_l1, lambda_l2: Forwarded to
            ``LGBMRegressor`` under the same names.

    Returns:
        The fitted ``LGBMRegressor``.
    """
    lgb_model = LGBMRegressor(
        n_estimators=num_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        bagging_freq=bagging_freq,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2
    )

    lgb_model.fit(X_train_reduced, y_train)
    y_pred_lgb = lgb_model.predict(X_train_reduced)
    # RMSE as sqrt(MSE): the `squared=False` shortcut is gone from recent
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_train, y_pred_lgb))
    print(f'RMSE for LightGBM: {rmse}')

    save_lgbm = input("Do you want to save this LightGBM model? (y/n): ").lower() == 'y'
    if save_lgbm:
        lgbm_filename = input("Enter the filename to save LightGBM model: ")
        saved_path = f"{lgbm_filename}.pkl"
        joblib.dump(lgb_model, saved_path)
        # Report the path that was actually written, including the suffix.
        print(f'Model saved as {saved_path}')

    return lgb_model

# Preprocessing function
def preprocess_data(X):
    """Return the numeric columns of ``X`` with inf/NaN replaced by column means.

    +/-inf are first converted to NaN, columns without a single observed
    value are dropped (their mean is undefined, so they could never be
    filled), and every remaining NaN is replaced by its column mean.

    Args:
        X: DataFrame of raw features. It is not modified.

    Returns:
        A new DataFrame containing only numeric columns, with the input's
        index and no missing or infinite values.
    """
    numeric = X.select_dtypes(include=[np.number])
    # Non-inplace operations: never mutate a frame derived from the caller's data.
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    if len(numeric) > 0:
        # An all-NaN column has a NaN mean and would stay NaN after fillna,
        # which breaks the estimators downstream.
        numeric = numeric.dropna(axis=1, how='all')
    return numeric.fillna(numeric.mean())

# Main training loop for both LightGBM and Transformer models
def process_two_companies(data, target_column, batch_size=2, num_features=10, device='cpu'):
    """Interactively train LightGBM and/or the Transformer on the first two companies.

    For each batch of companies the features are preprocessed, standardised
    and reduced with RFE. The user is then repeatedly prompted (via
    ``input``) for which models to train, their hyper-parameters and whether
    to save/load them, until the "continue" prompt is answered with anything
    but ``y``.

    Note: models are fitted and scored on the same rows, so the reported
    losses and RMSE are in-sample figures.

    Args:
        data: DataFrame containing ``Company_ID``, ``Date``, ``target_column``
            and the feature columns.
        target_column: Name of the column to predict.
        batch_size: Number of companies handled per outer loop iteration.
        num_features: Number of features kept by RFE.
        device: Torch device used for the Transformer.
    """
    unique_companies = data['Company_ID'].unique()[:2]

    for i in range(0, len(unique_companies), batch_size):
        batch_companies = unique_companies[i:i + batch_size]
        batch_data = data[data['Company_ID'].isin(batch_companies)]

        X_train = batch_data.drop(columns=[target_column, 'Company_ID', 'Date'])
        y_train = batch_data[target_column]

        # Preprocess and scale data
        X_train = preprocess_data(X_train)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_train_reduced = select_important_features(pd.DataFrame(X_train_scaled), y_train, num_features)

        continue_training = True
        while continue_training:
            # LightGBM Training
            train_lgbm = input("Do you want to train LightGBM? (y/n): ").lower() == 'y'
            if train_lgbm:
                num_estimators = int(input("Enter the number of estimators for LightGBM: "))
                learning_rate = float(input("Enter the learning rate for LightGBM: "))
                max_depth = int(input("Enter the max depth for LightGBM: "))
                num_leaves = int(input("Enter the number of leaves for LightGBM: "))
                feature_fraction = float(input("Enter the feature fraction for LightGBM (0.0 to 1.0): "))
                bagging_fraction = float(input("Enter the bagging fraction for LightGBM (0.0 to 1.0): "))
                bagging_freq = int(input("Enter the bagging frequency for LightGBM: "))
                lambda_l1 = float(input("Enter the L1 regularization for LightGBM: "))
                lambda_l2 = float(input("Enter the L2 regularization for LightGBM: "))

                lgb_model = train_lightgbm(X_train_reduced, y_train, num_estimators, learning_rate, max_depth, num_leaves, feature_fraction, bagging_fraction, bagging_freq, lambda_l1, lambda_l2)

            # Transformer Training
            train_transformer_model = input("Do you want to train the Transformer? (y/n): ").lower() == 'y'
            # NOTE(review): the model is reset on every pass, so "continue
            # training" starts from fresh weights unless a saved model is
            # loaded below - confirm this is intended.
            transformer_model = None
            if train_transformer_model:
                epochs = int(input("Enter the number of epochs for Transformer: "))
                transformer_lr = float(input("Enter the learning rate for Transformer: "))

                load_transformer = input("Do you want to load a saved Transformer model? (y/n): ").lower() == 'y'
                if load_transformer:
                    filename = input("Enter the filename of the saved Transformer model: ")
                    transformer_model = load_model(filename, X_train_reduced.shape[1], device)

                transformer_model = train_transformer(
                    X_train_reduced, y_train,
                    X_train_reduced, y_train,
                    X_train_reduced.shape[1],
                    epochs, transformer_lr, model=transformer_model, device=device
                )

                save_transformer = input("Do you want to save this Transformer model? (y/n): ").lower() == 'y'
                if save_transformer:
                    transformer_filename = input("Enter the filename to save Transformer model: ")
                    saved_path = f"{transformer_filename}.pth"
                    save_model(transformer_model, saved_path)
                    # Report the path that was actually written, including the suffix.
                    print(f'Transformer model saved as {saved_path}')

            continue_training = input("Do you want to continue training the models? (y/n): ").lower() == 'y'

# Main function
def train_model_on_small_dataset(file_path, target_column='Close_x', batch_size=2, num_features=10, device='cpu'):
    """Load a Parquet dataset and run the interactive two-company pipeline on it.

    Args:
        file_path: Path of the Parquet file to read.
        target_column: Column to predict.
        batch_size: Number of companies per batch, forwarded to the pipeline.
        num_features: Number of features kept by RFE, forwarded to the pipeline.
        device: Torch device forwarded to the pipeline (previously the
            pipeline's ``device`` option could not be set from here).
    """
    df = pd.read_parquet(file_path)
    process_two_companies(df, target_column, batch_size, num_features, device=device)

# Example usage: run the interactive pipeline on 'final_cleaned.parquet'
# (expected in the working directory), keeping 10 features after RFE.
file_path = 'final_cleaned.parquet'
train_model_on_small_dataset(file_path, num_features=10)


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import joblib
from torch.utils.data import DataLoader, TensorDataset
import concurrent.futures

# Simplified Transformer model definition
class StockTransformer(nn.Module):
    """Two-layer Transformer encoder with a single-output regression head.

    Args:
        input_dim: Feature width; used as the encoder's ``d_model``.
        nhead: Requested number of attention heads; replaced by 1 when
            ``input_dim`` is not divisible by it.
    """

    def __init__(self, input_dim, nhead=2):
        super().__init__()
        # Fall back to one head when the feature width is not divisible by nhead.
        self.nhead = nhead if input_dim % nhead == 0 else 1

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=self.nhead,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Encode ``x``, average over dimension 1, apply dropout and project to one value."""
        encoded = self.transformer_encoder(x)
        pooled = encoded.mean(dim=1)
        return self.fc(self.dropout(pooled))

# Function to train the transformer model with batch processing
def train_transformer(X_train, y_train, X_val, y_val, input_dim, epochs, learning_rate, model=None, batch_size=64, device='cpu'):
    """Train (or continue training) a Transformer regressor with mini-batch Adam.

    The training set is shuffled and split into mini-batches every epoch.
    After each epoch the MSE on the whole validation set is computed, and the
    mean training batch loss and the validation loss are printed.

    Args:
        X_train, X_val: Feature containers exposing ``.values``.
        y_train, y_val: Target containers exposing ``.values``.
        input_dim: Feature width used to build a ``StockTransformer`` when no
            ``model`` is supplied.
        epochs: Number of passes over the training set.
        learning_rate: Adam learning rate.
        model: Optional existing model to continue training.
        batch_size: Mini-batch size of the training ``DataLoader``.
        device: Torch device for the model and the tensors.

    Returns:
        The trained model (the same object as ``model`` when one was given).
    """
    if model is None:
        model = StockTransformer(input_dim)
    # Also move a caller-supplied (e.g. freshly loaded) model, otherwise its
    # weights and the tensors below can end up on different devices.
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
    X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).to(device)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)

    # Create DataLoader for batching
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch.unsqueeze(1))  # Add sequence dimension
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Validation step
        model.eval()
        with torch.no_grad():
            val_output = model(X_val_tensor.unsqueeze(1))
            val_loss = criterion(val_output, y_val_tensor)

        print(f'Epoch {epoch + 1}, Training Loss: {epoch_loss/len(train_loader)}, Validation Loss: {val_loss.item()}')

    return model

# Function to save transformer models
def save_model(model, filename):
    """Write the ``state_dict`` of ``model`` to ``filename`` with ``torch.save``.

    Args:
        model: Module whose parameters and buffers should be stored.
        filename: Destination path.
    """
    weights = model.state_dict()
    torch.save(weights, filename)

# Function to load transformer models
def load_model(filename, input_dim, device='cpu'):
    """Rebuild a ``StockTransformer`` and restore its weights from ``filename``.

    Args:
        filename: Path of a ``state_dict`` file written with ``torch.save``.
        input_dim: Feature width the saved model was built with.
        device: Device the weights are mapped to and the model is placed on.

    Returns:
        The restored model, located on ``device``.
    """
    model = StockTransformer(input_dim)
    model.load_state_dict(torch.load(filename, map_location=device))
    # map_location only affects the loaded tensors; load_state_dict copies
    # them into the (CPU) module, so the module itself must be moved as well.
    return model.to(device)

# Feature selection function
def select_important_features(X, y, num_features):
    """Keep ``num_features`` columns of ``X`` chosen by recursive feature elimination.

    Args:
        X: Feature matrix.
        y: Target values aligned row-by-row with ``X``.
        num_features: Passed to ``RFE`` as ``n_features_to_select``.

    Returns:
        DataFrame wrapping the RFE output array (default integer column
        labels and index).
    """
    selector = RFE(LinearRegression(), n_features_to_select=num_features)
    kept_values = selector.fit_transform(X, y)
    return pd.DataFrame(kept_values)

# LightGBM Training Function
def train_lightgbm(X_train_reduced, y_train, num_estimators, learning_rate, max_depth, num_leaves, feature_fraction, bagging_fraction, bagging_freq, lambda_l1, lambda_l2):
    """Fit a LightGBM regressor, print its training RMSE and optionally save it.

    After fitting, the user is asked (via ``input``) whether the model should
    be saved; if so it is written with ``joblib`` to ``<name>.pkl``.

    Args:
        X_train_reduced: Training features.
        y_train: Training targets.
        num_estimators: Forwarded as ``n_estimators``.
        learning_rate, max_depth, num_leaves, feature_fraction,
        bagging_fraction, bagging_freq, lambda_l1, lambda_l2: Forwarded to
            ``LGBMRegressor`` under the same names.

    Returns:
        The fitted ``LGBMRegressor``.
    """
    lgb_model = LGBMRegressor(
        n_estimators=num_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        bagging_freq=bagging_freq,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2
    )

    lgb_model.fit(X_train_reduced, y_train)
    y_pred_lgb = lgb_model.predict(X_train_reduced)
    # RMSE as sqrt(MSE): the `squared=False` shortcut is gone from recent
    # scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_train, y_pred_lgb))
    print(f'RMSE for LightGBM: {rmse}')

    save_lgbm = input("Do you want to save this LightGBM model? (y/n): ").lower() == 'y'
    if save_lgbm:
        lgbm_filename = input("Enter the filename to save LightGBM model: ")
        saved_path = f"{lgbm_filename}.pkl"
        joblib.dump(lgb_model, saved_path)
        # Report the path that was actually written, including the suffix.
        print(f'Model saved as {saved_path}')

    return lgb_model

# Parallel LightGBM training function
def parallel_lightgbm_training(params_list, X_train_reduced, y_train):
    """Train one LightGBM model per parameter set on a thread pool.

    Each entry of ``params_list`` is a dict of keyword arguments for
    ``train_lightgbm``. ``X_train_reduced`` and ``y_train`` are supplied to
    every call unless the entry already provides its own values for them
    (previously these two arguments were accepted but never used).

    A failing training run is reported with ``print`` and skipped; it does
    not abort the other runs.

    NOTE(review): ``train_lightgbm`` prompts via ``input()``; when several
    workers run concurrently their prompts may interleave - confirm this is
    acceptable before relying on the parallel path.

    Args:
        params_list: Iterable of keyword-argument dicts for ``train_lightgbm``.
        X_train_reduced: Default training features for every run.
        y_train: Default training targets for every run.

    Returns:
        List of the models returned by the successful runs, in completion order.
    """
    shared_data = {'X_train_reduced': X_train_reduced, 'y_train': y_train}
    trained_models = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            # Entries in `params` win over the shared defaults.
            executor.submit(train_lightgbm, **{**shared_data, **params})
            for params in params_list
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                # result() re-raises any exception raised during training.
                trained_models.append(future.result())
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    return trained_models

# Preprocessing function
def preprocess_data(X):
    """Return the numeric columns of ``X`` with inf/NaN replaced by column means.

    +/-inf are first converted to NaN, columns without a single observed
    value are dropped (their mean is undefined, so they could never be
    filled), and every remaining NaN is replaced by its column mean.

    Args:
        X: DataFrame of raw features. It is not modified.

    Returns:
        A new DataFrame containing only numeric columns, with the input's
        index and no missing or infinite values.
    """
    numeric = X.select_dtypes(include=[np.number])
    # Non-inplace operations: never mutate a frame derived from the caller's data.
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    if len(numeric) > 0:
        # An all-NaN column has a NaN mean and would stay NaN after fillna,
        # which breaks the estimators downstream.
        numeric = numeric.dropna(axis=1, how='all')
    return numeric.fillna(numeric.mean())

# Main training loop for both LightGBM and Transformer models
def process_two_companies(data, target_column, batch_size=2, num_features=10, device='cpu'):
    """Interactively train LightGBM and Transformer models on two companies.

    Only the first two unique ``Company_ID`` values of ``data`` are used.
    They are walked in batches of ``batch_size``; for each batch the features
    are preprocessed, standardised and reduced, and then a prompt loop lets
    the user train either model repeatedly with hand-entered hyperparameters.

    Args:
        data: DataFrame containing ``'Company_ID'``, ``'Date'`` and
            ``target_column`` columns; all three are removed from the
            feature matrix.
        target_column: Name of the column used as the regression target.
        batch_size: Number of companies handled per outer-loop iteration.
        num_features: Passed to ``select_important_features``.
        device: Passed to ``load_model`` and ``train_transformer``.

    Returns:
        None. Trained models are not returned; they are only kept if the
        user chooses to save them at the prompts.

    Note:
        Every hyperparameter is read with ``input()`` and converted with
        ``int()`` / ``float()``, so a malformed entry raises ``ValueError``.
    """
    # At most two company ids survive this slice, so with the default
    # batch_size=2 the outer loop below runs a single time.
    unique_companies = data['Company_ID'].unique()[:2]
    
    for i in range(0, len(unique_companies), batch_size):
        batch_companies = unique_companies[i:i+batch_size]
        batch_data = data[data['Company_ID'].isin(batch_companies)]
        
        X_train = batch_data.drop(columns=[target_column, 'Company_ID', 'Date'])
        y_train = batch_data[target_column]

        # Preprocess and scale data
        X_train = preprocess_data(X_train)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        # The scaled array is wrapped in a fresh DataFrame, so the selected
        # features carry default integer column labels, not the original names.
        X_train_reduced = select_important_features(pd.DataFrame(X_train_scaled), y_train, num_features)

        # Prompt loop: repeats until the user answers anything other than 'y'
        # to the final "continue" question.
        continue_training = True
        while continue_training:
            # LightGBM Training in parallel
            train_lgbm = input("Do you want to train LightGBM? (y/n): ").lower() == 'y'
            if train_lgbm:
                num_estimators = int(input("Enter the number of estimators for LightGBM: "))
                learning_rate = float(input("Enter the learning rate for LightGBM: "))
                max_depth = int(input("Enter the max depth for LightGBM: "))
                num_leaves = int(input("Enter the number of leaves for LightGBM: "))
                feature_fraction = float(input("Enter the feature fraction for LightGBM (0.0 to 1.0): "))
                bagging_fraction = float(input("Enter the bagging fraction for LightGBM (0.0 to 1.0): "))
                bagging_freq = int(input("Enter the bagging frequency for LightGBM: "))
                lambda_l1 = float(input("Enter the L1 regularization for LightGBM: "))
                lambda_l2 = float(input("Enter the L2 regularization for LightGBM: "))

                # Add parameter set for parallel processing
                # (keys match the keyword arguments of train_lightgbm)
                params = {
                    'X_train_reduced': X_train_reduced,
                    'y_train': y_train,
                    'num_estimators': num_estimators,
                    'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'num_leaves': num_leaves,
                    'feature_fraction': feature_fraction,
                    'bagging_fraction': bagging_fraction,
                    'bagging_freq': bagging_freq,
                    'lambda_l1': lambda_l1,
                    'lambda_l2': lambda_l2
                }
                
                # Train LightGBM models in parallel
                # Only one parameter set is submitted per pass, and the
                # return value is not captured here.
                parallel_lightgbm_training([params], X_train_reduced, y_train)

            # Transformer Training with Mini-batch processing
            train_transformer_model = input("Do you want to train the Transformer? (y/n): ").lower() == 'y'
            # Reset on every pass: a Transformer trained in a previous pass is
            # only reused if it was saved and is explicitly loaded again below.
            transformer_model = None
            if train_transformer_model:
                epochs = int(input("Enter the number of epochs for Transformer: "))
                transformer_lr = float(input("Enter the learning rate for Transformer: "))
                
                load_transformer = input("Do you want to load a saved Transformer model? (y/n): ").lower() == 'y'
                if load_transformer:
                    filename = input("Enter the filename of the saved Transformer model: ")
                    transformer_model = load_model(filename, X_train_reduced.shape[1], device)
                
                # Mini-batch training for Transformer
                # NOTE(review): the same X/y pair is passed twice; presumably the
                # second pair is a validation set -- confirm against train_transformer.
                transformer_model = train_transformer(
                    X_train_reduced, y_train, 
                    X_train_reduced, y_train, 
                    X_train_reduced.shape[1], 
                    epochs, transformer_lr, model=transformer_model, batch_size=64, device=device
                )
                
                save_transformer = input("Do you want to save this Transformer model? (y/n): ").lower() == 'y'
                if save_transformer:
                    transformer_filename = input("Enter the filename to save Transformer model: ")
                    # '.pth' is appended for saving; the message below prints
                    # the name as typed, without that suffix.
                    save_model(transformer_model, f"{transformer_filename}.pth")
                    print(f'Transformer model saved as {transformer_filename}')

            # Ask if the user wants to continue training the models
            continue_training = input("Do you want to continue training the models? (y/n): ").lower() == 'y'

# Main function to process the entire dataset
def train_model_on_small_dataset(file_path, target_column='Close_x', batch_size=2, num_features=10):
    """Read the parquet file at ``file_path`` and hand it to ``process_two_companies``.

    ``target_column``, ``batch_size`` and ``num_features`` are forwarded
    unchanged. Nothing is returned.
    """
    dataset = pd.read_parquet(file_path)
    process_two_companies(
        data=dataset,
        target_column=target_column,
        batch_size=batch_size,
        num_features=num_features,
    )

# Example usage: start the interactive two-company session on the cleaned dataset.
# Running this cell blocks on the input() prompts issued during training.
file_path = 'final_cleaned.parquet'
train_model_on_small_dataset(file_path, num_features=10)


In [None]:
import pandas as pd
# Load the cleaned dataset into `file`; the inspection cells below read this variable.
file_path = 'final_cleaned.parquet'
file= pd.read_parquet(file_path)

In [None]:
# Display the column labels of the loaded frame.
file.columns

In [None]:
# Count the missing entries in every column of the loaded frame
null_values = file.isna().sum()

# Show the per-column counts, including columns with zero missing values
print(null_values)


In [None]:
# Narrow the previous counts down to the columns that actually contain nulls
has_nulls = null_values.gt(0)
null_columns = null_values[has_nulls]
print(null_columns)

In [None]:
Best trial: {'n_estimators': 24306, 'learning_rate': 0.049412489369297465, 'num_leaves': 65, 'max_depth': 11, 'min_child_weight': 2, 'subsample': 0.7486537063053206, 'colsample_bytree': 0.8938290344572073, 'lambda_l1': 5.552076717390757, 'lambda_l2': 5.664327006224713}
Best RMSE: 1.1564605583932328
Training was not successful.
Best trial: {'n_estimators': 1896, 'learning_rate': 0.04036080448608824, 'num_leaves': 35, 'max_depth': 20, 'min_child_weight': 5, 'subsample': 0.6082873778110115, 'colsample_bytree': 0.6064640614372323, 'lambda_l1': 0.22322036512234522, 'lambda_l2': 6.288264467695819}
Best RMSE: 1.1548257651807403
Training was not successful.
Best trial: {'n_estimators': 906, 'learning_rate': 0.04854017200771697, 'num_leaves': 73, 'max_depth': 20, 'min_child_weight': 1, 'subsample': 0.9405242535067422, 'colsample_bytree': 0.8092277269150399, 'lambda_l1': 1.409218395432035, 'lambda_l2': 6.813061948500454}
Best RMSE: 1.1897049380084983
Training was not successful.
Best trial: {'n_estimators': 515, 'learning_rate': 0.04533535032250655, 'num_leaves': 85, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.9920627684940627, 'colsample_bytree': 0.6199230615756138, 'lambda_l1': 5.053218586747487, 'lambda_l2': 7.2048827945514535}
Best RMSE: 1.2306108760078551
Training was not successful.
Best trial: {'n_estimators': 518, 'learning_rate': 0.04350155281960924, 'num_leaves': 78, 'max_depth': 11, 'min_child_weight': 2, 'subsample': 0.6077376957652024, 'colsample_bytree': 0.6092134142649996, 'lambda_l1': 5.247855359976364, 'lambda_l2': 6.6878055740679}
Best RMSE: 1.2026481037101264
Training was not successful.

In [None]:
## hyperparameter tuning
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import optuna
from optuna.pruners import HyperbandPruner

# Objective function for hyperparameter optimization
def objective(trial, X_train, y_train):
    """Optuna objective: mean cross-validated RMSE of an LGBMRegressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Feature DataFrame (indexed positionally with ``.iloc``).
        y_train: Target Series aligned with ``X_train``.

    Returns:
        The RMSE averaged over both folds of a shuffled 2-fold split.

    Raises:
        Whatever the pruning callback raises when Optuna decides to prune
        the trial during the first fold.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 30000, 35000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.01, 10.0),
    }

    kf = KFold(n_splits=2, shuffle=True, random_state=42)
    fold_rmses = []

    for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
        X_train_split, X_valid_split = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]

        model = LGBMRegressor(**param)

        # Optuna keeps only the first value reported for a given step, so the
        # pruning callback is attached to the first fold only; later folds
        # would re-report the same boosting iterations.
        callbacks = [optuna.integration.LightGBMPruningCallback(trial, 'rmse')] if fold == 0 else None

        # Fit the model
        model.fit(
            X_train_split, y_train_split,
            eval_set=[(X_valid_split, y_valid_split)],
            eval_metric='rmse',
            callbacks=callbacks
        )

        # Predict and calculate RMSE. `squared=False` was removed from
        # mean_squared_error in scikit-learn 1.6, so take the root explicitly.
        y_pred = model.predict(X_valid_split)
        fold_rmses.append(float(np.sqrt(mean_squared_error(y_valid_split, y_pred))))

    # The previous version returned from inside the loop, so the second fold
    # was never evaluated; average over every fold instead.
    return float(np.mean(fold_rmses))

# Preprocessing function to handle Label Encoding for categorical data
def preprocess_data(df):
    """Label-encode every object-dtype column of ``df`` and return ``df``.

    The frame is modified in place: each object column is overwritten with
    the integer codes produced by ``LabelEncoder.fit_transform``. The same
    frame object is returned for convenience.
    """
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    for column_name in object_columns:
        # fit_transform refits the shared encoder on each column in turn.
        encoded = encoder.fit_transform(df[column_name])
        df[column_name] = encoded

    return df

# Function to train LightGBM with hyperparameter tuning and feature selection
def train_lightgbm_with_tuning_and_feature_selection(file_path, target_column='Close_x', batch_size=10000, min_features=10):
    """Tune LightGBM with Optuna on two companies and return the best model.

    The parquet file is loaded, label-encoded with ``preprocess_data`` and
    restricted to the first two unique ``Company_ID`` values. A 50-trial
    Optuna study (Hyperband pruner) minimises ``objective``; a final
    ``LGBMRegressor`` is then fitted on all selected rows with the best
    parameters.

    Args:
        file_path: Path of the parquet dataset.
        target_column: Name of the regression target column.
        batch_size: Currently unused; kept for interface compatibility.
        min_features: Currently unused -- no feature selection is performed
            here despite the function name; kept for interface compatibility.

    Returns:
        The fitted ``LGBMRegressor``, or ``None`` when no trial completed.
    """
    print("Loading data...")

    # Load the dataset
    df = pd.read_parquet(file_path, engine='pyarrow')

    # Preprocess the dataset
    df = preprocess_data(df)

    # Select only two companies for modeling
    selected_companies = df['Company_ID'].unique()[:2]
    df = df[df['Company_ID'].isin(selected_companies)]

    # 'Company_ID' stays in the feature matrix; only the target and 'Date' are dropped.
    X_full = df.drop(columns=[target_column, 'Date'])
    y_full = df[target_column]

    # Optuna study with hyperparameter tuning
    pruner = HyperbandPruner()
    study = optuna.create_study(direction='minimize', pruner=pruner)
    study.optimize(lambda trial: objective(trial, X_full, y_full), n_trials=50)

    try:
        best_params = study.best_trial.params
    except ValueError:
        # Optuna raises ValueError when the study has no completed trial.
        print("No trial completed; no model was trained.")
        return None

    # Output the best parameters and results
    print("Best trial:", best_params)
    print("Best RMSE:", study.best_value)

    # Previously nothing was returned, so callers always saw None and
    # reported failure. Fit the winning configuration and hand it back.
    best_model = LGBMRegressor(**best_params)
    best_model.fit(X_full, y_full)
    return best_model

# Main function to train the model on two companies
def train_model_on_two_companies(file_path, target_column='Close_x', min_features=10):
    """Run the two-company tuning flow and report whether a model came back.

    Delegates to ``train_lightgbm_with_tuning_and_feature_selection`` and
    returns whatever it returns; a ``None`` result is reported as a failure.
    """
    print("Training LightGBM on two companies with feature selection and hyperparameter tuning...")
    model = train_lightgbm_with_tuning_and_feature_selection(
        file_path, target_column, min_features=min_features
    )
    status_message = (
        "Training was not successful."
        if model is None
        else "Training completed with the best model."
    )
    print(status_message)
    return model

# Example usage: run the two-company Optuna tuning flow (50 trials) on
# 'final_outliers_2.parquet'.
file_path = 'final_outliers_2.parquet'
train_model_on_two_companies(file_path, min_features=10)


In [None]:
Summary of Results for Least Estimators:
Companies_10_Least_Estimators: RMSE = 2.669920816356922
Companies_50_Least_Estimators: RMSE = 26.897224035199706
Companies_100_Least_Estimators: RMSE = 3.5614469055576667
Companies_500_Least_Estimators: RMSE = 22.465645727070743

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Preprocessing function to handle Label Encoding for categorical data
def preprocess_data(df):
    """Replace each object-dtype column of ``df`` with integer label codes.

    Works in place on the frame that is passed in and returns that same
    frame; columns of any other dtype are left untouched.
    """
    label_encoder = LabelEncoder()
    text_columns = list(df.select_dtypes(include=['object']).columns)
    for name in text_columns:
        # One encoder instance is reused; fit_transform refits it per column.
        df[name] = label_encoder.fit_transform(df[name])

    return df

# Function to perform feature selection based on feature importance
def feature_selection(X_train, y_train, min_features):
    """Pick features by LightGBM importance, keeping at least ``min_features``.

    A 100-tree ``LGBMRegressor`` is fitted on the full input and its
    ``feature_importances_`` are ranked.

    Args:
        X_train: Feature DataFrame; its column labels identify the features.
        y_train: Targets aligned with ``X_train``.
        min_features: Minimum number of features to return. When fewer
            features have non-zero importance, the highest-ranked
            zero-importance features fill the gap. The result can still be
            shorter if ``X_train`` has fewer columns than this.

    Returns:
        pandas Index of the selected column labels, most important first.
    """
    model = LGBMRegressor(n_estimators=100)  # Use a base model to get feature importance
    model.fit(X_train, y_train)

    feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)

    # Rank the complete series. The earlier version filtered to non-zero
    # importances before calling nlargest, so it could never return more
    # than the non-zero count and the `min_features` floor was not honoured.
    n_nonzero = int((feature_importances > 0).sum())
    n_select = max(min_features, n_nonzero)
    top_features = feature_importances.nlargest(n_select).index

    print(f"Selected {len(top_features)} features for training.")

    return top_features

# Function to train LightGBM model after feature selection
def train_lightgbm_with_feature_selection(X_train, y_train, params, selected_features):
    """Cross-validate an LGBMRegressor on a subset of the feature columns.

    Args:
        X_train: Feature DataFrame.
        y_train: Target Series aligned with ``X_train``.
        params: Keyword arguments for ``LGBMRegressor``.
        selected_features: Column labels of ``X_train`` to train on.

    Returns:
        Mean validation RMSE over a shuffled 2-fold split (seeded with 42).
    """
    X_train_selected = X_train[selected_features]

    kf = KFold(n_splits=2, shuffle=True, random_state=42)
    rmse_list = []

    for train_index, valid_index in kf.split(X_train_selected):
        X_train_split, X_valid_split = X_train_selected.iloc[train_index], X_train_selected.iloc[valid_index]
        y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]

        model = LGBMRegressor(**params)

        # Fit the model
        model.fit(X_train_split, y_train_split, eval_set=[(X_valid_split, y_valid_split)], eval_metric='rmse')

        # Predict and calculate RMSE. `squared=False` was removed from
        # mean_squared_error in scikit-learn 1.6, so take the root explicitly.
        y_pred = model.predict(X_valid_split)
        rmse = float(np.sqrt(mean_squared_error(y_valid_split, y_pred)))
        rmse_list.append(rmse)

    return float(np.mean(rmse_list))

# Function to train LightGBM on 500 random companies with dynamic feature selection
def train_on_500_companies_with_dynamic_feature_selection(file_path, target_column='Close_x', min_features=10, n_companies=500, random_state=None):
    """Train LightGBM on a random sample of companies after feature selection.

    Args:
        file_path: Path of the parquet dataset.
        target_column: Name of the regression target column.
        min_features: Forwarded to ``feature_selection``.
        n_companies: Number of distinct ``Company_ID`` values to sample
            (default 500). Capped at the number of companies available.
        random_state: Optional seed for the company sample. ``None`` keeps
            the previous behaviour of drawing from NumPy's global RNG.

    Returns:
        The mean cross-validated RMSE reported by
        ``train_lightgbm_with_feature_selection``.
    """
    print("Loading data...")

    # Load the dataset
    df = pd.read_parquet(file_path, engine='pyarrow')

    # Preprocess the dataset
    df = preprocess_data(df)

    # Select the random companies. Sampling without replacement raises when
    # more items are requested than exist, so cap the request first.
    company_ids = df['Company_ID'].unique()
    sample_size = min(n_companies, len(company_ids))
    if sample_size < n_companies:
        print(f"Only {sample_size} companies available; using all of them.")
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    selected_companies = rng.choice(company_ids, size=sample_size, replace=False)
    df = df[df['Company_ID'].isin(selected_companies)]

    # 'Company_ID' stays in the feature matrix; only the target and 'Date' are dropped.
    X_full = df.drop(columns=[target_column, 'Date'])
    y_full = df[target_column]

    # Fixed parameter set used for the final cross-validated run
    best_params_high_n_estimators = {
        'n_estimators': 2000,
        'learning_rate': 0.049412489369297465,
        'num_leaves': 65,
        'max_depth': 11,
        'min_child_weight': 2,
        'subsample': 0.7486537063053206,
        'colsample_bytree': 0.8938290344572073,
        'lambda_l1': 5.552076717390757,
        'lambda_l2': 5.664327006224713
    }

    # Select features dynamically based on feature importance
    selected_features = feature_selection(X_full, y_full, min_features)

    # The messages previously hard-coded "100 companies" regardless of the
    # number actually sampled; report the real sample size.
    print(f"\n--- Training for {sample_size} random companies with {len(selected_features)} selected features ---")
    rmse = train_lightgbm_with_feature_selection(X_full, y_full, best_params_high_n_estimators, selected_features)

    print(f"\nFinal RMSE for {sample_size} companies:", rmse)
    return rmse

# Example usage: sample companies from 'final_outliers_2.parquet', select
# features by importance and print the cross-validated RMSE.
file_path = 'final_outliers_2.parquet'
train_on_500_companies_with_dynamic_feature_selection(file_path, min_features=10)
