## Random Forest, XGBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Load the Excel file.
# The workbook is opened once here; its sheet names drive the per-company
# loop further down (one sheet is read per iteration).
nifty50_excel_file = 'combined_stock_data_single_sheet.xlsx'
xls = pd.ExcelFile(nifty50_excel_file)

# Number of days ahead to predict
days_to_predict = 10

# Names of the columns used as model inputs (X)
features = [
    'Open', 'High', 'Low', 'Adj Close', 'Volume', 'Previous_Close',
    '5SMA', '10SMA', '20SMA', '50SMA', '100SMA', '200SMA',
    '5EMA', '10EMA', '20EMA',
    'MACD', 'MACD_signal', 'RSI', 'PSAR', 'vortex_indicator',
    'Upper_Band', 'Lower_Band',
    'ATR5', 'ATR10', 'ATR20', 'ATR50',
    'Stoch_Signal', 'Stoch', 'WR', 'TSI', 'ADX', 'VWAP',
    'Daily_Return', 'Cumulative_Return',
    'ROC5', 'ROC10', 'ROC20', 'ROC50', 'ROC100', 'ROC200',
    'CMF', 'Daily_Log_Return',
]

# Two independent, initially empty result tables (one per model) that share
# the same column layout: the company name followed by its error metrics.
_result_columns = ['Company', 'RMSE', 'MSE', 'MAE', 'R-squared', 'MAPE']
all_results_rf, all_results_xgb = (
    pd.DataFrame(columns=_result_columns) for _ in range(2)
)

# Hyperparameter search spaces. Neither depends on the sheet being processed,
# so they are built once here rather than on every loop iteration.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

xgb_param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

# Name of the target column (y): the 'Close' value `days_to_predict` rows later
target = f'Close_{days_to_predict}_Days_Ahead'


def _regression_metrics(y_true, y_pred):
    """Compute the evaluation metrics for one model's predictions.

    Args:
        y_true: pandas Series holding the observed target values.
        y_pred: array of predictions, positionally aligned with ``y_true``.

    Returns:
        Tuple ``(rmse, mse, mae, r_squared, mape)``; ``mape`` is in percent.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # derived from the MSE computed above, not recomputed
    mae = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true.values - y_pred) / y_true.values)) * 100
    return rmse, mse, mae, r_squared, mape


def _append_result(results, new_rows):
    """Return ``results`` with ``new_rows`` added at the end.

    Replaces ``DataFrame.append``, which was removed in pandas 2.0.

    Args:
        results: DataFrame accumulated so far (may be empty).
        new_rows: DataFrame with the same columns holding the rows to add.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    if results.empty:
        # Concatenating with an empty placeholder frame triggers a dtype
        # FutureWarning on recent pandas; the new rows alone are the result.
        return new_rows.reset_index(drop=True)
    return pd.concat([results, new_rows], ignore_index=True)


# Loop through each sheet (company) in the Excel file
for sheet_name in xls.sheet_names:
    print(f"Processing data for {sheet_name}...")

    # Any failure for one company is reported and the loop moves on to the
    # next sheet (deliberate best-effort processing).
    try:
        # Reuse the already-open workbook instead of re-opening and
        # re-parsing the file from disk for every sheet.
        df = xls.parse(sheet_name)

        # Shift the 'Close' column to create the target variable (10 days ahead)
        df[target] = df['Close'].shift(-days_to_predict)

        # The last `days_to_predict` rows have no future close; drop them
        df = df.dropna(subset=[target])

        X = df[features]
        y = df[target]

        # Chronological hold-out: the final 20% of rows form the test set.
        # A shuffled split would place neighbouring rows (near-identical
        # features, overlapping future targets) on both sides of the split
        # and leak future information into training.
        # Assumes rows are in date order, which the shift-based target above
        # already relies on.
        # NOTE(review): the cv=5 folds inside the searches below are still
        # plain k-fold; consider TimeSeriesSplit for model selection.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )

        # Standardize the features (statistics are fitted on the training part only)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Random Forest: exhaustive grid search, then predict with the best model
        rf_grid_search = GridSearchCV(
            RandomForestRegressor(random_state=42), rf_param_grid, cv=5, n_jobs=-1
        )
        rf_grid_search.fit(X_train, y_train)
        rf_model = rf_grid_search.best_estimator_
        rf_y_pred = rf_model.predict(X_test)

        # XGBoost: randomized search over 20 sampled combinations.
        # random_state makes the sampled combinations reproducible, matching
        # the seeded estimators used elsewhere in this loop.
        xgb_random_search = RandomizedSearchCV(
            xgb.XGBRegressor(random_state=42), xgb_param_dist,
            cv=5, n_iter=20, n_jobs=-1, random_state=42
        )
        xgb_random_search.fit(X_train, y_train)
        xgb_model = xgb_random_search.best_estimator_
        xgb_y_pred = xgb_model.predict(X_test)

        # Calculate evaluation metrics for both models on the hold-out set
        rf_rmse, rf_mse, rf_mae, rf_r_squared, rf_mape = _regression_metrics(y_test, rf_y_pred)
        xgb_rmse, xgb_mse, xgb_mae, xgb_r_squared, xgb_mape = _regression_metrics(y_test, xgb_y_pred)

        # Store results in the DataFrames (one row per company and model)
        rf_result = pd.DataFrame({
            'Company': [sheet_name],
            'RMSE': [rf_rmse],
            'MSE': [rf_mse],
            'MAE': [rf_mae],
            'R-squared': [rf_r_squared],
            'MAPE': [rf_mape]
        })

        xgb_result = pd.DataFrame({
            'Company': [sheet_name],
            'RMSE': [xgb_rmse],
            'MSE': [xgb_mse],
            'MAE': [xgb_mae],
            'R-squared': [xgb_r_squared],
            'MAPE': [xgb_mape]
        })

        all_results_rf = _append_result(all_results_rf, rf_result)
        all_results_xgb = _append_result(all_results_xgb, xgb_result)

        print(f"Data for {sheet_name} processed.")

    except Exception as e:
        print(f"Error processing data for {sheet_name}: {str(e)}")

# Write each model's accumulated per-company metrics to its own CSV file
# (the DataFrame index is not written).
for results_table, csv_path in (
    (all_results_rf, 'random_forest_results.csv'),
    (all_results_xgb, 'xgboost_results.csv'),
):
    results_table.to_csv(csv_path, index=False)

print('Results saved to random_forest_results.csv and xgboost_results.csv')
