In [1]:
# import os
# import pandas as pd

# # Function to import feature datasets from features folder
# def import_feature_datasets(folder_path):
#     datasets = {}
#     for filename in os.listdir(folder_path):
#         if filename.endswith('.csv'):
#             stock_name = filename.split('features')[0].strip('_')
#             dataset_path = os.path.join(folder_path, filename)
#             datasets[stock_name] = pd.read_csv(dataset_path)
#     return datasets

# # Path to the features folder
# features_folder_path = 'features/'

# # Import feature datasets
# feature_datasets = import_feature_datasets(features_folder_path)

# # Display the imported datasets
# for stock_name, dataset in feature_datasets.items():
#     print(f"Dataset for {stock_name}:")
#     print(dataset.head())


In [2]:
import os
import pandas as pd

# Function to import feature datasets from features folder
def import_feature_datasets(folder_path):
    datasets = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            stock_name = filename.split('_features')[0].strip('_')
            dataset_path = os.path.join(folder_path, filename)
            # Read the CSV file and set the 'Date' column as the index
            dataset = pd.read_csv(dataset_path, index_col='Date', parse_dates=True)
            datasets[stock_name] = dataset
    return datasets

# Path to the features folder
features_folder_path = 'features/'

# Import feature datasets
feature_datasets = import_feature_datasets(features_folder_path)

# Display the imported datasets
for stock_name, dataset in feature_datasets.items():
    print(f"Dataset for {stock_name}:")
    print(dataset.head())


Dataset for HDFC_Bank_Limited:
                 Open       High        Low      Close  Adj Close   Volume  \
Date                                                                         
2000-05-19  25.205000  26.049999  25.205000  25.575001  21.421576  2169050   
2000-05-22  25.600000  25.790001  24.325001  24.885000  20.843628  1101500   
2000-05-23  24.700001  24.700001  21.905001  23.375000  19.578856  2039100   
2000-05-24  23.000000  25.245001  22.010000  24.209999  20.278248  3198410   
2000-05-25  24.250000  24.750000  23.010000  23.360001  19.566296  1576310   

            daily_returns          rsi        ma       std  ...  trend_high  \
Date                                                        ...               
2000-05-19      -0.117160   488.300638  25.25875  0.959711  ...         NaN   
2000-05-22      -2.697949 -1114.436831  25.34250  0.836719  ...         NaN   
2000-05-23      -6.067913  -174.784884  25.35650  0.798609  ...        -1.0   
2000-05-24       3.572189  

In [3]:
import numpy as np

# Function to add new columns to datasets
def add_new_columns(datasets):
    for stock_name, dataset in datasets.items():
        # Calculate daily returns
        dataset['daily_returns'] = dataset['Close'].pct_change() * 100
        mean_return = dataset['daily_returns'].mean()
        p = mean_return * 10

        # Define the conditions and apply them to create 'expected_returns'
        conditions = [
            (dataset['daily_returns'] >= p),
            (dataset['daily_returns'] <= -1 * p),
            (np.abs(dataset['daily_returns']) - 1 * p < p)
        ]
        choices = [1, -1, 0]
        dataset['expected_returns'] = np.select(conditions, choices, default=0)

        # Drop NaN values introduced by the calculation
        dataset.fillna(0, inplace=True)

        # Shift the 'expected_returns' column by one day to get the next day's expected return
        dataset['next_day_expected_returns'] = dataset['expected_returns'].shift(-1)

        # Drop NaN values introduced by shifting
        dataset.dropna(inplace=True)

        # Save the modified dataset
        dataset.to_csv(f"features/{stock_name}_features.csv", index=False)

# Add new columns to imported datasets
add_new_columns(feature_datasets)


In [4]:
# import os
# from joblib import dump
# import json
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import classification_report
# import matplotlib.pyplot as plt


# # Function to save y_test datasets
# def save_y_test(y_test_datasets, folder_path):
#     for stock_name, y_test in y_test_datasets.items():
#         file_path = os.path.join(folder_path, f'{stock_name}_y_test.csv')
#         # Set 'Date' column as index and convert it to datetime index
#         y_test.set_index('Date', inplace=True)
#         y_test.index = pd.to_datetime(y_test.index)
#         print(y_test)
#         y_test.to_csv(file_path)
#         print(f"Saved {stock_name}_y_test.csv in {folder_path}")

# # Train models on each dataset, print the classification report, and save the model, report, and y_test
# def train_models_on_datasets(features_datasets, models_folder, reports_folder, y_test_folder):
#     if not os.path.exists(models_folder):
#         os.makedirs(models_folder)
#     if not os.path.exists(reports_folder):
#         os.makedirs(reports_folder)
#     if not os.path.exists(y_test_folder):
#         os.makedirs(y_test_folder)
    
#     models_with_reports = {}
#     for stock_name, dataset in features_datasets.items():
#         print(f'Training model for {stock_name} dataset...')
#         # Drop NaN values
#         dataset.fillna(0, inplace=True)
#         # Drop the last row as it will have NaN in the 'next_day_expected_returns' column
#         dataset = dataset.dropna()
        
#         print(dataset)
        
#         # Define features and target variable
#         features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'daily_returns',
#                     'rsi', 'ma', 'std', 'upper_band', 'lower_band', 'ma_5', 'ma_20',
#                     'ma_50', 'ma_100', 'volume_ma_5', 'volume_ma_20', 'pct_change_1d',
#                     'pct_change_3d', 'pct_change_5d', 'high_1d', 'low_1d', 'high_3d',
#                     'low_3d', 'high_5d', 'low_5d', 'high_8d', 'low_8d', 'high_13d',
#                     'low_13d', 'high_21d', 'low_21d', 'peak', 'trough', 'day', 'month',
#                     'week', 'year', 'quarter', 'weekday', 'high_roll_max', 'low_roll_min',
#                     'head_shoulder_pattern', 'trend_high', 'trend_low', 'channel_pattern',
#                     'double_pattern', 'close_roll_max', 'close_roll_min',
#                     'multiple_top_bottom_pattern', 'slope', 'intercept', 'support',
#                     'resistance', 'triangle_pattern', 'wedge_pattern']

#         target = 'next_day_expected_returns'
        
#         # Select features and target variable
#         X = dataset[features]
#         y = dataset[target]

#         # Split the data into training and testing sets
#         X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2, random_state=42)
        

#         # Save the y_test dataset
#         y_test_file_path = os.path.join(y_test_folder, f'{stock_name}_y_test.csv')
#         y_test.reset_index().rename(columns={'index': 'Date'}).to_csv(y_test_file_path, index=False)
#         print(f'Saved {stock_name}_y_test.csv in {y_test_folder}')

#         # Create a linear regression model
#         model = LinearRegression()

#         # Train the model
#         model.fit(X_train, y_train)

#         # Make predictions on the test set
#         y_pred = model.predict(X_test)

#         # Convert predicted values to binary (1, -1, 0)
#         y_pred_binary = np.round(y_pred)

#         # Convert predicted values to binary (1, 0, -1) based on a threshold
#         threshold = 0.5  # You can adjust this threshold based on your preference
#         y_pred_binary = np.where(y_pred > threshold, 1, np.where(y_pred < -threshold, -1, 0))

#         # Print the classification report
#         report = classification_report(y_test, y_pred_binary)
#         print(f'Classification Report for {stock_name}:\n{report}')

#         # Save the classification report
#         report_file_path = os.path.join(reports_folder, f'{stock_name}_classification_report.json')
#         with open(report_file_path, 'w') as report_file:
#             json.dump(report, report_file)
#         print(f'Classification Report for {stock_name} saved successfully at {report_file_path}')
        
#         # Save the trained model
#         model_file_path = os.path.join(models_folder, f'{stock_name}_model.joblib')
#         dump(model, model_file_path)
#         print(f'Model for {stock_name} saved successfully at {model_file_path}')

#         # Store the trained model with the classification report
#         models_with_reports[stock_name] = (model, report)

#         print(f'Model trained for {stock_name} dataset')

#     return models_with_reports

# # Define folder paths
# models_folder = 'models'
# reports_folder = 'classification_reports'
# y_test_folder = 'y_test'

# # Train models on each dataset, print the classification report, and save the model, report, and y_test
# models_with_reports = train_models_on_datasets(feature_datasets, models_folder, reports_folder, y_test_folder)

# # Access the trained models and their classification reports
# for stock_name, (model, report) in models_with_reports.items():
#     print(f'Model for {stock_name}: {model}')
#     print(f'Classification Report:\n{report}\n')


In [5]:
import os
from joblib import dump
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Train models on each dataset, print the classification report, and save the model, report, and y_test
def train_models_on_datasets(features_datasets, models_folder, reports_folder, y_test_folder):
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    if not os.path.exists(reports_folder):
        os.makedirs(reports_folder)
    if not os.path.exists(y_test_folder):
        os.makedirs(y_test_folder)
    
    models_with_reports = {}
    for stock_name, dataset in features_datasets.items():
        print(f'Training model for {stock_name} dataset...')
        # Drop NaN values
        dataset.fillna(0, inplace=True)
        # Drop the last row as it will have NaN in the 'next_day_expected_returns' column
        dataset = dataset.dropna()
        
        print(dataset)
        
        # Define features and target variable
        features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'daily_returns',
                    'rsi', 'ma', 'std', 'upper_band', 'lower_band', 'ma_5', 'ma_20',
                    'ma_50', 'ma_100', 'volume_ma_5', 'volume_ma_20', 'pct_change_1d',
                    'pct_change_3d', 'pct_change_5d', 'high_1d', 'low_1d', 'high_3d',
                    'low_3d', 'high_5d', 'low_5d', 'high_8d', 'low_8d', 'high_13d',
                    'low_13d', 'high_21d', 'low_21d', 'peak', 'trough', 'day', 'month',
                    'week', 'year', 'quarter', 'weekday', 'high_roll_max', 'low_roll_min',
                    'head_shoulder_pattern', 'trend_high', 'trend_low', 'channel_pattern',
                    'double_pattern', 'close_roll_max', 'close_roll_min',
                    'multiple_top_bottom_pattern', 'slope', 'intercept', 'support',
                    'resistance', 'triangle_pattern', 'wedge_pattern']

        target = 'next_day_expected_returns'
        
        # Select features and target variable
        X = dataset[features]
        y = dataset[target]

        # Convert the index to datetime if it's not already
        if not isinstance(y.index, pd.DatetimeIndex):
            y.index = pd.to_datetime(y.index)

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2, random_state=42)
        
        # Create a DataFrame for y_test with 'Date' as index
        y_test_df = pd.DataFrame({'Date': y_test.index, 'next_day_expected_returns': y_test})
        y_test_df.set_index('Date', inplace=True)  # Set 'Date' column as index

        # Save the y_test dataset
        y_test_file_path = os.path.join(y_test_folder, f'{stock_name}_y_test.csv')
        y_test_df.to_csv(y_test_file_path)
        print(f'Saved {stock_name}_y_test.csv in {y_test_folder}')

        # Create a linear regression model
        model = LinearRegression()

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Convert predicted values to binary (1, -1, 0)
        y_pred_binary = np.round(y_pred)

        # Convert predicted values to binary (1, 0, -1) based on a threshold
        threshold = 0.5  # You can adjust this threshold based on your preference
        y_pred_binary = np.where(y_pred > threshold, 1, np.where(y_pred < -threshold, -1, 0))

        # Print the classification report
        report = classification_report(y_test, y_pred_binary)
        print(f'Classification Report for {stock_name}:\n{report}')

        # Save the classification report
        report_file_path = os.path.join(reports_folder, f'{stock_name}_classification_report.json')
        with open(report_file_path, 'w') as report_file:
            json.dump(report, report_file)
        print(f'Classification Report for {stock_name} saved successfully at {report_file_path}')
        
        # Save the trained model
        model_file_path = os.path.join(models_folder, f'{stock_name}_model.joblib')
        dump(model, model_file_path)
        print(f'Model for {stock_name} saved successfully at {model_file_path}')

        # Store the trained model with the classification report
        models_with_reports[stock_name] = (model, report)

        print(f'Model trained for {stock_name} dataset')

    return models_with_reports

# Define folder paths
models_folder = 'models'
reports_folder = 'classification_reports'
y_test_folder = 'y_test'

# Train models on each dataset, print the classification report, and save the model, report, and y_test
models_with_reports = train_models_on_datasets(feature_datasets, models_folder, reports_folder, y_test_folder)

# Access the trained models and their classification reports
for stock_name, (model, report) in models_with_reports.items():
    print(f'Model for {stock_name}: {model}')
    print(f'Classification Report:\n{report}\n')


Training model for HDFC_Bank_Limited dataset...
                   Open         High          Low        Close    Adj Close  \
Date                                                                          
2000-05-19    25.205000    26.049999    25.205000    25.575001    21.421576   
2000-05-22    25.600000    25.790001    24.325001    24.885000    20.843628   
2000-05-23    24.700001    24.700001    21.905001    23.375000    19.578856   
2000-05-24    23.000000    25.245001    22.010000    24.209999    20.278248   
2000-05-25    24.250000    24.750000    23.010000    23.360001    19.566296   
...                 ...          ...          ...          ...          ...   
2024-03-26  1427.199951  1437.949951  1422.150024  1425.400024  1425.400024   
2024-03-27  1423.550049  1447.949951  1421.250000  1440.699951  1440.699951   
2024-03-28  1440.699951  1460.500000  1440.699951  1447.900024  1447.900024   
2024-04-01  1458.000000  1473.800049  1455.599976  1470.500000  1470.500000   
2024

Classification Report for ITC_Limited:
              precision    recall  f1-score   support

        -1.0       0.51      0.25      0.33       298
         0.0       0.47      0.50      0.49       553
         1.0       0.54      0.73      0.62       340

    accuracy                           0.50      1191
   macro avg       0.51      0.49      0.48      1191
weighted avg       0.50      0.50      0.49      1191

Classification Report for ITC_Limited saved successfully at classification_reports\ITC_Limited_classification_report.json
Model for ITC_Limited saved successfully at models\ITC_Limited_model.joblib
Model trained for ITC_Limited dataset
Training model for Kotak_Mahindra_Bank_Limited dataset...
                   Open         High          Low        Close    Adj Close  \
Date                                                                          
2001-11-16     2.422500     2.422500     2.422500     2.422500     2.241737   
2001-11-19     2.500000     2.690000     2.500000

Classification Report for Larsen_&_Toubro_Limited:
              precision    recall  f1-score   support

        -1.0       0.32      0.13      0.19       180
         0.0       0.60      0.75      0.67       645
         1.0       0.40      0.31      0.35       234

    accuracy                           0.55      1059
   macro avg       0.44      0.40      0.40      1059
weighted avg       0.51      0.55      0.52      1059

Classification Report for Larsen_&_Toubro_Limited saved successfully at classification_reports\Larsen_&_Toubro_Limited_classification_report.json
Model for Larsen_&_Toubro_Limited saved successfully at models\Larsen_&_Toubro_Limited_model.joblib
Model trained for Larsen_&_Toubro_Limited dataset
Training model for Oil_and_Natural_Gas_Corporation_Limited dataset...
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2000-05-19   12.861111   12.861111   12.222222   12.75

                   Open         High          Low        Close    Adj Close  \
Date                                                                          
2002-12-27    55.650002    55.875000    53.787498    54.012501    38.259472   
2002-12-30    54.000000    57.724998    52.762501    57.150002    40.481888   
2002-12-31    57.525002    61.087502    56.000000    60.112499    42.580364   
2003-01-01    59.987499    61.974998    59.987499    61.087502    43.271000   
2003-01-02    61.525002    62.924999    57.912498    58.299999    41.296482   
...                 ...          ...          ...          ...          ...   
2024-03-26  3875.000000  3946.699951  3871.449951  3877.500000  3877.500000   
2024-03-27  3888.500000  3895.000000  3829.399902  3840.899902  3840.899902   
2024-03-28  3850.100098  3915.000000  3840.500000  3876.300049  3876.300049   
2024-04-01  3897.699951  3933.300049  3888.050049  3916.750000  3916.750000   
2024-04-02  3890.000000  3909.850098  3873.000000  3

In [6]:
# import os
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt

# # Define folder paths
# features_folder = 'features'
# y_test_folder = 'y_test'

# # Load datasets for each stock
# features_datasets = {}
# y_test_datasets = {}

# # Iterate through each stock folder in the features folder
# for stock_name in os.listdir(features_folder):
#     # Construct paths to features and y_test files for the current stock
#     features_file = os.path.join(features_folder, stock_name, f'{stock_name}_features.csv')
#     y_test_file = os.path.join(y_test_folder, stock_name, f'{stock_name}_y_test.csv')
    
#     # Check if both features and y_test files exist
#     if os.path.isfile(features_file) and os.path.isfile(y_test_file):
#         # Read features and y_test files into DataFrames
#         print(f"Reading features data for {stock_name} from: {features_file}")
#         features_datasets[stock_name] = pd.read_csv(features_file)
#         print(features_datasets[stock_name].head())  # Print the first few rows of features data for debugging
#         print(f"Reading y_test data for {stock_name} from: {y_test_file}")
#         y_test_datasets[stock_name] = pd.read_csv(y_test_file)
#         print(y_test_datasets[stock_name].head())  # Print the first few rows of y_test data for debugging

# # Plot cumulative returns for each stock
# for stock_name, features_data in features_datasets.items():
#     if stock_name in y_test_datasets:
#         # Assuming 'y_test' is your Series with predictions indexed by date
#         predictions_series = y_test_datasets[stock_name]['next_day_expected_returns']
        
#         # Convert index to datetime if it's not already, to ensure alignment
#         predictions_series.index = pd.to_datetime(predictions_series.index)
#         features_data.index = pd.to_datetime(features_data.index)

#         # Extend the index of features_data to include the next day's data for the last date in predictions_series.index
#         last_date = predictions_series.index[-1] + pd.Timedelta(days=1)
#         features_data = pd.concat([features_data, pd.DataFrame(index=[last_date])])

#         # Align predictions with daily returns
#         # Here we multiply the predictions directly with the corresponding daily returns from features_data
#         strategy_returns = predictions_series * features_data['daily_returns']
#         strategy_returns = strategy_returns.clip(lower=-1)

#         # Calculate cumulative returns
#         cumulative_returns = (1 + strategy_returns / 100).cumprod() - 1

#         # Plot cumulative returns
#         plt.figure(figsize=(10, 6))
#         plt.plot(cumulative_returns.index, cumulative_returns, label='Cumulative Returns')
#         plt.xlabel('Date')
#         plt.ylabel('Cumulative Returns')
#         plt.title(f'{stock_name} - Strategy Cumulative Returns')
#         plt.legend()
#         plt.show()


In [7]:
# import os
# import pandas as pd

# # Define the path to the features folder

# # Initialize a dictionary to store the datasets
# features_datasets = {}

# # Iterate through each file in the features folder
# for file_name in os.listdir(features_folder):
#     # Check if the file is a CSV file
#     if file_name.endswith('.csv'):
#         # Extract the stock name from the file name
#         stock_name = file_name.split('_')[0]
#         # Read the CSV file and store it in the dictionary with the stock name as key
#         features_datasets[stock_name] = pd.read_csv(os.path.join(features_folder, file_name))

# # Print the loaded datasets
# for stock_name, dataset in features_datasets.items():
#     print(f'Dataset for {stock_name}:')
#     print(dataset.head())  # Print the first few rows of each dataset


In [8]:
# import os
# import pandas as pd

# # Define the path to the y_test folder
# y_test_folder = 'y_test'

# # Initialize a dictionary to store the datasets
# y_test_datasets = {}

# # Iterate through each file in the y_test folder
# for file_name in os.listdir(y_test_folder):
#     # Check if the file is a CSV file
#     if file_name.endswith('.csv'):
#         # Extract the stock name from the file name
#         stock_name = file_name.split('_')[0]
#         # Read the CSV file and store it in the dictionary with the stock name as key
#         y_test_datasets[stock_name] = pd.read_csv(os.path.join(y_test_folder, file_name))

# # Print the loaded datasets
# for stock_name, dataset in y_test_datasets.items():
#     print(f'Dataset for {stock_name}:')
#     print(dataset.head())  # Print the first few rows of each dataset
