# Load Dataset

In [None]:
import pandas as pd
import numpy as np

# Load the raw price history from CSV.
df = pd.read_csv('/content/BAT_DATASET.csv')
# Column names are assigned by position; pandas raises if the file does not
# have exactly six columns.
df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
df['Date'] = pd.to_datetime(df['Date'])

# Inspect the dataset
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()


  df['Date'] = pd.to_datetime(df['Date'])


        Date   Open   High    Low  Close  Volume
0 2023-03-30  518.7  518.7  518.7  518.7   324.0
1 2023-03-29  518.7  518.7  518.7  518.7   700.0
2 2023-03-28  518.7  518.7  518.7  518.7   277.0
3 2023-03-27  518.7  518.7  518.7  518.7   692.0
4 2023-03-23  518.7  518.7  518.7  518.7   109.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4584 entries, 0 to 4583
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    4584 non-null   datetime64[ns]
 1   Open    4584 non-null   float64       
 2   High    4584 non-null   float64       
 3   Low     4584 non-null   float64       
 4   Close   4584 non-null   float64       
 5   Volume  4584 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 215.0 KB
None


# Create Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial Features (degree 2, no constant bias column)
poly_input_cols = ['Open', 'High', 'Low', 'Close']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[poly_input_cols])
poly_feature_names = poly.get_feature_names_out(poly_input_cols)

# Build the frame on df's own index so the column-wise concat below aligns
# row-for-row even when df does not carry a default RangeIndex.
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)

# The expansion also returns the degree-1 terms ('Open', 'High', 'Low',
# 'Close'). Concatenating those would leave df with duplicate column names,
# after which df['High'] selects a two-column DataFrame instead of a Series.
# Only columns df does not already have are appended, which also makes
# re-running this cell a no-op.
new_poly_cols = [name for name in poly_feature_names if name not in df.columns]
df = pd.concat([df, poly_df[new_poly_cols]], axis=1)

print("Added polynomial features:")
print(poly_df.head())


Added polynomial features:
    Open   High    Low  Close     Open^2  Open High   Open Low  Open Close  \
0  518.7  518.7  518.7  518.7  269049.69  269049.69  269049.69   269049.69   
1  518.7  518.7  518.7  518.7  269049.69  269049.69  269049.69   269049.69   
2  518.7  518.7  518.7  518.7  269049.69  269049.69  269049.69   269049.69   
3  518.7  518.7  518.7  518.7  269049.69  269049.69  269049.69   269049.69   
4  518.7  518.7  518.7  518.7  269049.69  269049.69  269049.69   269049.69   

      High^2   High Low  High Close      Low^2  Low Close    Close^2  
0  269049.69  269049.69   269049.69  269049.69  269049.69  269049.69  
1  269049.69  269049.69   269049.69  269049.69  269049.69  269049.69  
2  269049.69  269049.69   269049.69  269049.69  269049.69  269049.69  
3  269049.69  269049.69   269049.69  269049.69  269049.69  269049.69  
4  269049.69  269049.69   269049.69  269049.69  269049.69  269049.69  


# Create Interaction Features

In [None]:
import pandas as pd
import numpy as np

# Load dataset
# NOTE: this rebinds df to a fresh read of the CSV, so any columns added to
# df before this cell are not present afterwards.
df = pd.read_csv('/content/BAT_DATASET.csv')
df.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
df['Date'] = pd.to_datetime(df['Date'])

# Debugging: Check the structure of the dataset
print(df.head())
# DataFrame.info() prints its own report and returns None; print(df.info())
# would add a stray "None" line.
df.info()

# Ensure 'High' and 'Low' are Series
print("Checking data types of 'High' and 'Low':")
print(type(df['High']), type(df['Low']))

# Ensure 'High' and 'Low' are numeric; values that cannot be parsed become NaN
df['High'] = pd.to_numeric(df['High'], errors='coerce')
df['Low'] = pd.to_numeric(df['Low'], errors='coerce')

# Replace any NaN with the column mean.
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): that form operates on an intermediate
# object and, per the pandas FutureWarning, stops updating df in pandas 3.0.
df['High'] = df['High'].fillna(df['High'].mean())
df['Low'] = df['Low'].fillna(df['Low'].mean())

# Debugging: Check for missing values and data types again
print("After conversion and filling missing values:")
print(df[['High', 'Low']].isnull().sum())
print(df[['High', 'Low']].head())

# Calculate interaction features (element-wise products of price columns)
df['High_Low_Interaction'] = df['High'] * df['Low']
df['Open_Close_Interaction'] = df['Open'] * df['Close']

# Debugging: Check the new interaction features
print("Interaction features added:")
print(df[['High_Low_Interaction', 'Open_Close_Interaction']].head())

# Save the updated dataset to a new CSV file (row index is not written)
df.to_csv('BATB_with_interaction_features.csv', index=False)
print("Feature engineering completed. Updated dataset saved as 'BATB_with_interaction_features.csv'.")


  df['Date'] = pd.to_datetime(df['Date'])


        Date   Open   High    Low  Close  Volume
0 2023-03-30  518.7  518.7  518.7  518.7   324.0
1 2023-03-29  518.7  518.7  518.7  518.7   700.0
2 2023-03-28  518.7  518.7  518.7  518.7   277.0
3 2023-03-27  518.7  518.7  518.7  518.7   692.0
4 2023-03-23  518.7  518.7  518.7  518.7   109.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4584 entries, 0 to 4583
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    4584 non-null   datetime64[ns]
 1   Open    4584 non-null   float64       
 2   High    4584 non-null   float64       
 3   Low     4584 non-null   float64       
 4   Close   4584 non-null   float64       
 5   Volume  4584 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 215.0 KB
None
Checking data types of 'High' and 'Low':
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
After conversion and filling missing values:
High    0
Low     0
dtyp

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['High'].fillna(df['High'].mean(), inplace=True)  # Replace NaN with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Low'].fillna(df['Low'].mean(), inplace=True)


# Create Domain-Specific Features

In [None]:
# Price features derived row by row from the Open/High/Low/Close columns.
intraday_span = df['High'] - df['Low']

df['Daily_Range'] = intraday_span                      # absolute high-low spread
df['Volatility'] = intraday_span / df['Open']          # spread relative to the open
df['Close_to_Open_Ratio'] = df['Close'] / df['Open']   # close expressed as a multiple of the open

domain_feature_cols = ['Daily_Range', 'Volatility', 'Close_to_Open_Ratio']
print("Added domain-specific features:")
print(df[domain_feature_cols].head())


Added domain-specific features:
   Daily_Range  Volatility  Close_to_Open_Ratio
0          0.0         0.0                  1.0
1          0.0         0.0                  1.0
2          0.0         0.0                  1.0
3          0.0         0.0                  1.0
4          0.0         0.0                  1.0


# Apply Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the four price columns, then project them onto two
# principal components and store each component as its own column.
pca_input_cols = ['Open', 'High', 'Low', 'Close']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[pca_input_cols])

pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_features)

# pca_features has one column per component, in component order.
df[['PCA1', 'PCA2']] = pca_features

print("Added PCA features:")
print(df[['PCA1', 'PCA2']].head())


Added PCA features:
       PCA1      PCA2
0  3.705931  0.031668
1  3.705931  0.031668
2  3.705931  0.031668
3  3.705931  0.031668
4  3.705931  0.031668


# Apply Log Transformation

In [None]:
# Log-transform Volume. log1p evaluates log(1 + x), so a zero volume maps to 0
# instead of -inf.
raw_volume = df['Volume']
df['Log_Volume'] = np.log1p(raw_volume)

log_preview_cols = ['Volume', 'Log_Volume']
print("Applied log transformation on Volume:")
print(df[log_preview_cols].head())


Applied log transformation on Volume:
   Volume  Log_Volume
0   324.0    5.783825
1   700.0    6.552508
2   277.0    5.627621
3   692.0    6.541030
4   109.0    4.700480


# Normalize and Standardize

In [None]:
from sklearn.preprocessing import MinMaxScaler

price_cols = ['Open', 'High', 'Low', 'Close']
norm_cols = ['Open_Norm', 'High_Norm', 'Low_Norm', 'Close_Norm']
std_cols = ['Open_Std', 'High_Std', 'Low_Std', 'Close_Std']

# Normalization: min-max scaled copies of the price columns
min_max_scaler = MinMaxScaler()
df[norm_cols] = min_max_scaler.fit_transform(df[price_cols])

# Standardization: `scaler` is not created in this cell; it must already exist
# (a StandardScaler is bound to that name earlier in the notebook) and is
# refit here on the same price columns.
df[std_cols] = scaler.fit_transform(df[price_cols])

print("Applied normalization and standardization:")
print(df[norm_cols + std_cols[:2]].head())


Applied normalization and standardization:
   Open_Norm  High_Norm  Low_Norm  Close_Norm  Open_Std  High_Std
0   0.690326   0.684863  0.717094    0.705019   1.85096   1.83017
1   0.690326   0.684863  0.717094    0.705019   1.85096   1.83017
2   0.690326   0.684863  0.717094    0.705019   1.85096   1.83017
3   0.690326   0.684863  0.717094    0.705019   1.85096   1.83017
4   0.690326   0.684863  0.717094    0.705019   1.85096   1.83017


# Binning

In [None]:
# labels=False makes both calls return integer bin codes instead of intervals.

# Equal-width binning: five same-width intervals across Daily_Range
daily_range_codes = pd.cut(df['Daily_Range'], bins=5, labels=False)

# Equal-frequency binning: four quantile buckets of Volume
volume_codes = pd.qcut(df['Volume'], q=4, labels=False)

df['Daily_Range_Bin'] = daily_range_codes
df['Volume_Bin'] = volume_codes

bin_cols = ['Daily_Range_Bin', 'Volume_Bin']
print("Applied binning:")
print(df[bin_cols].head())


Applied binning:
   Daily_Range_Bin  Volume_Bin
0                0           0
1                0           0
2                0           0
3                0           0
4                0           0


# Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The encoder below reads a 'Month' column. Derive it from 'Date' when it is
# not already present, so this cell does not depend on leftover kernel state.
# pd.to_datetime is a no-op when 'Date' is already datetime.
if 'Month' not in df.columns:
    df['Month'] = pd.to_datetime(df['Date']).dt.month

# One-Hot Encoding
# sparse_output=False returns a dense array (this keyword replaces the older
# 'sparse' argument).
encoder = OneHotEncoder(sparse_output=False)
month_encoded = encoder.fit_transform(df[['Month']])

# Build the frame on df's own index so the column-wise concat aligns
# row-for-row even when df does not carry a default RangeIndex.
month_encoded_df = pd.DataFrame(
    month_encoded,
    columns=encoder.get_feature_names_out(['Month']),
    index=df.index,
)
df = pd.concat([df, month_encoded_df], axis=1)

print("One-hot encoding completed.")
print(df.head())


One-hot encoding completed.
        Date   Open   High    Low  Close  Volume  High_Low_Interaction  \
0 2023-03-30  518.7  518.7  518.7  518.7   324.0             269049.69   
1 2023-03-29  518.7  518.7  518.7  518.7   700.0             269049.69   
2 2023-03-28  518.7  518.7  518.7  518.7   277.0             269049.69   
3 2023-03-27  518.7  518.7  518.7  518.7   692.0             269049.69   
4 2023-03-23  518.7  518.7  518.7  518.7   109.0             269049.69   

   Open_Close_Interaction  Daily_Range  Volatility  ...  Month_3  Month_4  \
0               269049.69          0.0         0.0  ...      1.0      0.0   
1               269049.69          0.0         0.0  ...      1.0      0.0   
2               269049.69          0.0         0.0  ...      1.0      0.0   
3               269049.69          0.0         0.0  ...      1.0      0.0   
4               269049.69          0.0         0.0  ...      1.0      0.0   

   Month_5  Month_6  Month_7  Month_8  Month_9  Month_10  Month_

# Imputation

In [None]:
from sklearn.impute import KNNImputer

# Mean imputation: copy of Volume with NaN replaced by the column mean
volume_mean = df['Volume'].mean()
df['Volume_Imputed_Mean'] = df['Volume'].fillna(volume_mean)

# KNN imputation over the price and volume columns (5 neighbours);
# the imputed array is written back as five new columns in the same order.
knn_source_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
knn_target_cols = ['Open_Imputed', 'High_Imputed', 'Low_Imputed', 'Close_Imputed', 'Volume_Imputed_KNN']

imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(df[knn_source_cols])
df[knn_target_cols] = imputed_data

print("Applied imputation:")
print(df[['Volume_Imputed_Mean', 'Volume_Imputed_KNN']].head())


Applied imputation:
   Volume_Imputed_Mean  Volume_Imputed_KNN
0                324.0               324.0
1                700.0               700.0
2                277.0               277.0
3                692.0               692.0
4                109.0               109.0


# Save File


In [None]:
# Write the engineered frame to CSV; index=False leaves the row index out.
engineered_csv_path = 'BATB_feature_engineered.csv'
df.to_csv(engineered_csv_path, index=False)
print("Feature engineering completed and saved to 'BATB_feature_engineered.csv'.")



Feature engineering completed and saved to 'BATB_feature_engineered.csv'.


# 1. Filter Methods

**1.1 Correlation Analysis**

In [None]:
# Pairwise correlation between the columns of df
correlation_matrix = df.corr()

# Keep only entries whose absolute correlation exceeds 0.8; every other cell
# of the matrix becomes NaN (the matrix keeps its full shape).
strong_mask = correlation_matrix.abs() > 0.8
high_correlation = correlation_matrix[strong_mask]
print("Highly correlated features:\n", high_correlation)


Highly correlated features:
                             Date      Open      High       Low     Close  \
Date                    1.000000  0.891622  0.891854  0.891847  0.892019   
Open                    0.891622  1.000000  0.999874  0.999869  0.999813   
High                    0.891854  0.999874  1.000000  0.999772  0.999884   
Low                     0.891847  0.999869  0.999772  1.000000  0.999899   
Close                   0.892019  0.999813  0.999884  0.999899  1.000000   
Volume                       NaN       NaN       NaN       NaN       NaN   
High_Low_Interaction         NaN  0.953480  0.953177  0.953178  0.952892   
Open_Close_Interaction       NaN  0.953451  0.953081  0.953087  0.952832   
Daily_Range                  NaN       NaN       NaN       NaN       NaN   
Volatility                   NaN       NaN       NaN       NaN       NaN   
Close_to_Open_Ratio          NaN       NaN       NaN       NaN       NaN   
PCA1                    0.891885  0.999944  0.999938  0.999

**1.2 Mutual Information**

In [None]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
# Load the engineered dataset from CSV (rebinds df)
df = pd.read_csv('/content/BATB_feature_engineered.csv')

# Define features (X) and target (y)
# NOTE(review): X still contains columns computed from 'Close' itself
# (e.g. 'Close_Imputed', which tops the ranking), so the scores below include
# target leakage - consider excluding such columns before selecting features.
X = df.drop(['Close', 'Date'], axis=1)  # Drop target and the date column
y = df['Close']

# Compute mutual information between each feature and the target.
# random_state is fixed so the ranking is the same on every run.
mutual_info = mutual_info_regression(X, y, random_state=42)

# Create a DataFrame for results, highest score first
mutual_info_df = pd.DataFrame({
    'Feature': X.columns,
    'Mutual_Information': mutual_info
}).sort_values(by='Mutual_Information', ascending=False)

print("Top Features by Mutual Information:\n", mutual_info_df.head(10))

# Select the ten highest-scoring features
selected_features_mi = mutual_info_df['Feature'].head(10).tolist()
X_mi = X[selected_features_mi]


Top Features by Mutual Information:
                Feature  Mutual_Information
8        Close_Imputed            6.882125
2                  Low            5.052659
7          Low_Imputed            5.051396
6         High_Imputed            4.943950
1                 High            4.942723
5         Open_Imputed            4.619163
0                 Open            4.618248
3               Volume            0.528728
4  Volume_Imputed_Mean            0.528252
9   Volume_Imputed_KNN            0.527463


**2. Wrapper Methods**

**2.1 Recursive Feature Elimination (RFE)**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Restrict X to its float64 / int64 columns before fitting
X_numeric = X.select_dtypes(include=['float64', 'int64'])

# Recursive feature elimination around a linear regression, keeping 10 features
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X_numeric, y)

# support_ is a boolean mask over X_numeric's columns marking the kept features
kept_mask = rfe.support_
rfe_selected_features = X_numeric.columns[kept_mask]
print("Selected features by RFE:\n", rfe_selected_features)

# Feature matrix reduced to the selected columns
X_rfe = X_numeric[rfe_selected_features]



Selected features by RFE:
 Index(['Open', 'High', 'Low', 'Volume', 'Volume_Imputed_Mean', 'Open_Imputed',
       'High_Imputed', 'Low_Imputed', 'Close_Imputed', 'Volume_Imputed_KNN'],
      dtype='object')


# Save Selected Features

In [None]:
from google.colab import files

file_name = 'selected_features.csv'

# Selected feature columns with the target appended as the last column
selected_features_df = pd.concat([X_rfe, y], axis=1)

# Write to CSV without the row index
selected_features_df.to_csv(file_name, index=False)
print(f"Selected features saved to '{file_name}'.")

# Trigger a browser download of the file through the Colab helper
files.download(file_name)


Selected features saved to 'selected_features.csv'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>