In [1]:
import os

In [2]:
%pwd

'/Users/c5367983/Desktop/Projects/QuantileX/research'

In [3]:
os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    assets_type: str


In [5]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir= config.root_dir,
            data_path = config.data_path + "/" + self.config.data_ingestion.assets_type ,
            assets_type =config.assets_type
        )

        return data_transformation_config


In [7]:
import os
from src.logging import logger
from datetime import datetime
from datasets import load_dataset, load_from_disk
from ta import add_all_ta_features
import numpy as np
import pandas as pd

In [8]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.df = pd.read_csv(self.config.data_path + "/" + self.config.assets_type + "_2015-01-01_" +datetime.now().strftime('%Y-%m-%d') + ".csv")
        self.preprocessed_df = None
        self.continuous_cols = []
        self.categorical_cols = []

    def preprocess(self):
        preprocessed_df = self.df.copy()
        preprocessed_df['Date'] = pd.to_datetime(preprocessed_df['Date'])
        preprocessed_df = preprocessed_df.set_index('Date')
        preprocessed_df['Timestamp'] = preprocessed_df.index.astype(np.int64)//10**9
        preprocessed_df['Year'] = preprocessed_df.index.year
        preprocessed_df['Month'] = preprocessed_df.index.month
        preprocessed_df['Day'] = preprocessed_df.index.day
        preprocessed_df['DayOfWeek'] = preprocessed_df.index.dayofweek
        preprocessed_df['MA7_Close'] = preprocessed_df['Close'].rolling(window=7).mean()
        preprocessed_df['MA30_Close'] = preprocessed_df['Close'].rolling(window=30).mean()
        preprocessed_df['Lag1_Close'] = preprocessed_df['Close'].shift(1)
        preprocessed_df['Volume_Change_Pct'] = preprocessed_df['Volume'].pct_change()
        preprocessed_df['target'] = np.where(preprocessed_df['Close'] < preprocessed_df['Open'], 0, 1)
        preprocessed_df['target'] = preprocessed_df['target'].shift(-1)

        preprocessed_df_with_ta = add_all_ta_features(preprocessed_df, open="Open", high="High", low="Low", close="Close", volume="Volume")
        
        preprocessed_df_with_ta = self.drop_cols(preprocessed_df_with_ta)
        
        self.preprocessed_df = preprocessed_df_with_ta

    def identify_column_type(self, threshold_unique = 100):
             
        for col in self.preprocessed_df.columns:
            unique_values = self.preprocessed_df[col].nunique(dropna=False)
            has_floats = any(self.preprocessed_df[col].apply(lambda x: isinstance(x,float)))
            if has_floats:
                self.continuous_cols.append(col)
                continue
            if unique_values <= threshold_unique:
                self.categorical_cols.append(col)
            else:
                self.continuous_cols.append(col)

    def impute_missing_values(self):
        for continuous_col in self.continuous_cols:
            self.preprocessed_df[continuous_col].fillna(self.preprocessed_df[continuous_col].mean(), inplace = True)
        for cotegorical_col in self.categorical_cols:
            mode_value = self.preprocessed_df[cotegorical_col].mode()[0]
            self.preprocessed_df[cotegorical_col].fillna(mode_value, inplace=True)
   
    def drop_cols(self, df):
        for column in df.columns:
            max_count = df[column].value_counts().max()
            if max_count/len(df) > 0.8:
                df.drop(column, axis=1, inplace=True)
        return df
    
    def save_data(self):
        if not os.path.exists(self.config.root_dir):
            os.makedirs(self.config.root_dir)
            
        filepath = os.path.join(self.config.root_dir, f"{self.config.assets_type}.csv")
        self.preprocessed_df.to_csv(filepath, index=True)
        print(f"Data saved successfully to {filepath}")

    
    def convert(self):
        self.preprocess()
        self.identify_column_type()
        self.impute_missing_values()
        self.save_data()
    

In [9]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    raise e
 

[2024-02-25 20:20:28,918: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-02-25 20:20:28,919: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-25 20:20:28,920: INFO: common: Created directory at: artifacts]
[2024-02-25 20:20:28,920: INFO: common: Created directory at: artifacts/data_transformation]


  self._psar[i] = high2
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.preprocessed_df[continuous_col].fillna(self.preprocessed_df[continuous_col].mean(), inplace = True)


Data saved successfully to artifacts/data_transformation/BTC-USD.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.preprocessed_df[cotegorical_col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.preprocessed_df[cotegorical_col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the in