In [1]:
import polars as pl
import yaml
import gzip
# scikit-learn

from sklearn.preprocessing  import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from data_processor_polars import DataProcessor

### <span style="background-color: lightyellow;">Data retrieval task</span>
- Retrieve the data. (No cleaning needed yet)

In [2]:

def get_config(file_name):
    """Read a YAML file and return its parsed content.

    Parameters
    ----------
    file_name : path of the YAML file, opened as UTF-8 text.

    Returns
    -------
    Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(file_name, 'r', encoding="UTF-8") as yaml_file:
        return yaml.safe_load(yaml_file)


def parse_metadata(file_path):
    """Parse a gzip-compressed, tab-separated text file into a numeric DataFrame.

    Every line that contains a tab is split into a key (text before the first
    tab) and a remainder. A line is kept when its key is ``'!Sample_title'``
    or does not start with ``'!'``; each kept line becomes one column whose
    values are the tab-separated fields of the remainder.

    The ``'!Sample_title'`` and ``'"ID_REF"'`` columns are dropped afterwards
    and every remaining column is cast to ``Float64``.

    Parameters
    ----------
    file_path : path of the gzip file (presumably a GEO series-matrix
        file -- TODO confirm against the configured input).

    Returns
    -------
    polars.DataFrame with one column per kept line.
    """
    columns = {}
    with gzip.open(file_path, 'rt') as handle:
        for raw_line in handle:
            key, tab, remainder = raw_line.partition('\t')
            if not tab:
                # No tab on this line, so there is no key/value pair to read.
                continue

            key = key.strip()
            if key != '!Sample_title' and key.startswith('!'):
                # Metadata lines other than the sample titles are ignored.
                continue

            # Only the outermost quotes of the remainder are removed before
            # splitting; quotes around inner fields are left untouched.
            columns[key] = remainder.strip().strip('"').split("\t")

    return (
        pl.LazyFrame(columns)
        .drop(['!Sample_title'])
        .drop('"ID_REF"')
        # Non-strict cast of every remaining column to Float64.
        .with_columns(pl.all().cast(pl.Float64, strict=False))
        .collect()
    )


def retrieve_data():
    """Load both input files named in ``config.yaml`` and merge them column-wise.

    The ``lung3_csv`` entry is read with ``pl.read_csv`` and the ``gene``
    entry is parsed with ``parse_metadata``. Every column of the CSV frame is
    then added to the gene-expression frame.

    Returns
    -------
    polars.DataFrame holding the gene-expression columns plus the CSV columns.
    """
    settings = get_config('config.yaml')
    clinical_path = settings['lung3_csv']
    expression_path = settings['gene']

    clinical_df = pl.read_csv(clinical_path)
    expression_df = parse_metadata(expression_path)

    # NOTE(review): columns are attached by position, so this assumes both
    # frames list the samples in the same order -- confirm against the data.
    clinical_columns = [clinical_df[name] for name in clinical_df.columns]
    return expression_df.with_columns(clinical_columns)


def sub_classification(histology):
    """Map a histology description to a coarse class label.

    The match is a case-sensitive substring test. ``'Carcinoma'`` is checked
    first, then ``'Adenocarcinoma'``; anything else maps to ``'Others'``.

    Parameters
    ----------
    histology : histology description string.

    Returns
    -------
    ``'Carcinoma'``, ``'Adenocarcinoma'`` or ``'Others'``.
    """
    # Order matters: the first label found in the text wins.
    for label in ('Carcinoma', 'Adenocarcinoma'):
        if label in histology:
            return label
    return 'Others'

### <span style="background-color: lightyellow;">Feature engineering Task</span>

In [3]:
class FeatureProcessing(TransformerMixin, BaseEstimator):
    """Scikit-learn style transformer built on top of ``DataProcessor``.

    ``fit`` runs the column-selection steps of ``DataProcessor`` on the given
    frame and fits a ``StandardScaler`` and a ``OneHotEncoder`` on the result.
    ``transform`` restricts a frame to the columns kept during ``fit`` and
    applies the fitted scaler and encoder.

    Parameters
    ----------
    covariance_threshold : threshold forwarded to ``DataProcessor.cramerV``.
    quantile_percentage : value forwarded to
        ``DataProcessor.selecting_high_variance_gene_expression``.
    nan_threshold : value forwarded to ``DataProcessor.drop_nan_columns``.
    """

    def __init__(self, covariance_threshold=0, quantile_percentage=95, nan_threshold=35):
        self.covariance_threshold = covariance_threshold
        self.quantile_percentage = quantile_percentage
        self.nan_threshold = nan_threshold

    def fit(self, X, y=None):
        """Select the feature columns and fit the scaler and the encoder.

        Parameters
        ----------
        X : input frame handed to ``DataProcessor``.
        y : target values handed to ``DataProcessor.cramerV``.

        Returns
        -------
        self
        """
        processor = DataProcessor(X)

        # Column clean-up, delegated to DataProcessor.
        processor.remove_nonrelated_columns()
        processor.impute_notavailable_values('characteristics.tag.grade')
        processor.drop_nan_columns(self.nan_threshold)

        # Keep only the string columns that cramerV selected.
        processor.cramerV(y, self.covariance_threshold)
        self.covarrianced_columns = processor.covarrianced_columns
        string_columns = set(processor.find_cols_on_type(pl.String))
        unselected_columns = string_columns - set(self.covarrianced_columns)
        processor.drop_columns(column_list=list(unselected_columns))

        processor.selecting_high_variance_gene_expression(self.quantile_percentage)
        self.features = processor.dataframe.columns

        # The scaler is fitted on the Float64 columns only.
        float_columns = processor.find_cols_on_type(pl.Float64)
        self.scaler = StandardScaler()
        self.scaler.fit(processor.dataframe[float_columns])

        # The encoder is fitted on the columns selected by cramerV.
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.one_hot_encoder.fit(processor.dataframe[processor.covarrianced_columns])

        self.processed_df = processor.dataframe
        return self

    def transform(self, X):
        """Apply the fitted column selection, scaling and encoding to ``X``.

        Parameters
        ----------
        X : input frame; it must contain every column stored in
            ``self.features``.

        Returns
        -------
        The processed frame with remaining nulls replaced by 0.
        """
        processor = DataProcessor(X)
        processor.dataframe = processor.dataframe[self.features]
        processor.fit_standard_scaling(self.scaler)
        processor.encoding_catagorical_features(self.one_hot_encoder, self.covarrianced_columns)
        processor.dataframe = processor.dataframe.fill_null(value=0)
        return processor.dataframe


### <span style="background-color: lightyellow;">Pipeline Task</span>


In [4]:
from sklearn.preprocessing import LabelEncoder
from codecarbon import track_emissions
def return_train_test_split():
    """Load the data, derive the encoded target and split into train/test.

    The target is the ``characteristics.tag.histology`` column mapped through
    ``sub_classification`` and then integer-encoded with a ``LabelEncoder``.
    That column is removed from the feature frame.

    Returns
    -------
    (train_X, test_X, train_y, test_y) as produced by ``train_test_split``
    with ``random_state=42``.
    """
    target_column = 'characteristics.tag.histology'
    dataset = retrieve_data()

    histology_classes = dataset[target_column].map_elements(
        sub_classification, return_dtype=pl.String
    )
    label_encoder = LabelEncoder()
    y_encoded = pl.DataFrame(label_encoder.fit_transform(histology_classes)).to_series()
    features = dataset.drop(target_column)

    # Split before any preprocessing so nothing is learned from the test
    # rows (prevents data leakage).
    train_X, test_X, train_y, test_y = train_test_split(features, y_encoded, random_state=42)
    return train_X, test_X, train_y, test_y


In [5]:
@track_emissions
def run_feature_processing():
    """Fit ``FeatureProcessing`` on the training split and transform it.

    The function is wrapped in ``track_emissions``; the transformed frame is
    not kept or returned.
    """
    train_X, _test_X, train_y, _test_y = return_train_test_split()
    print("###### Feature processing ######")
    # fit() returns the transformer itself, so the two steps can be chained.
    FeatureProcessing().fit(train_X, train_y).transform(train_X)


# Run the tracked feature-processing step when this cell executes.
run_feature_processing()

[codecarbon INFO @ 21:34:47] [setup] RAM Tracking...
[codecarbon INFO @ 21:34:47] [setup] GPU Tracking...
[codecarbon INFO @ 21:34:47] No GPU found.
[codecarbon INFO @ 21:34:47] [setup] CPU Tracking...
[codecarbon INFO @ 21:34:48] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
[codecarbon INFO @ 21:34:48] >>> Tracker's metadata:
[codecarbon INFO @ 21:34:48]   Platform system: Linux-6.1.0-32-amd64-x86_64-with-glibc2.36
[codecarbon INFO @ 21:34:48]   Python version: 3.12.3
[codecarbon INFO @ 21:34:48]   CodeCarbon version: 2.2.2
[codecarbon INFO @ 21:34:48]   Available RAM : 880.353 GB
[codecarbon INFO @ 21:34:48]   CPU count: 80
[codecarbon INFO @ 21:34:48]   CPU model: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
[codecarbon INFO @ 21:34:48]   GPU count: None
[codecarbon INFO @ 21:34:48]   GPU model: None


###### Feature processing ######


[codecarbon INFO @ 21:35:00] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 21:35:00] Energy consumed for RAM : 0.000816 kWh. RAM Power : 330.1325168609619 W
[codecarbon INFO @ 21:35:00] Energy consumed for all CPUs : 0.000105 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:35:00] 0.000921 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:35:00] Done!

