In [1]:
import pandas as pd
import yaml
import gzip
# scikit-learn

from sklearn.preprocessing  import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from data_processor import DataProcessor


### <span style="background-color: lightyellow;">Data retrieval task</span>

In [2]:

def get_config(file_name):
    """Read a YAML file and return its parsed content.

    Parameters
    ----------
    file_name : str or path-like
        Location of the YAML file; it is opened as UTF-8 text.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(file_name, mode='r', encoding="UTF-8") as config_file:
        return yaml.safe_load(config_file)


def parse_metadata(file_path):
    """Turn a gzip-compressed, tab-separated text file into a numeric DataFrame.

    Each line is cut at its first tab. A line is kept when its first field is
    exactly ``!Sample_title`` or does not begin with ``!``; lines without a tab
    are ignored. The first field (whitespace-stripped only, so surrounding quote
    characters stay part of the name) becomes a column label and the remaining
    tab-separated fields become that column's values.

    Parameters
    ----------
    file_path : str or path-like
        Path of the gzip file, opened in text mode.

    Returns
    -------
    pandas.DataFrame
        One column per kept line, excluding ``!Sample_title`` and ``"ID_REF"``,
        with every value converted to a number (unparseable values become NaN).

    Raises
    ------
    KeyError
        If the ``!Sample_title`` or ``"ID_REF"`` column is not present.
    """
    columns_by_label = {}
    with gzip.open(file_path, 'rt') as handle:
        for raw_line in handle:
            label_part, tab, remainder = raw_line.partition('\t')
            if not tab:
                # No tab on this line, so there is no value part to record.
                continue
            label = label_part.strip()
            if label != '!Sample_title' and label.startswith('!'):
                continue
            # Only the outermost quotes of the whole remainder are removed
            # before splitting, so inner fields may keep their own quotes.
            columns_by_label[label] = remainder.strip().strip('"').split("\t")

    wide = pd.DataFrame.from_dict(columns_by_label, orient='index').transpose()
    wide = wide.drop(columns=['!Sample_title'])
    wide = wide.drop(columns=['"ID_REF"'])
    return wide.apply(pd.to_numeric, errors='coerce')


def retrieve_data(config_path='config.yaml'):
    """Load the clinical table and the gene-expression table and join them.

    Parameters
    ----------
    config_path : str or path-like, optional
        YAML configuration file providing the ``lung3_csv`` and ``gene`` entries.
        Defaults to ``'config.yaml'``, the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Result of merging the CSV read from ``lung3_csv`` with the frame that
        ``parse_metadata`` builds from ``gene``, joined on the row index.
    """
    config = get_config(config_path)
    lung3 = config['lung3_csv']
    gene = config['gene']

    lung3_df = pd.read_csv(lung3)
    gene_expression_df = parse_metadata(gene)

    # NOTE(review): the join is on the default row index, i.e. purely positional.
    # This presumably relies on both files listing the samples in the same
    # order; rows without a counterpart are dropped silently. Confirm.
    combined_df = lung3_df.merge(gene_expression_df, left_index=True, right_index=True)

    return combined_df

def sub_classification(histology):
    """Collapse a detailed histology description into a coarse class label.

    The substring checks are case-sensitive. A description containing
    ``"Carcinoma"`` yields ``'Carcinoma'``; otherwise one containing
    ``"Adenocarcinoma"`` yields ``'Adenocarcinoma'``; anything else yields
    ``'Others'``.

    Parameters
    ----------
    histology : str, float or None
        Histology description. ``None`` and float values (such as the NaN that
        pandas uses for empty cells) are treated as unknown.

    Returns
    -------
    str
        ``'Carcinoma'``, ``'Adenocarcinoma'`` or ``'Others'``.
    """
    # A missing value is not a string; the `in` test below would raise
    # TypeError on it, so classify it as 'Others' up front.
    if histology is None or isinstance(histology, float):
        return 'Others'
    if "Carcinoma" in histology:
        return 'Carcinoma'
    if "Adenocarcinoma" in histology:
        return 'Adenocarcinoma'
    return 'Others'
    


### <span style="background-color: lightyellow;">Feature engineering Task</span>




In [3]:

class FeatureProcessing(TransformerMixin, BaseEstimator):
    """Scikit-learn style transformer built around the project's ``DataProcessor``.

    ``fit`` selects the columns to keep and fits a ``StandardScaler`` and a
    ``OneHotEncoder``; ``transform`` applies those fitted objects to a frame.

    Parameters
    ----------
    covariance_threshold : number, default 0
        Passed to ``DataProcessor.cramerV`` together with the target.
    quantile_percentage : number, default 95
        Passed to ``DataProcessor.selecting_high_variance_gene_expression``.
    nan_threshold : number, default 35
        Passed to ``DataProcessor.drop_nan_columns``.
    """

    def __init__(self, covariance_threshold=0, quantile_percentage=95, nan_threshold=35):
        self.covariance_threshold = covariance_threshold
        self.quantile_percentage = quantile_percentage
        self.nan_threshold = nan_threshold

    def fit(self, X, y=None):
        """Choose the feature columns and fit the scaler and the encoder.

        After this call the instance exposes ``covarrianced_columns``,
        ``features``, ``scaler``, ``one_hot_encoder`` and ``processed_df``.

        NOTE(review): what each ``DataProcessor`` step does, and whether it
        modifies ``X`` itself, is not visible here - confirm in data_processor.

        Returns
        -------
        FeatureProcessing
            The fitted instance itself.
        """
        processor = DataProcessor(X)
        processor.remove_nonrelated_columns()
        processor.impute_notavailable_values('characteristics.tag.grade')
        processor.drop_nan_columns(self.nan_threshold)
        # NOTE: the change_column_datatype() step is currently disabled.

        processor.cramerV(y, self.covariance_threshold)
        self.covarrianced_columns = processor.covarrianced_columns

        # Object-typed columns that cramerV did not retain are discarded.
        object_columns = set(processor.find_cols_on_type('object'))
        rejected_columns = object_columns - set(self.covarrianced_columns)
        processor.drop_columns(column_list=list(rejected_columns))

        processor.selecting_high_variance_gene_expression(self.quantile_percentage)
        self.features = processor.dataframe.columns

        float_columns = processor.find_cols_on_type('float64')
        self.scaler = StandardScaler()
        self.scaler.fit(processor.dataframe[float_columns])

        categorical_frame = processor.dataframe[processor.covarrianced_columns]
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.one_hot_encoder.fit(categorical_frame)

        self.processed_df = processor.dataframe
        return self

    def transform(self, X):
        """Apply the fitted column selection, scaler and encoder to ``X``.

        The frame is restricted to ``self.features``, handed to the
        ``DataProcessor`` scaling and encoding steps, and any remaining
        missing values are replaced with 0.

        Returns
        -------
        The ``dataframe`` attribute held by the ``DataProcessor`` after those steps.
        """
        processor = DataProcessor(X)
        processor.dataframe = processor.dataframe[self.features]
        processor.fit_standard_scaling(self.scaler)
        processor.encoding_catagorical_features(self.one_hot_encoder, self.covarrianced_columns)
        processor.dataframe.fillna(0, inplace=True)

        return processor.dataframe




### <span style="background-color: lightyellow;">Pipeline Task</span>


In [4]:
from sklearn.preprocessing import LabelEncoder
from codecarbon import track_emissions

def return_train_test_split():
    """Load the data, derive the encoded target and split into train and test.

    The target is built from the ``characteristics.tag.histology`` column via
    ``sub_classification`` and integer-encoded with a ``LabelEncoder``; that
    column is then removed from the features.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_y, test_y)``. All four objects carry a fresh
        0..n-1 index, so each feature frame and its label series line up by
        index as well as by position.
    """
    encoder = LabelEncoder()
    X = retrieve_data()
    y = X['characteristics.tag.histology'].apply(sub_classification)
    y_encoded = pd.DataFrame(encoder.fit_transform(y), columns=['classes']).classes
    X = X.drop(columns='characteristics.tag.histology')

    # Split before any preprocessing step in order to prevent data leakage.
    train_X, test_X, train_y, test_y = train_test_split(X, y_encoded, random_state=42)

    # train_test_split keeps the shuffled original index labels. Previously only
    # the label series were re-indexed, leaving features and labels with
    # different indices; any index-aligned operation downstream would then pair
    # the wrong rows. Reset all four so they agree.
    train_X = train_X.reset_index(drop=True)
    test_X = test_X.reset_index(drop=True)
    train_y = train_y.reset_index(drop=True)
    test_y = test_y.reset_index(drop=True)
    return train_X, test_X, train_y, test_y
    


In [5]:
@track_emissions
def run_feature_processing():
    """Fit ``FeatureProcessing`` on the training split and transform that split.

    The function is wrapped by codecarbon's ``track_emissions`` decorator. The
    transformed frame is not kept and nothing is returned.
    """
    features_train, _features_test, labels_train, _labels_test = return_train_test_split()
    print("###### Feature processing ######")
    feature_step = FeatureProcessing()
    feature_step.fit(features_train, labels_train)
    feature_step.transform(features_train)

run_feature_processing()

[codecarbon INFO @ 21:34:00] [setup] RAM Tracking...
[codecarbon INFO @ 21:34:00] [setup] GPU Tracking...
[codecarbon INFO @ 21:34:00] No GPU found.
[codecarbon INFO @ 21:34:00] [setup] CPU Tracking...
[codecarbon INFO @ 21:34:01] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
[codecarbon INFO @ 21:34:01] >>> Tracker's metadata:
[codecarbon INFO @ 21:34:01]   Platform system: Linux-6.1.0-32-amd64-x86_64-with-glibc2.36
[codecarbon INFO @ 21:34:01]   Python version: 3.12.3
[codecarbon INFO @ 21:34:01]   CodeCarbon version: 2.2.2
[codecarbon INFO @ 21:34:01]   Available RAM : 880.353 GB
[codecarbon INFO @ 21:34:01]   CPU count: 80
[codecarbon INFO @ 21:34:01]   CPU model: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
[codecarbon INFO @ 21:34:01]   GPU count: None
[codecarbon INFO @ 21:34:01]   GPU model: None


###### Feature processing ######


[codecarbon INFO @ 21:34:19] Energy consumed for RAM : 0.001390 kWh. RAM Power : 330.1325168609619 W
[codecarbon INFO @ 21:34:19] Energy consumed for all CPUs : 0.000179 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:34:19] 0.001569 kWh of electricity used since the beginning.
  cramer.fillna(value=0, inplace=True)
[codecarbon INFO @ 21:34:22] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 21:34:22] Energy consumed for RAM : 0.001478 kWh. RAM Power : 330.1325168609619 W
[codecarbon INFO @ 21:34:22] Energy consumed for all CPUs : 0.000190 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 21:34:22] 0.001668 kWh of electricity used since the beginning.
[codecarbon INFO @ 21:34:22] Done!

