In [42]:

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from missforest import MissForest

class RemoveSamplesWithManyMissingValues(BaseEstimator, TransformerMixin):
    """Drop rows that contain too many missing values.

    A row is kept when its number of NaN entries is at most
    ``threshold * n_columns``; rows above that limit are removed.

    Parameters
    ----------
    threshold : float, default=0.5
        Largest tolerated fraction of missing entries in a single row.
    """

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the transformer learns nothing from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of ``X`` whose NaN count is within the allowed limit.

        Parameters
        ----------
        X : pandas.DataFrame
            Table to filter. Only rows are removed; columns are untouched.

        Returns
        -------
        pandas.DataFrame
            The retained rows, in their original order and with their
            original index labels.
        """
        # Absolute limit: the allowed fraction scaled by the number of columns.
        max_missing = X.shape[1] * self.threshold
        missing_per_row = X.isna().sum(axis=1)
        # Select with a positional boolean mask instead of index labels: a
        # label lookup would also return every *other* row sharing a kept
        # label when the index contains duplicates.
        keep_mask = (missing_per_row <= max_missing).to_numpy()
        return X.loc[keep_mask]

class Log2Transformer(BaseEstimator, TransformerMixin):
    """Apply an element-wise base-2 logarithm after flooring small values.

    Every value below ``floor`` (zeros and negatives included) is raised to
    ``floor`` before the logarithm is taken, so ``log2`` is never evaluated
    at zero or at a negative number. NaN entries pass through as NaN.

    Parameters
    ----------
    floor : float, default=1e-9
        Smallest value allowed to reach the logarithm.
    """

    def __init__(self, floor=1e-9):
        self.floor = floor

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the transformer learns nothing from ``X``."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``log2`` of ``X`` with values below ``floor`` raised to it.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric table to transform.

        Returns
        -------
        pandas.DataFrame
            Frame with the same index and columns as ``X``.
        """
        # Vectorised flooring: gives the same values as a per-element
        # ``x < floor`` check, without a Python-level call per cell and
        # without relying on ``DataFrame.map`` (pandas >= 2.1 only).
        floored = X.clip(lower=self.floor)
        return np.log2(floored)

# Preprocessing chain as one scikit-learn Pipeline: drop rows with too many
# NaNs, impute the remaining gaps with MissForest, then log2-transform.
# NOTE(review): nothing in the visible code calls this pipeline; the steps are
# executed one by one further down, there with MissForest(max_iter=10) --
# confirm which configuration is the intended one.
pipeline = Pipeline(
    steps=[
        ('remove_missing_proteins', RemoveSamplesWithManyMissingValues(threshold=0.5)),
        ('imputer', MissForest(categorical=None)),
        ('log2_transform', Log2Transformer()),
    ]
)

# Run the preprocessing steps one at a time, printing the table after each
# stage, and write the final result to a new CSV file.

raw_csv_path = "/Users/emirhanyagmur/Desktop/CSF/PEA/syn52282088_PEA_proteomics_2.csv"
clean_csv_path = "/Users/emirhanyagmur/Desktop/syn52282088_PEA_proteomics_2_clean.csv"

# Load the raw table and show what was read.
data = pd.read_csv(raw_csv_path)
print("Original data shape:", data.shape)
print(data.head())

# The unnamed first CSV column becomes the 'ID' row index.
data = data.rename(columns={'Unnamed: 0': 'ID'}).set_index('ID')

# Zeros are treated as missing measurements from here on.
data = data.replace(0, np.nan)
print("After converting zeros to NaNs:")
print(data.head())

# Stage 1: drop rows in which more than half of the columns are NaN.
row_filter = RemoveSamplesWithManyMissingValues(threshold=0.5)
data = row_filter.transform(data)
print("After RemoveSamplesWithManyMissingValues shape:", data.shape)
print(data.head())

# Stage 2: fill the remaining gaps with MissForest (at most 10 iterations).
imputer = MissForest(categorical=None, max_iter=10)
data = imputer.fit_transform(data)
print("After MissForest shape:", data.shape)
print(data.head())

# Stage 3: log2-transform the imputed values.
log2_step = Log2Transformer()
data_transformed = log2_step.transform(data)
print("After Log2Transformer shape:", data_transformed.shape)
print(data_transformed.head())

# Save the transformed data to a new CSV file, labelled with the imputed
# table's row index and column names.
data_transformed_df = pd.DataFrame(
    data_transformed,
    columns=data.columns,
    index=data.index,
)
data_transformed_df.to_csv(clean_csv_path)




Original data shape: (639, 426)
  Unnamed: 0  PEA1_17120103  PEA1_17120104  PEA1_17120107  PEA1_17120109  \
0     O60462       1.191990       1.327450       1.088250       0.956650   
1     Q8N126       6.764735       6.874475       6.794415       6.781885   
2     O95185       4.597555       4.820505       4.418655       4.097365   
3     Q2TAL6       4.613770       4.765410       4.122860       4.433440   
4     Q9Y336       3.653210       3.417620       3.289040       3.104620   

   PEA1_17120111  PEA1_17120112  PEA1_17120113  PEA1_17120114  PEA1_17120115  \
0       1.215530       1.422600       1.415700       1.525990       1.628530   
1       6.829805       6.788815       6.912295       6.847515       6.735885   
2       4.512355       4.440325       4.722855       5.045965       5.186225   
3       4.934210       4.828130       4.881700       5.291640       5.297550   
4       3.125970       3.872120       3.326490       3.614250       3.664670   

   ...  PEA1_18080444  PEA1_18

100%|██████████| 10/10 [23:32<00:00, 141.29s/it]
100%|██████████| 10/10 [00:00<00:00, 22.83it/s]


After MissForest shape: (639, 425)
        PEA1_17120103  PEA1_17120571  PEA1_17120570  PEA1_17120566  \
ID                                                                   
O60462       1.191990       1.578220       2.146910       1.501060   
Q8N126       6.764735       6.887015       6.744225       6.868215   
O95185       4.597555       4.649505       5.209405       4.798315   
Q2TAL6       4.613770       5.324090       5.097760       5.190760   
Q9Y336       3.653210       4.204270       3.899820       3.083590   

        PEA1_17120565  PEA1_17120564  PEA1_17120563  PEA1_17120562  \
ID                                                                   
O60462       1.912550       1.642690       2.773370       2.062580   
Q8N126       6.900905       6.751935       6.749435       6.792055   
O95185       4.701865       4.846125       5.443885       4.884535   
Q2TAL6       4.864020       5.408040       5.656690       5.286450   
Q9Y336       3.912810       4.286590       5.145130   