In [1]:
import pandas as pd
import numpy as np

In [2]:
# Inputs are supplied by Snakemake, in this order:
#   input[0] -> derived traits, input[1] -> sample groups, input[2] -> clinical data.
# For an interactive run outside Snakemake, the files were previously read from:
#   ../../results/data/prepared/filtered_derived_traits.csv
#   ../../results/data/prepared/groups.csv
#   ../../results/data/prepared/clinical.csv
# Every table uses its first column as the index.
trait_data, groups, clinical = (
    pd.read_csv(snakemake.input[position], index_col=0) for position in range(3)
)

In [3]:
# Keep the traits of interest and merge CA3 and CA4 into a single summed trait.
# The whole transformation is one chain that returns a new frame: assigning a
# column onto the result of a column-list selection (as done previously) is the
# pattern that triggers pandas' SettingWithCopyWarning.
trait_data = (
    trait_data[['CG', 'TB', 'TF', 'CA3', 'CA4']]
    # 'CA3+CA4' is not a valid Python identifier, hence the dict unpacking.
    .assign(**{'CA3+CA4': lambda df: df['CA3'] + df['CA4']})
    # The two source columns are no longer needed once summed.
    .drop(columns=['CA3', 'CA4'])
)
trait_data

Unnamed: 0_level_0,CG,TB,TF,CA3+CA4
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S1,0.911867,0.079516,0.307541,0.106579
S100,0.949564,0.078383,0.252681,0.153346
S101,0.948655,0.054226,0.203946,0.145298
S104,0.942385,0.055878,0.252844,0.202150
S105,0.956603,0.080068,0.279007,0.142945
...,...,...,...,...
S95,0.931637,0.060952,0.242750,0.130128
S96,0.901379,0.071040,0.357324,0.158077
S97,0.944712,0.079063,0.252689,0.156449
S98,0.935696,0.054904,0.340483,0.257529


In [4]:
# Assemble the modelling table: traits of all non-QC samples joined (on the index)
# with four clinical columns, each of which is then log-transformed.
data = (
    trait_data
    .join(groups)
    .query('group != "QC"')   # discard rows whose group is "QC"
    .drop(columns='group')    # the group label was only needed for the filter
    .join(clinical[['AST', 'ALB', 'TBIL', 'GGT']])
    .assign(
        TBIL=lambda df: np.log(df['TBIL']),
        AST=lambda df: np.log(df['AST']),
        # ALB is mirrored around its maximum (max - ALB + 1) before taking the log.
        ALB=lambda df: np.log(df['ALB'].max() - df['ALB'] + 1),
        GGT=lambda df: np.log(df['GGT']),
    )
)
data

Unnamed: 0_level_0,CG,TB,TF,CA3+CA4,AST,ALB,TBIL,GGT
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S1,0.911867,0.079516,0.307541,0.106579,3.871201,2.243896,2.541602,3.828641
S100,0.949564,0.078383,0.252681,0.153346,2.944439,1.568616,2.610070,2.995732
S101,0.948655,0.054226,0.203946,0.145298,3.465736,2.186051,2.468100,4.174387
S104,0.942385,0.055878,0.252844,0.202150,4.584967,2.704711,3.194583,5.420535
S105,0.956603,0.080068,0.279007,0.142945,3.091042,2.174752,2.602690,2.708050
...,...,...,...,...,...,...,...,...
S95,0.931637,0.060952,0.242750,0.130128,3.178054,2.351375,2.714695,3.044522
S96,0.901379,0.071040,0.357324,0.158077,3.583519,2.978586,2.602690,4.060443
S97,0.944712,0.079063,0.252689,0.156449,3.295837,2.401525,2.610070,3.178054
S98,0.935696,0.054904,0.340483,0.257529,4.077537,3.109061,3.100092,5.043425


In [5]:
# Predictor columns of `data` (the derived traits).
traits = [
    'CG',
    'TB',
    'TF',
    'CA3+CA4',
]

# Response columns of `data` (the log-transformed clinical markers).
liver_markers = [
    'AST',
    'ALB',
    'TBIL',
    'GGT',
]

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score

def get_r2(features: list[str], target: str) -> float:
    """Cross-validated R2 of a linear regression of `target` on `features`.

    Both arguments are column names of the module-level ``data`` frame.
    Predictions for every row are produced with 10-fold cross-validation
    and then scored against the observed target values in a single
    ``r2_score`` call.

    Args:
        features: Columns of ``data`` used as predictors.
        target: Column of ``data`` to predict.

    Returns:
        The R2 score of the cross-validated predictions.
    """
    observed = data[target]
    predicted = cross_val_predict(
        LinearRegression(),
        data[features],
        observed,
        cv=10,
    )
    return r2_score(observed, predicted)

In [8]:
from collections import namedtuple
from itertools import combinations

Record = namedtuple('Record', 'features target r2')

# Every non-empty subset of the traits, smallest subsets first.
feature_subsets = [
    subset
    for subset_size in range(1, len(traits) + 1)
    for subset in combinations(traits, subset_size)
]

# One record per (liver marker, trait subset); the subset is stored as a
# comma-separated string so it fits in a single column.
records = [
    Record(','.join(subset), marker, get_r2(list(subset), marker))
    for marker in liver_markers
    for subset in feature_subsets
]
r2_df = pd.DataFrame(records)
r2_df

Unnamed: 0,features,target,r2
0,CG,AST,0.059303
1,TB,AST,0.044184
2,TF,AST,0.166243
3,CA3+CA4,AST,0.064718
4,"CG,TB",AST,0.06114
5,"CG,TF",AST,0.164069
6,"CG,CA3+CA4",AST,0.200277
7,"TB,TF",AST,0.181767
8,"TB,CA3+CA4",AST,0.153155
9,"TF,CA3+CA4",AST,0.239153


In [9]:
# Write the R2 table to the first Snakemake output; the integer row index is omitted.
r2_df.to_csv(path_or_buf=snakemake.output[0], index=False)

Unnamed: 0,features,target,r2
0,CG,AST,0.059303
1,TB,AST,0.044184
2,TF,AST,0.166243
3,CA3+CA4,AST,0.064718
4,"CG,TB",AST,0.06114
5,"CG,TF",AST,0.164069
6,"CG,CA3+CA4",AST,0.200277
7,"TB,TF",AST,0.181767
8,"TB,CA3+CA4",AST,0.153155
9,"TF,CA3+CA4",AST,0.239153


In [9]:
# For each liver marker, fit a linear model on all traits and collect the
# 10-fold cross-validated predictions next to the observed values.
pred_dfs = [
    pd.DataFrame(
        {
            'true': data[marker],
            'pred': cross_val_predict(LinearRegression(), data[traits], data[marker], cv=10),
        },
        index=data.index,
    ).assign(target=marker)  # tag the rows with the marker they belong to
    for marker in liver_markers
]

# Stack the per-marker frames into one long table.
pred_df = pd.concat(pred_dfs)
pred_df

Unnamed: 0_level_0,true,pred,target
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S1,3.871201,3.362533,AST
S100,2.944439,3.235792,AST
S101,3.465736,3.071589,AST
S104,4.584967,3.506448,AST
S105,3.091042,3.244994,AST
...,...,...,...
S95,3.044522,3.524010,GGT
S96,4.060443,4.164919,GGT
S97,3.178054,3.550675,GGT
S98,5.043425,4.548328,GGT


In [None]:
# Write the true/predicted table to the second Snakemake output; the index is kept.
pred_df.to_csv(path_or_buf=snakemake.output[1])