In [47]:
import pandas as pd
import numpy as np


## Paper data preparation

In [57]:
# Load the published per-pair classification and keep only the raw pair name
# ("name") and its class label ("class").
paper_data = pd.read_csv("all_muts_classes_updated.csv")
paper_data = paper_data[["name", "class"]]
paper_data.columns = ["molecular_mutant", "trend"]

# Drug code used as the suffix of the raw pair name -> full drug name.
drug_mapping = {
    "enz": "Enzalutamide",
    "bic": "Bicalutamide",
    "arn": "Apalutamide",
    "flu": "hydroxyflutamide",
    "odm": "Darolutamide",
    "566": "VPC-13566",
    "789": "VPC-13789",
}

# Raw names have the form "<mutant>_<drug code>". Split on the LAST underscore
# only, so that any underscore inside the mutant part is preserved.
name_parts = paper_data["molecular_mutant"].str.rsplit("_", n=1)
paper_data["drug"] = name_parts.str[1].map(drug_mapping)
paper_data["mutant"] = name_parts.str[0].str.upper()

# Fail fast on names whose drug code is missing or not in drug_mapping, rather
# than letting them travel on as NaN (and as the literal label "nan_<MUTANT>").
unmapped = paper_data["drug"].isna()
if unmapped.any():
    raise ValueError(
        "Unrecognised drug code in pair name(s): "
        + ", ".join(paper_data.loc[unmapped, "molecular_mutant"].astype(str).unique())
    )

# Rebuild the pair label as "<full drug name>_<MUTANT>".
paper_data["molecular_mutant"] = paper_data["drug"] + "_" + paper_data["mutant"]

# Drop the VPC compounds. The explicit copy makes the filtered frame
# independent of the unfiltered one before columns are reassigned below.
# The original row index is intentionally kept (not reset).
paper_data = paper_data[~paper_data["drug"].str.startswith("VPC")].copy()

# Trend codes in the source file: 1 = down, 2 = up, 0 = flat, 3 = mixed.
# Fold "flat" (0) into 3, so the coding used from here on is:
# 1 = down, 2 = up, 3 = flat or mixed.
paper_data["trend"] = paper_data["trend"].replace(0, 3)

In [58]:
# Display the prepared paper data: pair label, trend code, drug and mutant.
paper_data

Unnamed: 0,molecular_mutant,trend,drug,mutant
0,Bicalutamide_A587V,3,Bicalutamide,A587V
1,Bicalutamide_A749V,3,Bicalutamide,A749V
2,Bicalutamide_A588S,3,Bicalutamide,A588S
3,Bicalutamide_A722T,1,Bicalutamide,A722T
4,Bicalutamide_A749T,3,Bicalutamide,A749T
...,...,...,...,...
273,hydroxyflutamide_W742L,3,hydroxyflutamide,W742L
274,Darolutamide_W742L,1,Darolutamide,W742L
277,Apalutamide_WT,1,Apalutamide,WT
278,hydroxyflutamide_WT,3,hydroxyflutamide,WT


## In-house data preparation

In [59]:
from datapoints_jane import *

# Seven dose values, listed in decreasing order.
# NOTE(review): these look like log10 concentrations of a 3-fold serial
# dilution starting at 50 (10**1.698970004 ~= 50, and consecutive entries
# differ by ~log10(3) = 0.477). Units are not stated here, and the list is not
# referenced by the code visible in this section - confirm that the entries
# correspond to dose levels D1..D7 before relying on it.
dose_value = [
    1.698970004,
    1.22184875,
    0.744727495,
    0.26760624,
    -0.209515015,
    -0.686636269,
    -1.163757524,
]

def remove_outlier(test):
    """Flatten ``test`` and keep the values strictly inside its 10%-90% quantile band.

    Parameters
    ----------
    test : sequence of array-likes (or scalars)
        Stacked into one 1-D array with ``np.hstack``.

    Returns
    -------
    list
        Values ``v`` with ``q10 < v < q90`` (both bounds exclusive), in their
        original order.
    """
    flat = np.hstack(test)
    lower_bound = np.quantile(flat, 0.1)
    upper_bound = np.quantile(flat, 0.9)
    inside_band = (flat > lower_bound) & (flat < upper_bound)
    return flat[inside_band].tolist()

def reject_outliers(test, m=2.0):
    """Drop values that lie far from the median, measured in units of the MAD.

    ``test`` is flattened with ``np.hstack``. Each value's absolute deviation
    from the median is divided by the median of those absolute deviations
    (MAD); values whose scaled deviation is ``< m`` are kept.

    When the MAD is zero the scaled deviations are all treated as 0, so every
    value is kept (for any positive ``m``).

    Returns
    -------
    numpy.ndarray
        The retained values, in their original order.
    """
    values = np.hstack(test)
    abs_deviation = np.abs(values - np.median(values))
    mad = np.median(abs_deviation)
    if mad:
        scaled = abs_deviation / mad
    else:
        # Zero spread: nothing can be called an outlier.
        scaled = np.zeros(len(abs_deviation))
    return values[scaled < m]

def cal_avg(
    L702_D1,
    L702_D2,
    L702_D3,
    L702_D4,
    L702_D5,
    L702_D6,
    L702_D7,
    drug_name,
    mutant_name,
):
    """Average the outlier-filtered readings of each of seven dose levels.

    The seven ``L702_D*`` arguments are the readings for dose levels 1..7.
    Despite the ``L702`` prefix in the parameter names, the function is called
    for every mutant, not just L702H.

    Each dose's readings are passed through ``reject_outliers`` and then
    averaged with ``np.average``.

    Returns
    -------
    dict
        A single-item dict ``{"<drug_name>+<mutant_name>": [avg_D1, ..., avg_D7]}``.
    """
    readings_per_dose = (
        L702_D1,
        L702_D2,
        L702_D3,
        L702_D4,
        L702_D5,
        L702_D6,
        L702_D7,
    )
    average_per_dose = [
        np.average(reject_outliers(readings)) for readings in readings_per_dose
    ]
    pair_label = drug_name + "+" + mutant_name
    return {pair_label: average_per_dose}

def trend_reg(l2):
    """Label each transposed row of ``l2`` as "up", "down" or "mixed".

    ``l2`` is a sequence of lists. It is loaded into a DataFrame and
    transposed, so row ``k`` of the result collects the ``k``-th element of
    every inner list, in the order the inner lists were given. Positions
    missing from shorter inner lists are filled with 0 before comparison.
    (Presumably each inner list holds the readings of one dose level, so each
    row is one reading position followed across the doses - confirm with the
    caller.)

    Labels, one per row:
      * "up"    - the row equals its descending sort (values never increase
                  from first to last; a constant row also lands here);
      * "down"  - the row equals its ascending sort;
      * "mixed" - anything else.

    This is the same labelling convention as the classification loop applied
    to the averaged series in this notebook.

    Returns
    -------
    list of str
        The labels, in row order. (Previously the list was built but never
        returned, so the function always returned ``None``.)
    """
    rows = pd.DataFrame(l2).transpose().fillna(0).values.tolist()
    res = []
    for li in rows:
        if sorted(li, reverse=True) == li:
            res.append("up")
        elif sorted(li) == li:
            res.append("down")
        else:
            res.append("mixed")
    return res

# Drugs, mutants and dose levels covered by the in-house measurements.
drugs = ["Enzalutamide", "Bicalutamide", "Darolutamide"]
mutants = [
    "L702H", "V716M", "W742L", "W742C", "H875Y", "F877L",
    "T878A", "T878S", "M896T", "H875Y_T878A", "F877L_T878A",
    "D891H_T878A",
]
doses = ["D1", "D2", "D3", "D4", "D5", "D6", "D7"]

# One {"<drug>+<mutant>": [average per dose]} dict per (mutant, drug) pair.
# The readings are looked up by name in the module namespace as
# "<mutant>_<drug>_<dose>"; presumably they are provided by the star import
# from datapoints_jane - a missing name raises KeyError here.
final = []
for mutant_name in mutants:
    for drug_name in drugs:
        readings_per_dose = [
            globals()[mutant_name + "_" + drug_name + "_" + dose] for dose in doses
        ]
        final.append(cal_avg(*readings_per_dose, drug_name, mutant_name))

# Classify every averaged dose series. The series is ordered D1..D7:
#   "up"    - it equals its descending sort (values never increase D1 -> D7),
#   "down"  - it equals its ascending sort,
#   "mixed" - neither.
# Each result is stored as a single-item dict {"<drug>+<mutant>": label}.
final_pattern = []
for averaged in final:
    for pair_label, series in averaged.items():
        if sorted(series, reverse=True) == series:
            label = "up"
        elif sorted(series) == series:
            label = "down"
        else:
            label = "mixed"
        final_pattern.append({pair_label: label})

# Numeric trend coding shared with the paper data: 1 = down, 2 = up, 3 = mixed.
trend_mapping = {'down': 1, 'up': 2, 'mixed': 3}

# Unpack the single-item dicts {"<drug>+<mutant>": label} into parallel lists.
pair_labels = [next(iter(entry)) for entry in final_pattern]
trend_labels = [next(iter(entry.values())) for entry in final_pattern]

df = pd.DataFrame()
df['pair'] = pair_labels
# Split on the first "+" only: left part is the drug, right part the mutant.
df[['drug', 'mutant']] = df['pair'].str.split('+', n=1, expand=True)
df['trend'] = trend_labels
# Group rows by drug; the original positional index is kept.
df = df.sort_values('drug')
df['trend'] = df['trend'].map(trend_mapping)
df

Unnamed: 0,pair,drug,mutant,trend
34,Bicalutamide+D891H_T878A,Bicalutamide,D891H_T878A,3
1,Bicalutamide+L702H,Bicalutamide,L702H,3
19,Bicalutamide+T878A,Bicalutamide,T878A,3
31,Bicalutamide+F877L_T878A,Bicalutamide,F877L_T878A,1
4,Bicalutamide+V716M,Bicalutamide,V716M,3
28,Bicalutamide+H875Y_T878A,Bicalutamide,H875Y_T878A,3
7,Bicalutamide+W742L,Bicalutamide,W742L,3
16,Bicalutamide+F877L,Bicalutamide,F877L,3
25,Bicalutamide+M896T,Bicalutamide,M896T,3
10,Bicalutamide+W742C,Bicalutamide,W742C,3


## merge datasets

In [63]:

# Outer-join the paper data with the in-house data on (drug, mutant), so pairs
# present in only one of the two sources are kept. The clashing 'trend'
# columns become 'trend_df1' (paper) and 'trend_df2' (in-house).
merged_df = paper_data.merge(
    df,
    how='outer',
    on=['drug', 'mutant'],
    suffixes=('_df1', '_df2'),
)

# Prefer the in-house trend; fall back to the paper trend where there is no
# in-house value. Because unmatched rows introduce NaN, the resulting trend
# codes are floats (e.g. 3.0), as seen in the displayed table.
merged_df['trend'] = merged_df['trend_df2'].fillna(merged_df['trend_df1'])

# Keep only the key columns and the resolved trend.
merged_df = merged_df.loc[:, ['drug', 'mutant', 'trend']]

# Persist the merged table (the row index is written as the first column),
# then display it.
merged_df.to_csv("merged_dataset.csv")
merged_df

Unnamed: 0,drug,mutant,trend
0,Bicalutamide,A587V,3.0
1,Bicalutamide,A749V,3.0
2,Bicalutamide,A588S,3.0
3,Bicalutamide,A722T,1.0
4,Bicalutamide,A749T,3.0
...,...,...,...
233,Bicalutamide,F877L_T878A,1.0
234,Darolutamide,F877L_T878A,3.0
235,Darolutamide,D891H_T878A,1.0
236,Enzalutamide,D891H_T878A,3.0
