In [1]:
import os, glob
import numpy as np
import pandas as pd
import math
import pickle
import matplotlib.pyplot as plt
#from scipy.stats import norm
# from scipy import stats
# from statsmodels.stats.weightstats import ztest
from IPython.display import display, HTML

In [2]:
# Run configuration. Both values are interpolated into the input and output
# directory paths built by the per-TFBS processing loop.
non_coding_region = "TFBS"
cancer_type = "Brain"

In [3]:
# Names processed one after another by the per-TFBS loop; each one is used as
# a directory component of the data paths.
TFBS_LIST = [
    'CBFA2T3',
    'CEBPG',
    'CREB1',
    'E2F4',
]

In [4]:
def log_oods_ratio(p1, p2, eps=None):
    """Return log2(odds(p1)) - log2(odds(p2)), where odds(p) = p / (1 - p).

    Parameters
    ----------
    p1, p2 : float
        Probabilities in the closed interval [0, 1].
    eps : float, optional
        When given, both probabilities are first clamped to [eps, 1 - eps] so
        the result is always finite. When omitted (the default) values strictly
        inside (0, 1) are computed exactly as before, and the boundaries map to
        their mathematical limits.

    Returns
    -------
    float
        The base-2 log odds ratio. Without ``eps`` a probability of exactly 0
        contributes -inf and a probability of exactly 1 contributes +inf, so
        the result can be +/-inf, or NaN when both terms are the same infinity.

    Raises
    ------
    ValueError
        If a probability lies outside [0, 1].
    """
    def _log2_odds(p):
        # Base-2 log odds of a single probability, defined at the boundaries.
        if p < 0 or p > 1:
            raise ValueError(f"probability must lie in [0, 1], got {p!r}")
        if eps is not None:
            p = min(max(p, eps), 1 - eps)
        # A saturated model output (exactly 0.0 or 1.0) used to abort the whole
        # run with a math domain error / division by zero; return the limit.
        if p == 0:
            return -math.inf
        if p == 1:
            return math.inf
        return math.log2(p / (1 - p))

    term1 = _log2_odds(p1)
    term2 = _log2_odds(p2)
    return (term1 - term2)
def calculate_diff_probab(p1, p2):
    """Return the signed change (p2 - p1) multiplied by the larger of p1 and p2."""
    delta = p2 - p1
    # Same selection rule as max(p1, p2): p1 wins unless p2 is strictly greater.
    larger = p2 if p2 > p1 else p1
    return delta * larger

In [5]:
# Columns that together identify one variant; used for de-duplication and as
# the parts of each output column name.
KEY_COLUMNS = [0, 1, 2, 'START_POS', 'END_POS', 'REF', 'ALT']
# Score column -> file name of the patient-by-variant table written for it.
SCORE_OUTPUTS = {
    'Ref_probab': "df_ref_score.tsv",
    'Alt_probab': "df_alt_score.tsv",
    'LogOddRatio': "df_log_odd_score.tsv",
    'ScoreChange': "df_score_change.tsv",
}
# A variant is kept only when the reference and alternative probabilities fall
# on opposite sides of this threshold.
DECISION_THRESHOLD = 0.5


def _score_patient_variants(variants, probabilities):
    """Attach prediction scores to one patient's variants and keep threshold crossings.

    ``probabilities`` is read as interleaved pairs: even positions are the
    reference score and odd positions the alternative score of the same variant.
    Returns the filtered frame, or None when the number of predictions is not
    exactly twice the number of de-duplicated variants.
    """
    variants = variants.drop_duplicates(subset=KEY_COLUMNS).reset_index(drop=True)
    if 2 * len(variants) != len(probabilities):
        return None
    variants['Ref_probab'] = probabilities[::2]
    variants['Alt_probab'] = probabilities[1::2]
    # Plain comprehensions instead of DataFrame.apply(axis=1): apply returns a
    # DataFrame (not a Series) for a zero-row frame, which cannot be assigned
    # to a single column.
    score_pairs = list(zip(variants['Ref_probab'], variants['Alt_probab']))
    variants['ScoreChange'] = [calculate_diff_probab(ref, alt) for ref, alt in score_pairs]
    variants['LogOddRatio'] = [log_oods_ratio(ref, alt) for ref, alt in score_pairs]
    crosses_threshold = (
        ((variants['Ref_probab'] > DECISION_THRESHOLD) & (variants['Alt_probab'] <= DECISION_THRESHOLD))
        | ((variants['Ref_probab'] <= DECISION_THRESHOLD) & (variants['Alt_probab'] > DECISION_THRESHOLD))
    )
    return variants[crosses_threshold]


def _build_score_tables(patient_ids, patient_dfs):
    """Return {score column: DataFrame} with one row per patient and one column per variant.

    Columns are named ``<0>_<1>_<2>_<START_POS>_<END_POS>_<REF>_<ALT>`` and appear
    in order of first occurrence across ``patient_dfs``. A cell is NaN when the
    patient has no row for that variant.
    """
    unique_patient_regions = pd.concat(patient_dfs).drop_duplicates(subset=KEY_COLUMNS)
    region_keys = list(unique_patient_regions[KEY_COLUMNS].itertuples(index=False, name=None))
    column_names = ['_'.join(str(part) for part in key) for key in region_keys]
    tables = {score_column: {name: [] for name in column_names} for score_column in SCORE_OUTPUTS}

    for df in patient_dfs:
        # Index this patient's rows by variant key once, instead of evaluating a
        # seven-column boolean mask for every (variant, score) combination.
        # NOTE(review): unlike the mask, tuple lookup can match a missing value
        # stored as the same object (e.g. None) in a key column - confirm keys
        # are never missing.
        row_positions = {}
        for position, key in enumerate(df[KEY_COLUMNS].itertuples(index=False, name=None)):
            row_positions.setdefault(key, []).append(position)
        for score_column, table in tables.items():
            scores = df[score_column]
            best_score = {key: scores.iloc[positions].max() for key, positions in row_positions.items()}
            for key, name in zip(region_keys, column_names):
                table[name].append(best_score.get(key, np.nan))

    result = {}
    for score_column, table in tables.items():
        frame = pd.DataFrame(table, index=pd.RangeIndex(len(patient_ids)))
        frame.insert(0, "Patient_ID", patient_ids)
        result[score_column] = frame
    return result


for tfbs in TFBS_LIST:
    generated_root = f"/data/projects/GDC_Cancer_Wise/New_data/{cancer_type}/Generated_files"
    base_dir = f"{generated_root}/DNABERT_Data/{non_coding_region}/{tfbs}/Patient_wise"
    print(base_dir)
    output_dir = f"{generated_root}/Variant_Analysis/{non_coding_region}/{tfbs}"
    os.makedirs(output_dir, exist_ok=True)

    variants_path = f"{generated_root}/Intersected_Data/{non_coding_region}/{tfbs}/intersected_vcf_data.pkl"
    # SECURITY: pickle.load can execute arbitrary code; only load files produced
    # by this project's own pipeline.
    with open(variants_path, "rb") as file:
        loaded_dictionary = pickle.load(file)

    # Only the immediate sub-directories of base_dir are patients (the first
    # os.walk entry); a missing base_dir yields no patients.
    patient_dirs = next(os.walk(base_dir), (base_dir, [], []))[1]

    # patient_id and patient_dfs are appended together so they stay aligned;
    # a skipped patient must not leave an orphan id behind.
    patient_id = []
    patient_dfs = []
    for dir_name in patient_dirs:
        if dir_name not in loaded_dictionary:
            print(f"WARNING: skipping {dir_name}: no variants found in {variants_path}")
            continue
        concat_probab = np.load(os.path.join(base_dir, dir_name, "Prediction_result", "pred_results.npy"))
        df_temp = _score_patient_variants(loaded_dictionary[dir_name], concat_probab)
        if df_temp is None:
            # Report and move on instead of blocking the notebook on input().
            print(f"WARNING: skipping {dir_name}: {len(concat_probab)} predictions do not pair up with its variants")
            continue
        patient_id.append(dir_name)
        patient_dfs.append(df_temp)

    if not patient_dfs:
        print(f"WARNING: no usable patients for {tfbs}; nothing written to {output_dir}")
        continue

    for score_column, table in _build_score_tables(patient_id, patient_dfs).items():
        table.to_csv(os.path.join(output_dir, SCORE_OUTPUTS[score_column]), sep="\t", index=False)
    #input()

/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/TFBS/CBFA2T3/Patient_wise
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/TFBS/CEBPG/Patient_wise
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/TFBS/CREB1/Patient_wise
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/TFBS/E2F4/Patient_wise
