# Harmonize ONT reads with array reference

1. Converting CRAM to FASTQ (or basecall straight into fastq and skip this step):

    `samtools fastq -@ <num_threads> input.cram > output.fastq`

2. Minimap2 alignment:

    `minimap2 -ax lr:hq -t <num_threads> ../ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna input.fastq.gz > output.sam`

3. Converting SAM to BAM, sorting, and indexing:

    a. Convert SAM to BAM:

    `samtools view -@ <num_threads> -bS output.sam > output.bam`

    b. Sort BAM:

    `samtools sort -@ <num_threads> output.bam -o sorted_output.bam`

    c. Create BAM Index:

    `samtools index sorted_output.bam`

4. Modkit
    
    `modkit pileup uf_hembank_1852.bam uf_hembank_1852_pacmap.bed --combine-mods --no-filtering -t 32 --combine-strands --cpg --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna --include-bed pacmap_reference.bed `


## Where data at?

In [92]:
import numpy as np
import pandas as pd

# Root of the mounted data drive; the project paths below are built from it.
mount = '/mnt/d/'

# EPIC hg38 manifest location.
# NOTE(review): the manifest is actually read from a different hard-coded path further down,
# and this variable is not referenced again in the notebook — confirm whether it is still needed.
reference_path = mount + 'genome_references/Illumina_methylation_arrays/EPIC.hg38.manifest.tsv.gz'
# Directory that receives the embeddings and the final spreadsheet; the Cox-Lasso
# coefficient/CpG tables are also read from sub-folders of it.
output_path = mount + 'MethylScore_v2/Processed_Data/'

# Sample identifier: used as the row label of the processed data and as the prefix of every output file.
sample_name = 'uf_hembank_1852'

## Create BED6 file to harmonize probes from EPIC array with nanopore

In [150]:
# Load the discovery matrix (presumably batch-corrected beta-values, per the file name),
# sort its rows by index and drop its first column.
# Only its column labels (the CpG probes to keep) are used when building the BED file below.
# NOTE(review): read_pickle can execute arbitrary code while loading — only open trusted files.
df_discovery = pd.read_pickle(mount+'MethylScore_v2/Intermediate_Files/'+'3308samples_333059cpgs_withbatchcorrection_bvalues.pkl').sort_index().iloc[:,1:]

# Load the EPIC hg38 manifest, keeping only the CpG coordinates, the probe strand and the probe ID,
# then index it by probe ID (sorted) so probes can be looked up by label.
array_reference = pd.read_csv("/mnt/c/Users/fmarc/OneDrive/Desktop/nanopore_processed/ref/EPIC.hg38.manifest.gencode.v36.tsv.gz", sep='\t', compression='gzip',
                              usecols=['CpG_chrm','CpG_beg','CpG_end','probe_strand','probeID']
                              ).set_index('probeID').sort_index()

In [151]:
# Keep only the manifest rows for the probes that are columns of the discovery matrix,
# then turn the probe-ID index back into a regular column.
pacmap_reference = array_reference.loc[df_discovery.columns].reset_index()

# Cast the coordinates to integers so they are written without a trailing `.0`
pacmap_reference['CpG_beg'] = pacmap_reference['CpG_beg'].astype(int)
pacmap_reference['CpG_end'] = pacmap_reference['CpG_end'].astype(int)
# BED6 has a score field; a constant 0 is used as a placeholder
pacmap_reference['score'] = 0

# Disabled alternative coordinate / chromosome-name conversions (not executed):
# pacmap_reference['CpG_beg'] = pacmap_reference['CpG_beg'] - 1
# pacmap_reference['CpG_end'] = pacmap_reference['CpG_end'] - 1
# pacmap_reference['CpG_chrm'] = pacmap_reference['CpG_chrm'].str.replace('chr','')
# pacmap_reference['CpG_chrm'] = pacmap_reference['CpG_chrm'].str.replace('X','23')
# pacmap_reference['CpG_chrm'] = pacmap_reference['CpG_chrm'].str.replace('Y','24')
# pacmap_reference['CpG_chrm'] = pacmap_reference['CpG_chrm'].astype(str)

# Put the columns in BED6 order and rename them to the BED field names.
# NOTE(review): the probe-ID column is selected as 'IlmnID' although the manifest index is
# 'probeID'; presumably the columns axis of df_discovery is named 'IlmnID' — confirm.
pacmap_reference = pacmap_reference[['CpG_chrm','CpG_beg','CpG_end','IlmnID','score','probe_strand']]
pacmap_reference = pacmap_reference.rename(columns={'CpG_chrm':'chrom','CpG_beg':'chromStart','CpG_end':'chromEnd','IlmnID':'name','probe_strand':'strand'})

# sort by chromosome and start (the chromosome is compared as a string, so chr10 sorts before chr2)
pacmap_reference = pacmap_reference.sort_values(by=['chrom','chromStart'])

# Write a headerless, tab-separated BED file (the `pacmap_reference.bed` passed to
# `modkit pileup --include-bed` in the command-line steps above).
pacmap_reference.to_csv('/mnt/c/Users/fmarc/OneDrive/Desktop/nanopore_processed/ref/pacmap_reference.bed', sep='\t', index=False, header=False)

In [149]:
pacmap_reference

Unnamed: 0_level_0,chrom,chromStart,chromEnd,name,score,strand
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr1:69590,chr1,69590,69592,cg21870274,0,+
chr1:864702,chr1,864702,864704,cg08258224,0,-
chr1:870160,chr1,870160,870162,cg16619049,0,-
chr1:877158,chr1,877158,877160,cg18147296,0,-
chr1:898802,chr1,898802,898804,cg13938959,0,+
...,...,...,...,...,...,...
chr9:138119084,chr9,138119084,138119086,cg00378292,0,+
chr9:138120221,chr9,138120221,138120223,cg07982825,0,-
chr9:138122338,chr9,138122338,138122340,cg14491707,0,+
chr9:138122548,chr9,138122548,138122550,cg13811936,0,-


## Load modkit processed file

In [138]:
# Columns taken from each row of the modkit bedMethyl output (0-based field positions):
#   0 -> chrom, 1 -> start_position, 4 -> score, 10 -> fraction_modified
# `fraction_modified` is a percentage (0-100); it is divided by 100 further down.
usecols = [0, 1, 4, 10]
column_names = ["chrom", "start_position", "score", "fraction_modified"]

# Read the modkit pileup output, splitting fields on any run of whitespace.
# The separator is a raw string so that `\s` reaches the regex engine verbatim; in a normal
# string literal `\s` is an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
# NOTE(review): skiprows=1 discards the first line of the file. The `modkit pileup` command
# documented at the top of this notebook does not request a header, so this may be dropping
# the first data record — confirm against the actual file.
df = pd.read_csv('/mnt/c/Users/fmarc/OneDrive/Desktop/nanopore_processed/bed/uf_hembank_1852_pacmap.bed', sep=r'\s+', skiprows=1, usecols=usecols, names=column_names)

In [139]:

# Key every modkit row by "<chrom>:<start_position>" so it can be matched to the array probes
chrom_text = df['chrom'].astype(str)
start_text = df['start_position'].astype(str)
df['coordinate'] = chrom_text.str.cat(start_text, sep=':')

# Same rows, now addressed by that key
df_filtered = df.set_index('coordinate')

In [148]:
df_filtered

Unnamed: 0_level_0,chrom,start_position,score,fraction_modified
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr1:864703,chr1,864703,9,88.89
chr1:870161,chr1,870161,6,66.67
chr1:877159,chr1,877159,9,77.78
chr1:898802,chr1,898802,13,38.46
chr1:898803,chr1,898803,1,100.00
...,...,...,...,...
chr22:50737087,chr22,50737087,11,100.00
chr22:50737978,chr22,50737978,15,93.33
chr22:50738282,chr22,50738282,13,61.54
chr22:50739553,chr22,50739553,15,100.00


In [140]:
# Give the probe reference a "<chrom>:<chromStart>" key, built the same way as the key of the
# modkit rows, and use it as the index.
pacmap_reference = pacmap_reference.assign(
    coordinate=pacmap_reference['chrom'].astype(str) + ':' + pacmap_reference['chromStart'].astype(str)
).set_index('coordinate')

In [146]:
# Join with reference data on 'coordinate'.
# The outer join keeps coordinates found on either side: modkit-only rows get a missing `name`,
# and reference-only rows (probes with no modkit record at that exact start position) get
# missing methylation values.
df_merged = df_filtered.join(pacmap_reference[['name']], how='outer')

In [147]:
df_merged

Unnamed: 0_level_0,chrom,start_position,score,fraction_modified,name
coordinate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr10:100008943,chr10,100008943,6,50.00,cg14405603
chr10:100020168,chr10,100020168,14,85.71,cg01099300
chr10:100062255,chr10,100062255,8,87.50,cg25833003
chr10:100065360,chr10,100065360,6,83.33,cg19754520
chr10:100066954,chr10,100066954,10,80.00,cg22534585
...,...,...,...,...,...
chr9:99906442,chr9,99906442,9,0.00,cg21606237
chr9:99906832,chr9,99906832,11,0.00,cg03950009
chr9:99907330,chr9,99907330,11,9.09,cg14153061
chr9:99967624,chr9,99967624,9,100.00,cg14233861


In [123]:

# Interpolate missing values in 'fraction_modified' column linearly.
# Rows that exist only in the probe reference have no modkit value after the outer join;
# each gap is filled from the surrounding rows in the current row order.
# NOTE(review): the displayed join result is ordered by the coordinate *string* (chr10 ... chr9),
# so "surrounding rows" is not strictly genomic order — confirm this is intended.
df_merged['fraction_modified'] = df_merged['fraction_modified'].astype(float).interpolate(method='linear')


# Keep only coordinates that belong to an array probe and attach the BED columns to them.
# Anything still missing after the join is replaced with 50.
# NOTE(review): fillna(50) is applied to every column of the joined frame, not only to
# 'fraction_modified'.
df_merged = df_merged[['fraction_modified']].join(pacmap_reference, how='inner').fillna(50)

# Keep the first row for each probe name
df_merged = df_merged.drop_duplicates(subset='name')

# Convert the percentage into a 0-1 fraction rounded to 3 decimals, stored in a column named after the sample
df_merged.loc[:, sample_name] = (df_merged['fraction_modified'] / 100).round(3)

# Final layout: a single row (the sample) with one column per probe name
df_processed = df_merged[['name', sample_name]].set_index('name').T

## Apply PaCMAP model

In [124]:
import pacmap

def apply_pacmap_model_to_new_data(df, components):
    """Project new data into the embedding space of a saved PaCMAP model.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to project. Its values are converted to a float16 array before
        being handed to the reducer, and its index is reused for the result.
        Presumably the columns must match the features the saved model was
        fitted on (verify against the training notebook).
    components : int
        Number of embedding dimensions; selects the model stored at
        ``../models/pacmap_{components}d_model_al_atlas`` and the number of
        output columns.

    Returns
    -------
    pandas.DataFrame
        Embedding indexed like ``df`` with columns named
        ``'PaCMAP <i> of <components>'`` for ``i`` from 1 to ``components``.
    """
    # Restore the previously fitted reducer from disk
    model_path = f'../models/pacmap_{components}d_model_al_atlas'
    fitted_reducer = pacmap.load(model_path)

    # Map the new rows into the existing embedding
    feature_matrix = df.to_numpy(dtype='float16')
    projected = fitted_reducer.transform(feature_matrix)

    # One label per embedding dimension, numbered from 1
    column_labels = []
    for dimension in range(1, components + 1):
        column_labels.append('PaCMAP ' + str(dimension) + f' of {components}')

    return pd.DataFrame(projected, columns=column_labels, index=df.index)

# Project the processed sample with the saved 2-component and 5-component models
df_embedding_2d = apply_pacmap_model_to_new_data(df_processed, 2)
df_embedding_5d = apply_pacmap_model_to_new_data(df_processed, 5)

# Persist both embeddings in the output directory, named after the sample
df_embedding_2d.to_pickle(output_path + sample_name + '_pacmap_2d.pkl')
df_embedding_5d.to_pickle(output_path + sample_name + '_pacmap_5d.pkl')

## Apply supervised models

In [125]:
import joblib

# Load models (used below for 'AML Epigenomic Risk' and 'AL Epigenomic Phenotype' respectively).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code — load trusted model files only.
lgbm_px_model = joblib.load('../models/lgbm_px_model.pkl')
lgbm_dx_model = joblib.load('../models/lgbm_dx_model.pkl')

# load `df_embedding_5d` from the previous step (the pickle written right after the PaCMAP projection)
df_embedding_5d = pd.read_pickle(output_path + sample_name + '_pacmap_5d.pkl')

def save_predictions(df, classifier, model_name):
    """Predict class labels and class probabilities for every row of ``df``.

    Despite its name, this function writes nothing to disk; it only builds
    and returns the prediction table.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix; a copy of it is passed to the classifier.
    classifier : object
        Fitted classifier exposing ``predict``, ``predict_proba`` and
        ``classes_`` (scikit-learn style).
    model_name : str
        Name of the predicted-label column and prefix of the
        ``<model_name>_int`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with, in order: the predicted label
        (``model_name``), one ``P(<class>)`` column per entry of
        ``classifier.classes_`` rounded to 3 decimals, and
        ``<model_name>_int`` holding the position of the predicted label
        within ``classifier.classes_``.
    """
    import warnings

    # Work on a copy so the caller's frame is never touched by the classifier
    df_features = df.copy()

    # Silence library warnings only while predicting. A bare
    # warnings.filterwarnings('ignore') would switch off every warning for
    # the remainder of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        predictions = classifier.predict(df_features)
        probabilities = classifier.predict_proba(df_features)

    class_labels = list(classifier.classes_)

    # Predicted label per row, aligned with the input index
    predictions_series = pd.Series(predictions, index=df_features.index, name=model_name)

    # One "P(<class>)" column per class. Building the names with f-strings
    # also works when the class labels are not strings (e.g. integers).
    probabilities_df = pd.DataFrame(
        probabilities,
        index=df_features.index,
        columns=[f'P({label})' for label in class_labels],
    ).round(3)

    # Integer code of the predicted class = its position in classifier.classes_
    class_to_int = {label: position for position, label in enumerate(class_labels)}
    probabilities_df[model_name + '_int'] = predictions_series.map(class_to_int)

    # Predicted label first, then the probability columns and the integer code
    return predictions_series.to_frame().join(probabilities_df)

# Execution: run both classifiers on the 5-component embedding
df_pred_px = save_predictions(df=df_embedding_5d, classifier=lgbm_px_model, model_name='AML Epigenomic Risk')
df_pred_dx = save_predictions(df=df_embedding_5d, classifier=lgbm_dx_model, model_name='AL Epigenomic Phenotype')

# Map the classes to more desirable labels (low and high risk)
# and rename the matching probability columns accordingly.
df_pred_px['AML Epigenomic Risk'] = df_pred_px['AML Epigenomic Risk'].map({'Alive': 'Low', 'Dead': 'High'})
df_pred_px = df_pred_px.rename(columns={'P(Alive)': 'AML Epigenomic Risk P(Low Risk)', 'P(Dead)': 'AML Epigenomic Risk P(High Risk)'})

# Combine the 2-D embedding, the 5-D embedding and both prediction tables on the sample index
df_combined = df_embedding_2d.join(df_embedding_5d).join(df_pred_px).join(df_pred_dx)

# Show the key results; the last column is the probability of whichever phenotype was predicted.
# `.item()` only works when df_combined holds exactly one row (a single sample).
df_combined[['AML Epigenomic Risk', 'AML Epigenomic Risk P(High Risk)', 'AL Epigenomic Phenotype', f'P({df_combined["AL Epigenomic Phenotype"].item()})']]

Unnamed: 0,AML Epigenomic Risk,AML Epigenomic Risk P(High Risk),AL Epigenomic Phenotype,P(AML with inv(16); t(16;16); CBFB::MYH11)
uf_hembank_1852,High,0.501,AML with inv(16); t(16;16); CBFB::MYH11,0.636


## EWASCox-Lasso

In [126]:
import math
import sys
sys.path.append('../')
from source.cox_lasso import *

# Raw Cox-Lasso coefficients saved by an earlier step, indexed by the first CSV column
raw_coefs = pd.read_csv(output_path + 'multivariate_cox_lasso/ewas_cog_os_raw_coefs_newrisk.csv', index_col=0)

# `set_cutoff` comes from source.cox_lasso (star import).
# NOTE(review): presumably it keeps the CpGs passing the 0.99 threshold and returns their
# mean coefficients indexed by CpG — confirm against the helper.
mean_coefs = set_cutoff(coefs=raw_coefs,threshold=0.99)

# Restrict the nanopore sample to the CpGs listed in the index of `mean_coefs`
df_validation = df_processed[mean_coefs.index]

# Replace exact 1 and 0 by 0.999 and 0.001 so that log2(val/(1-val)) in beta2m stays finite
df_validation_transformed = df_validation.replace(1, 0.999).replace(0, 0.001)

def beta2m(val):
    """Convert a methylation beta-value into an M-value.

    Returns ``log2(val / (1 - val))``. ``val`` must lie strictly between
    0 and 1: a value of 1 divides by zero and a value of 0 has no logarithm.
    """
    odds = val / (1 - val)
    return math.log2(odds)

# Apply beta2m element-wise to every column of the clipped beta-value table to obtain M-values
x_test_m = df_validation_transformed.apply(np.vectorize(beta2m))

def standardize_data(df, reference_df):
    """Z-score ``df`` column by column using statistics of a reference dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to standardize.
    reference_df : pandas.DataFrame
        Dataset supplying the per-column mean and standard deviation. It is
        subset to the columns of ``df``, so every column of ``df`` has to be
        present in it.

    Returns
    -------
    pandas.DataFrame
        ``(df - reference mean) / reference std`` for each column of ``df``.
    """
    # Reference restricted to (and ordered like) the columns being standardized
    matched_reference = reference_df.loc[:, df.columns]

    column_means = matched_reference.mean()
    column_stds = matched_reference.std()

    return (df - column_means) / column_stds

# Read top CpGs selected from previous code file (univariate cox-ph EWAS)
# NOTE(review): this table supplies the per-column mean/std used to standardize the sample, so it
# presumably holds reference-cohort M-values with CpGs as columns — confirm.
ewas_top_cpgs = pd.read_csv(output_path+'ewas_dmr/ewas_top_cpgs_os.csv', index_col=0)

# Standardize data (z-score of the sample's M-values against the reference table)
x_test_m_z = standardize_data(df= x_test_m, reference_df= ewas_top_cpgs)

score_name = 'EWASCox_OS_48CpGs'

# `generate_coxph_score` comes from source.cox_lasso (star import).
# 0.4934 is the cut-off echoed in the cell output ("Continuous score cut at the value of 0.4934").
# NOTE(review): the score columns are read back from `df_validation_transformed` below, so the
# helper appears to add them to the frame passed as `df` — confirm against the helper.
df_test, threshold = generate_coxph_score(coef_mean=mean_coefs,
                                        x=x_test_m_z,
                                        df=df_validation_transformed,
                                        score_name=score_name,
                                        train_test=0.4934,
                                        rpart_outcome='os.time')

# Display the continuous score and its categorical call for the sample
df_validation_transformed[['EWASCox_OS_48CpGs','EWASCox_OS_48CpGs Categorical']]

Continuous score cut at the value of 0.4934


name,EWASCox_OS_48CpGs,EWASCox_OS_48CpGs Categorical
uf_hembank_1852,1.451607,High


## Save results

In [127]:
# Add the two Cox score columns to the embedding/prediction row of the sample
df_nanopore = df_combined.join(df_validation_transformed[['EWASCox_OS_48CpGs','EWASCox_OS_48CpGs Categorical']])

# Descriptive metadata for this sample.
# NOTE(review): presumably these column names mirror the reference-cohort table so the rows can be
# combined later — confirm.
df_nanopore['Train-Test'] = 'Long-read Nanopore sequencing'
df_nanopore['Clinical Trial'] = 'UF Hem Bank'
df_nanopore['Patient_ID'] = sample_name
# Clinical annotation columns are created but left as NaN
df_nanopore['Hematopoietic Entity'] = np.nan
df_nanopore['WHO 2022 Diagnosis'] =  np.nan
df_nanopore['Vital Status'] = np.nan
df_nanopore['Risk Group AAML1831'] = np.nan

# Write the combined single-sample table to an Excel workbook in the output directory
df_nanopore.to_excel(output_path + sample_name + '_processed.xlsx')

# print save message
print(f'Processed data for {sample_name} saved as {output_path + sample_name + "_processed.xlsx"}')

Processed data for uf_hembank_1852 saved as /mnt/d/MethylScore_v2/Processed_Data/uf_hembank_1852_processed.xlsx


## Watermark

In [88]:
%load_ext watermark

In [90]:
# watermark with all libraries used in this notebook
%watermark -v -p numpy,pandas,pacmap,sklearn,lightgbm -a Francisco_Marchi@Lamba_Lab_UF -d -m

Author: Francisco_Marchi@Lamba_Lab_UF

Python implementation: CPython
Python version       : 3.8.16
IPython version      : 8.12.3

numpy   : 1.24.4
pandas  : 2.0.3
pacmap  : 0.7.0
sklearn : 1.2.2
lightgbm: 3.3.5

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.133.1-microsoft-standard-WSL2
Machine     : x86_64
Processor   : x86_64
CPU cores   : 6
Architecture: 64bit



```{note}
Please only use the following versions:
`python`: 3.8.16
`pacmap`: 0.7.0
`lightgbm`: 3.3.5
`scikit-learn`: 1.2.2
```