# Test Dataset (Work in Progress)

1. Basecall and align using Dorado and minimap2:

    ```bash
    ~/dorado-0.7.3-linux-x64/bin/dorado basecaller hac,5mCG_5hmCG uf1848/20240905_1712_P2S-02395-A_PAW87958_2d796b97/pod5/ --reference ref/hg38.fna
    --trim adapters > uf1848/uf1848_hac5_unsorted.bam
    ```

2. Sort and index bam:

    ```bash
    samtools sort --write-index -o uf1857_hac430.bam -O BAM --reference ref/hg38.fna uf1857/uf1857_hac430_unsorted.bam
    ```

3. Create a BedMethyl file with Modkit
    
    ```bash
    modkit pileup bam/uf1829.bam bed/uf1829.bed -t 32 --combine-strands --cpg --ignore h --ref ref/hg38.fna --no-filtering --include-bed bed/pacmap_reference.bed
    ```

## Where the data at?

In [1]:
import pandas as pd

mount = '/mnt/e/'
input_path = mount + 'Intermediate_Files/'
output_path = mount + 'Processed_Files/'
zhou2016_probes_path = mount + 'UnreliableProbesList_Zhou2016/EPIC.anno.GRCh38.tsv'

## Create BED6 file to harmonize probes from EPIC array with nanopore

In [2]:
# # read df_discovery and df_validation
# df_discovery = pd.read_pickle(
#     input_path+'3314samples_331557cpgs_withbatchcorrection_bvalues.pkl').sort_index().iloc[:,1:]

# # Load the list of suboptimal probes
# array_reference = pd.read_csv(zhou2016_probes_path, sep='\t',index_col=0)

# pacmap_reference = array_reference.loc[df_discovery.columns].reset_index()

# # change values in `orientation` column to match the orientation of the probes in the discovery dataset
# pacmap_reference['orientation'] = pacmap_reference['orientation'].map({'down': '-', 'up': '+'})
# pacmap_reference['score'] = 0

# # Rename `IlmnID` to `name` to follow BED standard
# pacmap_reference = pacmap_reference.rename(columns={'IlmnID': 'name', 'orientation': 'strand'})

# # Order columns
# pacmap_reference = pacmap_reference[['chrm', 'start', 'end', 'name', 'score', 'strand', ]]

# # Sort by `chrm` and `start`
# pacmap_reference = pacmap_reference.sort_values(by=['chrm', 'start'])

# pacmap_reference.to_csv('../data/pacmap_reference.bed', sep='\t', index=False, header=False)

pacmap_reference = pd.read_csv('../data/pacmap_reference.bed', sep='\t',  names=['chrm', 'start', 'end', 'name', 'score', 'strand'])
pacmap_reference

Unnamed: 0,chrm,start,end,name,score,strand
0,chr1,69590,69592,cg21870274,0,+
1,chr1,864702,864704,cg08258224,0,-
2,chr1,870160,870162,cg16619049,0,-
3,chr1,877158,877160,cg18147296,0,-
4,chr1,898802,898804,cg13938959,0,+
...,...,...,...,...,...,...
331552,chr9,138119084,138119086,cg00378292,0,+
331553,chr9,138120221,138120223,cg07982825,0,-
331554,chr9,138122338,138122340,cg14491707,0,+
331555,chr9,138122548,138122550,cg13811936,0,-


## Where data at?

In [5]:
import pandas as pd
from pathlib import Path

# Constants
MOUNT = Path('/mnt/e/')
SAMPLE_NAME = 'uf1837'
PACMAP_REFERENCE_PATH = Path('../data/pacmap_reference.bed')
NANOPORE_PROCESSED_PATH = MOUNT / 'nanopore_processed/bed'

# Column names
BED_COLUMNS = [
    "chrom", "start_position", "end_position", "modified_base_code", "score",
    "strand", "start_position2", "end_position2", "color", "Nvalid_cov",
    "fraction_modified", "Nmod", "Ncanonical", "Nother_mod", "Ndelete",
    "Nfail", "Ndiff", "Nnocall"
]

def read_pacmap_reference(file_path):
    return pd.read_csv(
        file_path, 
        sep='\t', 
        names=['chrm', 'start', 'end', 'name', 'score', 'strand']
    )

def read_sample_data(file_path):
    return pd.read_csv(file_path, sep='\t', names=BED_COLUMNS)

def create_coordinate_column(df, chrom_col, start_col):
    return df[chrom_col].astype(str) + ':' + df[start_col].astype(str)

def process_data(pacmap_ref, sample_df, sample_name):
    # Filter and set index for sample data
    sample_filtered = sample_df[sample_df['modified_base_code'] == 'm'].set_index('coordinate')
    
    # Merge data
    merged = pacmap_ref[['name']].join(sample_filtered, how='inner')
    
    # Calculate beta values and prepare final DataFrame
    processed = merged[['name', 'fraction_modified']].copy()
    processed[sample_name] = (processed['fraction_modified'] / 100).round(3)
    processed = processed[['name', sample_name]].set_index('name').T
    
    return processed.sort_index(axis=1)

def main():
    # Read data
    pacmap_reference = read_pacmap_reference(PACMAP_REFERENCE_PATH)
    sample_df = read_sample_data(NANOPORE_PROCESSED_PATH / f'{SAMPLE_NAME}.bed')
    
    # Create coordinate columns
    pacmap_reference['coordinate'] = create_coordinate_column(pacmap_reference, 'chrm', 'start')
    sample_df['coordinate'] = create_coordinate_column(sample_df, 'chrom', 'start_position')
    
    # Set index for pacmap_reference
    pacmap_reference.set_index('coordinate', inplace=True)
    
    # Process data
    result = process_data(pacmap_reference, sample_df, SAMPLE_NAME)
    
    # Save result as pickle file
    # output_path = NANOPORE_PROCESSED_PATH / f'{SAMPLE_NAME}_processed.pkl'
    # result.to_pickle(output_path)
    
    return result

# Run the main function
result = main()
result  # This will display the result in the notebook

name,cg00000109,cg00000236,cg00000292,cg00000363,cg00000622,cg00000658,cg00000714,cg00000721,cg00000734,cg00000769,...,cg27665648,cg27665659,cg27665715,cg27665767,cg27665769,cg27665860,cg27665925,cg27665985,cg27666046,cg27666123
uf1837,1.0,0.875,1.0,0.714,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.4,1.0,0.833,0.0,1.0,0.0,0.625,0.143,0.667


In [7]:
import pandas as pd
from pathlib import Path

# Constants
MOUNT = Path('/mnt/e/')
PACMAP_REFERENCE_PATH = Path('../data/pacmap_reference.bed')
NANOPORE_PROCESSED_PATH = MOUNT / 'nanopore_processed/bed'

# Column names
BED_COLUMNS = [
    "chrom", "start_position", "end_position", "modified_base_code", "score",
    "strand", "start_position2", "end_position2", "color", "Nvalid_cov",
    "fraction_modified", "Nmod", "Ncanonical", "Nother_mod", "Ndelete",
    "Nfail", "Ndiff", "Nnocall"
]

def read_pacmap_reference(file_path):
    return pd.read_csv(
        file_path, 
        sep='\t', 
        names=['chrm', 'start', 'end', 'name', 'score', 'strand']
    )

def read_sample_data(file_path):
    return pd.read_csv(file_path, sep='\t', names=BED_COLUMNS)

def create_coordinate_column(df, chrom_col, start_col):
    return df[chrom_col].astype(str) + ':' + df[start_col].astype(str)

def process_sample(pacmap_ref, sample_df, sample_name):
    # Filter and set index for sample data
    sample_filtered = sample_df[sample_df['modified_base_code'] == 'm'].set_index('coordinate')
    
    # Merge data
    merged = pacmap_ref[['name']].join(sample_filtered, how='inner')
    
    # Calculate beta values and prepare final Series
    beta_values = (merged['fraction_modified'] / 100).round(3)
    beta_values.index = merged['name']
    beta_values.name = sample_name
    
    return beta_values

def process_directory(directory_path, pacmap_reference):
    results = []
    
    for bed_file in directory_path.glob('*.bed'):
        sample_name = bed_file.stem  # Use filename without extension as sample name
        sample_df = read_sample_data(bed_file)
        
        # Create coordinate column
        sample_df['coordinate'] = create_coordinate_column(sample_df, 'chrom', 'start_position')
        
        # Process sample
        result = process_sample(pacmap_reference, sample_df, sample_name)
        results.append(result)
    
    # Concatenate all results
    return pd.concat(results, axis=1).T

def main():
    # Read pacmap reference
    pacmap_reference = read_pacmap_reference(PACMAP_REFERENCE_PATH)
    pacmap_reference['coordinate'] = create_coordinate_column(pacmap_reference, 'chrm', 'start')
    pacmap_reference.set_index('coordinate', inplace=True)
    
    # Process all samples in the directory
    result = process_directory(NANOPORE_PROCESSED_PATH, pacmap_reference)
    
    # Save result as pickle file
    output_path = NANOPORE_PROCESSED_PATH / 'all_samples_processed.pkl'
    result.to_pickle(output_path)
    
    return result

# Run the main function
result = main()
result  # This will display the result in the notebook

name,cg21870274,cg08258224,cg16619049,cg18147296,cg13938959,cg12445832,cg23999112,cg11527153,cg27573606,cg04195702,...,cg09635994,cg19004771,cg20569369,cg26034629,cg25232725,cg05615487,cg22122449,cg08423507,cg19565306,cg04700648
uf1829,1.0,0.714,0.214,0.454,0.056,0.0,0.059,0.792,0.956,0.917,...,,,,,,,,,,
uf1830,0.5,0.667,0.083,0.37,0.292,0.125,0.167,0.889,1.0,0.966,...,0.846,0.75,0.8,0.833,0.765,0.944,0.867,0.75,0.0,
uf1831,1.0,0.667,0.857,0.7,0.889,0.556,0.667,1.0,1.0,0.875,...,1.0,1.0,0.6,0.8,1.0,0.8,0.833,1.0,0.0,1.0
uf1832,,1.0,0.0,1.0,1.0,0.25,0.5,0.75,1.0,1.0,...,1.0,0.6,1.0,1.0,1.0,1.0,0.8,1.0,0.0,0.857
uf1837,1.0,1.0,0.833,0.8,0.5,0.5,0.5,0.8,1.0,1.0,...,1.0,0.75,1.0,1.0,0.8,0.75,1.0,0.667,0.0,1.0
uf1838,1.0,0.875,0.909,0.79,0.667,0.5,0.667,0.875,1.0,1.0,...,1.0,0.889,0.889,0.889,0.875,0.889,0.875,1.0,0.0,1.0


In [3]:
import pandas as pd

mount = '/mnt/e/'

sample_name = 'uf1837'

pacmap_reference = pd.read_csv('../data/pacmap_reference.bed', sep='\t',  names=['chrm', 'start', 'end', 'name', 'score', 'strand'])

df = pd.read_csv(mount + 'nanopore_processed/bed/' + sample_name +'.bed', sep='\t',names=["chrom", "start_position","end_position","modified base code","score",
                        'strand' ,"start position","end position", "color", "Nvalid_cov",
                        "fraction modified", "Nmod", "Ncanonical", "Nother_mod", "Ndelete",
                        "Nfail", "Ndiff", "Nnocall"])

# Create 'coordinate' column for merging
df['coordinate'] = df['chrom'].astype(str) + ':' + df['start_position'].astype(str)

df_filtered = df[df['modified base code'].isin(['m'])].set_index('coordinate')

pacmap_reference['coordinate'] = pacmap_reference['chrm'].astype(str) + ':' + pacmap_reference['start'].astype(str)
pacmap_reference = pacmap_reference.set_index('coordinate')

df_merged = pacmap_reference[['name']].join(df_filtered, how='inner')

# Transform the fraction modified into beta values
df_merged.loc[:, sample_name] = (df_merged['fraction modified'] / 100).round(3)

# Calculate the fraction modified and prepare the final DataFrame
df_merged.loc[:, sample_name] = (df_merged['fraction modified'] / 100).round(3)

df_processed = df_merged[['name', sample_name]].set_index('name').T

# sort columns
df_processed = df_processed.sort_index(axis=1)

In [4]:
df_processed

name,cg00000109,cg00000236,cg00000292,cg00000363,cg00000622,cg00000658,cg00000714,cg00000721,cg00000734,cg00000769,...,cg27665648,cg27665659,cg27665715,cg27665767,cg27665769,cg27665860,cg27665925,cg27665985,cg27666046,cg27666123
uf1837,1.0,0.875,1.0,0.714,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.4,1.0,0.833,0.0,1.0,0.0,0.625,0.143,0.667


## Other

In [10]:
import pandas as pd
import glob

# Get a list of all CSV files in the directory
file_list = glob.glob(mount + 'nanopore_processed/bed/*.bed')

# Create an empty list to store the dataframes
dfs = []

# Read each CSV file and append it to the list
for file in file_list:
    df = pd.read_csv(file, index_col=0, sep='\t', header=None, names=['chrm', 'start', 'end', 'name', 'score', 'strand'])
    dfs.append(df)

# Concatenate all dataframes in the list
df = pd.concat(dfs)

ParserError: Could not construct index. Requested to use 1 number of columns, but 12 left to parse.

In [9]:
df

Unnamed: 0_level_0,69590,69591,m,1,.,69590.1,69591.1,"255,0,0",1.1,100.00,...,3,3.1,66.67,2,2.1,2.2,898802,898803,898802.1,898803.1
chr1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1,898802.0,898803.0,m,7.0,.,898802.0,898803.0,25500,7.0,0.00,...,,,,,,,,,,
chr1,902155.0,902156.0,m,6.0,.,902155.0,902156.0,25500,6.0,83.33,...,,,,,,,,,,
chr1,903105.0,903106.0,m,8.0,.,903105.0,903106.0,25500,8.0,100.00,...,,,,,,,,,,
chr1,904371.0,904372.0,m,9.0,.,904371.0,904372.0,25500,9.0,11.11,...,,,,,,,,,,
chr1,904893.0,904894.0,m,9.0,.,904893.0,904894.0,25500,9.0,0.00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr22,,,m,2.0,.,,,25500,2.0,100.00,...,,,,,,,50735217.0,50735218.0,50735217.0,50735218.0
chr22,,,m,1.0,.,,,25500,1.0,100.00,...,,,,,,,50736935.0,50736936.0,50736935.0,50736936.0
chr22,,,m,3.0,.,,,25500,3.0,100.00,...,,,,,,,50737978.0,50737979.0,50737978.0,50737979.0
chr22,,,m,3.0,.,,,25500,3.0,66.67,...,,,,,,,50738282.0,50738283.0,50738282.0,50738283.0


## Appendix 1. Retrospective lower coverage analysis

```bash
samtools view -@ 32 -bh -s ${subsampling_fraction} bam/uf_hembank_${ID}.bam > bam/subsampled_${ID}_01x.bam
```