# Test Dataset (Work in Progress)

1. Basecall and align using Dorado and minimap2:

    ```bash
    # Basecall the POD5 directory with the HAC model plus 5mCG/5hmCG modified-base
    # calling. Supplying --reference makes dorado align the reads as well, and the
    # resulting (unsorted) BAM is written to stdout, hence the redirect.
    # Each continued line ends in a backslash so the whole thing is ONE command.
    ~/dorado-0.7.3-linux-x64/bin/dorado basecaller hac,5mCG_5hmCG \
        uf1848/20240905_1712_P2S-02395-A_PAW87958_2d796b97/pod5/ \
        --reference ref/hg38.fna \
        --trim adapters > uf1848/uf1848_hac5_unsorted.bam
    ```

2. Sort and index the BAM:

    ```bash
    # Coordinate-sort the unsorted BAM produced in step 1; --write-index builds the
    # index in the same pass, so no separate `samtools index` call is needed.
    # NOTE(review): the input/output names were changed from uf1857/hac430 to
    # uf1848/hac5 so this step consumes step 1's output — confirm the sample ID.
    samtools sort --write-index -O BAM --reference ref/hg38.fna \
        -o uf1848_hac5.bam uf1848/uf1848_hac5_unsorted.bam
    ```

3. Create a bedMethyl file with modkit:
    
    ```bash
    # Pile up modified-base calls from the sorted BAM into a bedMethyl table.
    # ID must be set to the sample identifier beforehand; the paths are quoted so
    # an ID containing spaces or glob characters cannot split the arguments.
    # Options used: 32 threads, CpG sites only (needs --ref), counts from both
    # strands combined, and the `h` modification code ignored.
    modkit pileup "bam/uf_hembank_${ID}.bam" "bed/uf_hembank_${ID}.bed" \
        -t 32 --combine-strands --cpg --ignore h --ref ref/hg38.fna
    ```

## Create a BED6 file to harmonize EPIC array probes with nanopore data

In [None]:
import pandas as pd

mount = '/mnt/e/'

input_path = mount + 'nanopore_processed/'

# Load the discovery matrix (only the discovery set is read here), sort its rows
# by index and drop the first column.
# NOTE(review): presumably the first column is not a CpG probe — confirm.
df_discovery = pd.read_pickle(mount+'MethylScore_v2/Intermediate_Files/'+'3308samples_333059cpgs_withbatchcorrection_bvalues.pkl').sort_index().iloc[:,1:]

# EPIC array annotation (GRCh38): genomic coordinates and orientation per probe,
# indexed by probe ID.
array_reference = pd.read_csv("/mnt/c/Users/fmarc/OneDrive/Desktop/nanopore_processed/ref/EPIC.anno.GRCh38.tsv.gz", sep='\t', compression='gzip',
                              usecols=['chrm','start','end','orientation', 'probeID']
                              ).set_index('probeID').sort_index()

# Keep only the probes that are columns of the discovery matrix.
# The index is named `name` (BED column 4) explicitly before it is turned into a
# column, so the result does not depend on whichever name the index happened to
# inherit from `df_discovery.columns` (previously it had to be exactly `IlmnID`).
pacmap_reference = (
    array_reference.loc[df_discovery.columns]
    .rename_axis('name')
    .reset_index()
    .rename(columns={'orientation': 'strand'})
)

# Translate the annotation's orientation labels into BED strand symbols.
# Any label other than 'down'/'up' (or a missing value) becomes '.', the BED
# "no strand" marker, instead of an empty field in the output file.
pacmap_reference['strand'] = (
    pacmap_reference['strand'].map({'down': '-', 'up': '+'}).fillna('.')
)

# BED6 requires a score column; it carries no information here.
pacmap_reference['score'] = 0

# Arrange the columns in BED6 order
pacmap_reference = pacmap_reference[['chrm', 'start', 'end', 'name', 'score', 'strand', ]]

# Sort by `chrm` and `start`
pacmap_reference = pacmap_reference.sort_values(by=['chrm', 'start'])

# BED files have neither a header row nor an index column
pacmap_reference.to_csv('../data/pacmap_reference.bed', sep='\t', index=False, header=False)

# Read the file back to check what was actually written
pacmap_reference = pd.read_csv('../data/pacmap_reference.bed', sep='\t', header=None, names=['chrm', 'start', 'end', 'name', 'score', 'strand'])
pacmap_reference

## Where is the data?

In [1]:
import pandas as pd
import glob

# Collect every CSV in the directory. glob returns paths in arbitrary order, so
# they are sorted to make the row order of the combined table reproducible.
file_list = sorted(glob.glob('../../pacmap/*.csv'))

# Fail with an explicit message rather than pandas' generic
# "No objects to concatenate" when the pattern matches nothing.
if not file_list:
    raise ValueError("No CSV files found matching '../../pacmap/*.csv'")

# One DataFrame per file, using the first CSV column as the index
dfs = [pd.read_csv(file, index_col=0) for file in file_list]

# Stack all per-file tables into a single DataFrame
df = pd.concat(dfs)

In [2]:
# Display the concatenated DataFrame (the notebook renders it as the table below)
df

Unnamed: 0,cg00000109,cg00000236,cg00000292,cg00000363,cg00000622,cg00000658,cg00000714,cg00000721,cg00000734,cg00000769,...,cg21278787,cg22708233,cg23138682,cg23282051,cg24677744,cg25023752,cg26185531,cg26293201,cg26576875,cg07148304
uf_hembank_1852,1.0,0.913,0.615,0.174,0.0,0.964,0.25,1.0,0.0,0.0,...,,,,,,,,,,
uf_hembank_1831,1.0,0.8,0.786,0.143,0.0,0.933,0.25,0.933,0.0,0.0,...,,,,,,,,,,
uf_hembank_1829,0.889,0.632,0.75,0.0,0.0,1.0,0.1,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,
uf_hembank_1830,0.96,0.5,0.615,0.294,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
uf_hembank_1832,1.0,1.0,0.8,0.0,0.0,1.0,0.0,1.0,0.0,,...,,0.0,,,,,,,,
uf_hembank_1841,0.714,0.9,0.667,0.0,0.0,0.923,0.091,0.929,0.0,0.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## Appendix 1. Retrospective lower coverage analysis

```bash
# Randomly subsample reads from a sample's BAM to emulate lower coverage.
# Both subsampling_fraction and ID must be set beforehand. For `-s`, the integer
# part of the value is the random seed and the fractional part is the share of
# reads kept. Expansions are quoted so unusual values cannot split arguments.
# NOTE(review): the output name always ends in `_01x`, whatever fraction is used.
samtools view -@ 32 -bh -s "${subsampling_fraction}" "bam/uf_hembank_${ID}.bam" \
  > "bam/subsampled_${ID}_01x.bam"
```