## Load Modules

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import glob
import os

## Functions

In [2]:
def extract_table_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    table = soup.find("table", class_="dataframe")
    if table is None:
        return None
    
    headers = [th.text for th in table.find("thead").find_all("th")]
    rows = []
    for tr in table.find("tbody").find_all("tr"):
        cells = [td.text.strip() for td in tr.find_all("td")]
        rows.append(cells)
    
    return pd.DataFrame(rows, columns=headers)

In [3]:
def extract_all_html(file_list, has_pcw=False):
    data = []

    for file in file_list:
        df = extract_table_from_html(file)
        if has_pcw:
            df['cluster'] = file.split('/')[9].split('_')[0]
            df['pcw'] = file.split('/')[9].split('_')[1]
            df['fold'] = file.split('/')[10]
            cols_left = ['cluster', 'pcw', 'fold']
            idx_cols = [c for c in df.columns if c in cols_left]
            idx_cols += [c for c in df.columns if c not in cols_left and 'logo' not in c and 'cwm' not in c]
        else:
            df['cluster'] = file.split('/')[9]
            df['fold'] = file.split('/')[10]
            cols_left = ['cluster', 'fold']
            idx_cols = [c for c in df.columns if c in cols_left]
            idx_cols += [c for c in df.columns if c not in cols_left and 'logo' not in c and 'cwm' not in c]
        df = df[idx_cols]
        
        if df is not None:
            data.append(df)
    
    # return concatenated data
    return pd.concat(data, ignore_index=True)

catdtype_pattern = pd.CategoricalDtype(categories=['pos', 'neg'], ordered=True)

def reorder_by_pcw(df):
    # extract ordering values
    df['pcw'] = df['cluster'].str.extract(r'PCW(\d+)').astype(int)
    df['pattern_type'] = df['pattern'].str.extract(r'(pos|neg)').astype(catdtype_pattern)
    df['pattern_idx'] = df['pattern'].str.extract(r'pattern_(\d+)').astype(int)
    vars_order = ['pcw', 'pattern_type', 'pattern_idx']
    
    # sort & tidy
    df = df.sort_values(by=vars_order).drop(columns=vars_order)

    return df

## Identify Files

In [4]:
# Process all HTML files in the current directory
html_str = "/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/*/*/modisco/counts_scores/motifs.html"
html_files_all = glob.glob(html_str)

In [5]:
folds = ["mean"]
html_dict = {
    'hsc_pcw': {fold: [f for f in html_files_all if ("PCW" in f) and (fold in f) and ("HSC_" in f)] for fold in folds}
}

for k, fs in html_dict.items():
    for f, hs in fs.items():
        print(f'{k} - {f}: {len(hs)}')

hsc_pcw - mean: 14


In [6]:
html_dict

{'hsc_pcw': {'mean': ['/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW8/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW14/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW16/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW5/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW6/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW18/mean/modisco/counts_scores/motifs.html',
   '/work/aaa/projects/chrombpnet-devmult/pipeline/results/chrombpnet_nobias/hsc_unified/HSC_PCW17/mean/modisco/counts_scores/motifs.html',
   '

# Load Data

In [7]:
df_dict = {k: {f: extract_all_html(hs) if len(hs) > 0 else None for f, hs in fs.items()} for k, fs in html_dict.items()}
df_dict = {k: {f: reorder_by_pcw(df) for f, df in fs.items()} for k, fs in df_dict.items()}
for k, fs in df_dict.items():
    for f, df in fs.items():
        print(f'{k} - {f}: {0 if df is None else len(df)}')

hsc_pcw - mean: 943


In [8]:
for k, fs in df_dict.items():
    for f, df in fs.items():
        if df is not None:
            display(df)

Unnamed: 0,cluster,fold,pattern,num_seqlets,match0,qval0,match1,qval1,match2,qval2,...,match5,qval5,match6,qval6,match7,qval7,match8,qval8,match9,qval9
220,HSC_PCW5,mean,pos_patterns.pattern_0,87175,ETV6.H13CORE.1.P.B,6.256450e-04,EHF.H13CORE.0.P.B,1.998510e-03,ERF.H13CORE.0.PS.A,1.998510e-03,...,SPI1.H13CORE.0.P.B,6.073680e-03,SPIB.H13CORE.0.P.B,7.838080e-03,SPIB.H13CORE.2.SM.B,7.838080e-03,ETV1.H13CORE.1.PM.A,1.174170e-02,ETV2.H13CORE.1.PM.A,1.174170e-02
221,HSC_PCW5,mean,pos_patterns.pattern_1,37227,CTCF.H13CORE.0.P.B,6.511990e-12,CTCFL.H13CORE.0.P.B,9.159230e-07,ZNF503.H13CORE.0.P.B,1.356880e-01,...,MUSC.H13CORE.0.SM.B,1.613900e-01,ZIC2.H13CORE.0.P.B,1.916780e-01,ZIC3.H13CORE.0.P.B,1.916780e-01,NDF2.H13CORE.0.P.B,1.916780e-01,ZIC5.H13CORE.0.P.B,3.607400e-01
222,HSC_PCW5,mean,pos_patterns.pattern_2,13631,SP3.H13CORE.0.P.B,9.538220e-07,SP1.H13CORE.2.P.B,9.538220e-07,SP1.H13CORE.0.P.B,1.378540e-05,...,SP4.H13CORE.0.P.C,5.216870e-05,KLF8.H13CORE.0.P.C,8.343540e-05,KLF9.H13CORE.1.P.B,2.190180e-04,SP2.H13CORE.0.P.D,4.060670e-04,SP5.H13CORE.0.P.B,8.455530e-04
223,HSC_PCW5,mean,pos_patterns.pattern_3,7293,RUNX3.H13CORE.0.P.B,6.307710e-02,RUNX2.H13CORE.0.P.B,6.307710e-02,RUNX1.H13CORE.0.P.B,6.307710e-02,...,RUNX3.H13CORE.2.S.B,5.547530e-01,RUNX2.H13CORE.1.S.B,5.547530e-01,FOXH1.H13CORE.0.P.B,7.128760e-01,TCF7.H13CORE.0.PSM.A,8.150840e-01,RUNX3.H13CORE.1.S.B,1.000000e+00
224,HSC_PCW5,mean,pos_patterns.pattern_4,5862,SPIB.H13CORE.1.S.C,3.203210e-02,SPI1.H13CORE.0.P.B,1.866570e-01,SPIB.H13CORE.0.P.B,1.866570e-01,...,ZNF92.H13CORE.0.P.B,3.059600e-01,SPIB.H13CORE.2.SM.B,3.059600e-01,SPI1.H13CORE.1.S.B,3.059600e-01,TWST1.H13CORE.0.P.B,3.059600e-01,TWST1.H13CORE.1.P.B,3.059600e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,HSC_PCW18,mean,neg_patterns.pattern_1,42,ZN248.H13CORE.0.P.B,1.000000e+00,REL.H13CORE.0.P.B,1.000000e+00,ZNF7.H13CORE.0.P.B,1.000000e+00,...,NFIA.H13CORE.1.PS.A,1.000000,TEAD4.H13CORE.1.P.B,1.000000,NR6A1.H13CORE.0.P.C,1.000000,TEAD1.H13CORE.0.PSM.A,1.000000,EHF.H13CORE.0.P.B,1.000000
414,HSC_PCW18,mean,neg_patterns.pattern_2,40,ZN410.H13CORE.0.SM.B,1.132100e-01,ZN264.H13CORE.0.P.B,4.434950e-01,ZKSCAN4.H13CORE.0.SG.A,1.000000e+00,...,ZNF831.H13CORE.0.PSGI.A,1.000000,,,,,,,,
415,HSC_PCW18,mean,neg_patterns.pattern_3,38,ZNF841.H13CORE.0.PSGI.A,1.610920e-01,ZN296.H13CORE.0.S.C,5.335420e-01,ZN610.H13CORE.1.P.C,5.335420e-01,...,SOX10.H13CORE.2.S.C,1.000000,ZSCA4.H13CORE.0.SM.B,1.000000,ZXDA.H13CORE.0.PSI.A,1.000000,ZNF26.H13CORE.1.P.B,1.000000,ZNF587B.H13CORE.0.SG.A,1.000000
416,HSC_PCW18,mean,neg_patterns.pattern_4,23,HXB6.H13CORE.0.SM.B,1.000000e+00,HXB8.H13CORE.0.PSM.A,1.000000e+00,,,...,,,,,,,,,,


# Export Files

In [9]:
# Save to TSV and XLSX
os.makedirs("motifs", exist_ok=True)

for k, fs in df_dict.items():
    for f, df in fs.items():
        if df is not None:
            df.to_csv(f"motifs/df_counts_motifs_{k}_{f}.tsv.gz", sep="\t", index=False)
            df.to_excel(f"motifs/df_counts_motifs_{k}_{f}.xlsx", index=False)