In [None]:
import subprocess
from pathlib import Path
import pandas as pd
import numpy as np

In [None]:
def seq_stats(seqfile):
    """Summarise a sequence file with ``seqkit stats``.

    Runs ``seqkit stats -abT <seqfile>`` and picks three fields from the last
    non-empty line of its output.

    Parameters
    ----------
    seqfile : str or os.PathLike
        Path of the sequence file handed to seqkit.

    Returns
    -------
    tuple of str
        ``(num_seqs, sum_len, n50)``: columns 3, 4 and 12 (0-based) of the
        report row, returned as the text seqkit printed (not converted to
        numbers).

    Raises
    ------
    subprocess.CalledProcessError
        If seqkit exits with a non-zero status.
    ValueError
        If the output contains no row with at least 13 columns.
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact, and a seqkit failure is no longer hidden behind
    # the exit status of a trailing `tail`.
    output = subprocess.check_output(['seqkit', 'stats', '-abT', str(seqfile)])
    rows = [line for line in output.decode().splitlines() if line.strip()]
    # Split on tabs only, so a file name containing spaces cannot shift the
    # column positions (assumes -T yields tab-separated output - see seqkit docs).
    fields = [field.strip() for field in rows[-1].split('\t')] if rows else []
    if len(fields) < 13:
        raise ValueError(f'unexpected seqkit stats output for {seqfile}: {output!r}')
    num_seqs, sum_len, n50 = fields[3], fields[4], fields[12]
    return num_seqs, sum_len, n50

In [None]:
# Root folder of the data set; the cells below read one subfolder per method.
dirpath = Path('/media/GenomicResearch/Issue/20220215_modification_paper_data')

# Method subfolders that the later cells loop over.
labels = [
    'ILLUMINA_WGS',
    'ONT_WGS',
    'ONT_WGA',
]

# Folder holding the reference data the query profiles are compared against.
refpath = dirpath.joinpath('RefSeqs')

In [None]:
def calculate_distance(reference, query):
    """Count the loci whose ``allele_id`` differs between two profile tables.

    Parameters
    ----------
    reference, query : str or path-like
        Tab-separated files. The first column becomes the row index and an
        ``allele_id`` column must be present.

    Returns
    -------
    numpy integer
        Number of index labels at which the two ``allele_id`` values differ.
        Missing values are replaced by 0 before comparing, so two missing
        calls count as equal and a missing call differs from any non-zero
        allele. A label present in only one table is treated as missing in
        the other.
    """
    reference_alleles = pd.read_csv(reference, sep='\t', index_col=0)['allele_id']
    query_alleles = pd.read_csv(query, sep='\t', index_col=0)['allele_id']
    # `!=` between two Series raises unless both carry the same labels in the
    # same order, so line the rows up by label first when they differ.
    if not query_alleles.index.equals(reference_alleles.index):
        query_alleles, reference_alleles = query_alleles.align(reference_alleles, join='outer')
    return (query_alleles.fillna(0) != reference_alleles.fillna(0)).sum()

In [None]:
# One (isolate, method, distance) record per query profile; each profile is
# compared with the reference profile named '<isolate>.tsv'.
data = [
    (
        query.stem,
        label,
        calculate_distance(refpath / 'Profile' / f'{query.stem}.tsv', query),
    )
    for label in labels
    for query in (dirpath / label / 'Profile').iterdir()
]

distance_df = pd.DataFrame(data, columns=['Isolate', 'Method', 'cgMLST_distance'])

In [None]:
def _mean_depth(depth_file):
    """Return the mean of the last whitespace-separated field over all lines.

    Each non-blank line of ``depth_file`` must end in an integer; blank lines
    are skipped. A file without any such line yields NaN (with NumPy's usual
    warning about the mean of an empty array).
    """
    with open(depth_file) as handle:
        # np.fromiter fills a compact integer array straight from the
        # generator instead of first building a Python list of int objects.
        depths = np.fromiter(
            (int(line.split()[-1]) for line in handle if line.strip()),
            dtype=np.int64,
        )
    return np.mean(depths)


# One (isolate, method, mean depth) record per file in each method's Depth folder.
data = []
for label in labels:
    for query in (dirpath/label/'Depth').iterdir():
        isolate = query.stem
        mean_depth = _mean_depth(query)
        data.append((isolate, label, mean_depth))

depth_df = pd.DataFrame(data, columns=['Isolate', 'Method', 'Depth'])

# Two decimals are enough for the report.
depth_df['Depth'] = depth_df['Depth'].round(2)

In [None]:
# Collect the accuracy table of every isolate first and concatenate once at
# the end: calling pd.concat inside the loop copies all previously gathered
# rows on every pass.
accuracy_frames = []
for label in labels:
    for query in (dirpath/label/'Assembly_accuracy').iterdir():
        isolate = query.stem
        df = pd.read_csv(query, sep='\t', usecols=['qscore', 'segment_median_qscore', 'num_mismatches', 'num_insertions', 'num_deletions'])
        # Label the row with the isolate name; this assignment fails unless
        # the table holds exactly one row.
        df.index = [isolate]
        df['Method'] = label
        accuracy_frames.append(df)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# accuracy file was found at all.
accuracy_df = pd.concat(accuracy_frames) if accuracy_frames else pd.DataFrame()
accuracy_df.index.name = 'Isolate'
accuracy_df = accuracy_df.reset_index()

In [None]:
# Build one table per method and concatenate once at the end instead of
# growing a DataFrame with pd.concat inside the loop.
sequence_frames = []
for label in labels:
    # seq_stats returns (num_seqs, sum_len, n50) per contig file; transpose so
    # that every isolate becomes a row.
    df = pd.DataFrame({query.stem: seq_stats(query) for query in (dirpath/label/'Contigs').iterdir()}, index=['contig_num', 'genome_size', 'N50']).T
    df['Method'] = label
    df.index.name = 'Isolate'
    df = df.reset_index()
    sequence_frames.append(df)

# ignore_index gives the combined table a unique running index rather than
# one that restarts at 0 for every method.
sequence_df = pd.concat(sequence_frames, ignore_index=True)

# seq_stats hands back text; convert all three count columns (genome_size
# included) to integers.
sequence_df = sequence_df.astype({'contig_num': int, 'genome_size': int, 'N50': int})

In [None]:
# Inner-join the four per-isolate tables on the two columns they share, so
# only isolate/method pairs present in every table are kept.
results = (
    distance_df
    .merge(accuracy_df, on=['Isolate', 'Method'])
    .merge(depth_df, on=['Isolate', 'Method'])
    .merge(sequence_df, on=['Isolate', 'Method'])
)
results.head()

In [None]:
# Reshape to one row per isolate with a (metric, method) column pair for each
# value; pivot_table aggregates with the mean by default.
pivot_table = results.pivot_table(
    values=[
        'cgMLST_distance',
        'Depth',
        'qscore',
        'segment_median_qscore',
        'num_mismatches',
        'num_insertions',
        'num_deletions',
        'N50',
        'contig_num',
    ],
    index=['Isolate'],
    columns=['Method'],
)

In [None]:
# Make the method the outer column level and the metric the inner one, then
# sort the columns so each method's metrics sit next to each other.
pivot_table = pivot_table.swaplevel(0, 1, axis=1).sort_index(axis=1)
pivot_table

In [None]:
# Lookup from the 'Key' column to the 'ST' column of the MLST table; the keys
# are matched against the isolate names in the pivot table's index.
sequence_type = pd.read_csv(
    '/media/GenomicResearch/Issue/20201221_hybrid_and_denovo/Unicycler/MLST.tsv',
    sep='\t',
    usecols=['Key', 'ST'],
    index_col=0,
)['ST'].to_dict()

# Put the sequence type in front of the metric columns (modifies pivot_table
# in place).
pivot_table.insert(loc=0, column='MLST', value=pivot_table.index.map(sequence_type))

In [None]:
# Export the finished table as an Excel workbook next to the input data.
pivot_table.to_excel('/media/GenomicResearch/Issue/20220215_modification_paper_data/report.xlsx')

In [None]:
%%time
with open('/media/GenomicResearch/Issue/20220215_modification_paper_data/ONT_WGS/Depth/R20-0140.txt') as handle:
    all_position_depth = (line.split()[-1] for line in handle)
    mean_depth = np.mean(list(map(lambda x: int(x), all_position_depth)))

In [None]:
# Display the mean depth computed by the timing cell above.
mean_depth