In [1]:
import pandas as pd
import sys
import os
import glob
from collections import defaultdict, OrderedDict
# from matplotlib import pyplot as plt
import pprint
# from IPython.display import display

from process_alignment_table import *

# %matplotlib inline

In [2]:
# Apply the notebook's pandas display settings before any table is shown.
# NOTE(review): this helper is not defined in the notebook; it presumably comes
# from the star import of process_alignment_table -- confirm what it changes.
set_pandas_display_options()

In [42]:
# Root directory holding one sub-directory per family (fam1, fam2, fam3).
directory = '/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam'


def _list_sample_folders(family_dir):
    """Return the sorted full paths of the sample folders inside ``family_dir``.

    Only sub-directories are returned, and a sub-directory named ``old`` is
    left out.  get_viral_alignment applies the same exclusion and the same
    sort order, so these paths line up one-to-one with the samples it
    processes.  Unlike ``list.remove``, filtering does not raise when no
    ``old`` folder exists.
    """
    return sorted(
        os.path.join(family_dir, folder)
        for folder in os.listdir(family_dir)
        if folder != 'old' and os.path.isdir(os.path.join(family_dir, folder))
    )


fam1_folders = _list_sample_folders(os.path.join(directory, 'fam1'))
fam2_folders = _list_sample_folders(os.path.join(directory, 'fam2'))
fam3_folders = _list_sample_folders(os.path.join(directory, 'fam3'))

fam1_folders

['/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam/fam1/MH0143008',
 '/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam/fam1/MH0143009',
 '/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam/fam1/MH0143013',
 '/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam/fam1/MH0143018',
 '/scratch/groups/dpwall/personal/chloehe/unmapped_reads/bam/fam1/MH0143019']

In [20]:
def get_viral_alignment(directory):
    """Tally the read pairs that hit a viral reference, for every sample of a family.

    Every sub-directory of ``directory`` except one named ``old`` is treated
    as a sample and is expected to contain a file matching
    ``*alignment_table.csv``.  A row of that table is counted as viral when
    ``R1_ref`` starts with ``NC`` or ``VIRL``; otherwise when ``R2_ref`` does.
    The row is booked under that viral reference, keyed by the reference of
    the other mate.  A missing reference (read as NaN) is never viral.

    Parameters
    ----------
    directory : str
        Family directory containing one sub-directory per sample.

    Returns
    -------
    family_tallied : list of OrderedDict
        One entry per sample, in sorted folder-name order.  Maps each viral
        reference to its number of read pairs, highest count first.
    family_counts : list of OrderedDict
        One entry per sample, in the same order.  Maps each viral reference
        (same key order as ``family_tallied``) to an OrderedDict of
        ``mate reference -> pair count``, highest count first.

    Raises
    ------
    IndexError
        If a sample folder holds no file matching ``*alignment_table.csv``.
    """
    viral_prefixes = ('NC', 'VIRL')
    # Column types for read_csv; identical for every sample, so built once.
    dtype = {'R1_ref': object, 'R1_start': object, 'R1_MAPQ': 'float64',
             'R2_ref': object, 'R2_start': object, 'R2_MAPQ': 'float64',
             'is_proper_pair': bool}

    def is_viral(ref):
        # An empty CSV field comes back as NaN (a float), which has no
        # .startswith; treat anything that is not a string as non-viral.
        return isinstance(ref, str) and ref.startswith(viral_prefixes)

    def by_count(item):
        return item[1]

    # Sample folders in sorted order; 'old' is skipped whether or not it exists.
    folders = sorted(
        folder for folder in os.listdir(directory)
        if folder != 'old' and os.path.isdir(os.path.join(directory, folder))
    )

    family_tallied = []
    family_counts = []
    for folder in folders:
        print(folder + '..')

        # read final_alignment_table.csv into memory
        file = glob.glob(os.path.join(directory, folder, '*alignment_table.csv'))[0]
        table = pd.read_csv(file, dtype=dtype, index_col=0)

        # viral reference -> reference of the other mate -> number of pairs
        viral_counts = defaultdict(lambda: defaultdict(int))
        for r1_ref, r2_ref in zip(table['R1_ref'], table['R2_ref']):
            if is_viral(r1_ref):
                viral_counts[r1_ref][r2_ref] += 1
            elif is_viral(r2_ref):
                viral_counts[r2_ref][r1_ref] += 1

        # sorted() is stable, so references with equal counts keep the order
        # in which they were first seen in the table.
        viral_total = OrderedDict(sorted(
            ((ref, sum(mates.values())) for ref, mates in viral_counts.items()),
            key=by_count, reverse=True))

        # Same viral references in the same order, each with its mates sorted
        # by decreasing count.
        viral_counts_sorted = OrderedDict(
            (ref, OrderedDict(sorted(viral_counts[ref].items(), key=by_count, reverse=True)))
            for ref in viral_total)

        family_tallied.append(viral_total)
        family_counts.append(viral_counts_sorted)

    return family_tallied, family_counts

**Family 1**

In [21]:
fam1_virus_tallied, fam1_virus_counts = get_viral_alignment(os.path.join(directory, 'fam1'))

# One single-column frame per sample (column = sample folder name, index =
# viral reference), placed side by side.  Column labels are taken by position
# from fam1_folders, so that list must hold the same samples in the same
# sorted order as get_viral_alignment processed them.
df1 = pd.concat(
    [
        pd.DataFrame.from_dict(
            sample_tally,
            orient='index',
            columns=[os.path.basename(fam1_folders[position])],
        )
        for position, sample_tally in enumerate(fam1_virus_tallied)
    ],
    axis=1,
)

MH0143008..
MH0143009..
MH0143013..
MH0143018..
MH0143019..


**Family 2**

In [22]:
fam2_virus_tallied, fam2_virus_counts = get_viral_alignment(os.path.join(directory, 'fam2'))

# One single-column frame per sample (column = sample folder name, index =
# viral reference), placed side by side.  Column labels are taken by position
# from fam2_folders, so that list must hold the same samples in the same
# sorted order as get_viral_alignment processed them.
df2 = pd.concat(
    [
        pd.DataFrame.from_dict(
            sample_tally,
            orient='index',
            columns=[os.path.basename(fam2_folders[position])],
        )
        for position, sample_tally in enumerate(fam2_virus_tallied)
    ],
    axis=1,
)

02C10540..
02C10541..
02C10542..
02C10543..


**Family 3**

In [23]:
fam3_virus_tallied, fam3_virus_counts = get_viral_alignment(os.path.join(directory, 'fam3'))

# One single-column frame per sample (column = sample folder name, index =
# viral reference), placed side by side.  Column labels are taken by position
# from fam3_folders, so that list must hold the same samples in the same
# sorted order as get_viral_alignment processed them.
df3 = pd.concat(
    [
        pd.DataFrame.from_dict(
            sample_tally,
            orient='index',
            columns=[os.path.basename(fam3_folders[position])],
        )
        for position, sample_tally in enumerate(fam3_virus_tallied)
    ],
    axis=1,
)

03C16794..
03C16795..
03C16796..
03C16797..
03C16798..


**Combined**

In [46]:
# Minimum pair count a viral reference must reach in at least one sample to
# be kept in the combined table.
threshold = 10

# Join the three families column-wise, keep only the references that reach the
# threshold somewhere, and count a reference absent from a sample as 0.
df = (
    pd.concat([df1, df2, df3], axis=1)
    .loc[lambda counts: counts.ge(threshold).any(axis=1)]
    .fillna(0)
)

# Mean count across all samples, shown as the first column and used to rank rows.
df.insert(0, 'pop_average', df.mean(numeric_only=True, axis=1))
df = df.sort_values(by='pop_average', ascending=False)
df

Unnamed: 0,pop_average,MH0143008,MH0143009,MH0143013,MH0143018,MH0143019,02C10540,02C10541,02C10542,02C10543,03C16794,03C16795,03C16796,03C16797,03C16798
VIRL|gi|9626372|ref|NC_001422.1|,412439.642857,66886.0,349630.0,170851.0,401790.0,185424.0,644015.0,659172.0,299970.0,724881.0,546050.0,700430.0,214390.0,338342.0,472324.0
NC_048798.1,36990.714286,64084.0,161184.0,64548.0,79940.0,63766.0,311.0,602.0,778.0,686.0,9444.0,6612.0,15207.0,22171.0,28537.0
VIRL|gi|73852470|ref|NC_007346.1|,109.714286,69.0,104.0,112.0,122.0,107.0,97.0,110.0,154.0,100.0,108.0,113.0,121.0,117.0,102.0
VIRL|gi|9629210|ref|NC_001782.1|,60.928571,100.0,128.0,82.0,54.0,78.0,1.0,0.0,1.0,3.0,88.0,88.0,80.0,80.0,70.0
VIRL|gi|131840030|ref|NC_009127.1|,50.428571,64.0,52.0,53.0,51.0,71.0,38.0,33.0,101.0,29.0,31.0,29.0,47.0,51.0,56.0
VIRL|gi|56694721|ref|NC_006560.1|,38.357143,48.0,44.0,54.0,46.0,53.0,25.0,23.0,38.0,31.0,27.0,25.0,39.0,45.0,39.0
VIRL|gi|22129792|ref|NC_004102.1|,21.785714,20.0,53.0,28.0,20.0,26.0,0.0,0.0,0.0,0.0,30.0,45.0,28.0,29.0,26.0
VIRL|gi|51874225|ref|NC_001716.2|,16.285714,4.0,7.0,5.0,7.0,61.0,13.0,21.0,28.0,33.0,8.0,10.0,4.0,9.0,18.0
VIRL|gi|157781212|ref|NC_009823.1|,15.857143,13.0,46.0,13.0,19.0,15.0,1.0,0.0,0.0,0.0,16.0,27.0,31.0,21.0,20.0
NC_022518.1,15.714286,21.0,28.0,11.0,23.0,18.0,4.0,6.0,5.0,7.0,13.0,13.0,29.0,17.0,25.0


In [50]:
# Show, for each family-1 sample, which references the mates of reads hitting
# this virus aligned to.  Samples with no pair on this virus are skipped
# explicitly instead of hiding every possible error behind a bare except.
virus_id = 'VIRL|gi|295441905|ref|NC_014096.1|'
for counts_dict in fam1_virus_counts:
    if virus_id in counts_dict:
        pprint.pprint(counts_dict[virus_id])

OrderedDict([('VIRL|gi|295441905|ref|NC_014096.1|', 144),
             ('VIRL|gi|295413923|ref|NC_014075.1|', 15),
             ('VIRL|gi|295441884|ref|NC_014091.1|', 9),
             ('VIRL|gi|295413928|ref|NC_014076.1|', 4),
             ('unmapped', 3),
             ('VIRL|gi|295413958|ref|NC_014081.1|', 2),
             ('BACT_95|gi|226942170|ref|NC_012560.1|', 1),
             ('EUKY_40|gi|50557461|ref|NC_006069.1|', 1),
             ('VIRL|gi|295413954|ref|NC_014080.1|', 1)])


In [51]:
# Show, for each family-1 sample, which references the mates of reads hitting
# this virus aligned to.  Samples with no pair on this virus are skipped
# explicitly instead of hiding every possible error behind a bare except.
virus_id = 'VIRL|gi|48696722|ref|NC_005881.1|'
for counts_dict in fam1_virus_counts:
    if virus_id in counts_dict:
        pprint.pprint(counts_dict[virus_id])

OrderedDict([('VIRL|gi|48696722|ref|NC_005881.1|', 92)])
OrderedDict([('chr4', 1)])
OrderedDict([('VIRL|gi|48696722|ref|NC_005881.1|', 30)])
OrderedDict([('chr7', 1)])
