In [1]:
import pandas as pd
import os
import re

In [2]:
# Keep only the run directories dated September, October, or November:
# the name must begin with 2009, 2010 or 2011 (YYMMDD prefix) and be exactly
# 31 characters long (4 matched + 27 more, anchored at both ends).
path = os.listdir("/ghds/cv19/analysis")
r = re.compile(r"20(09|10|11).{27}$")
file_names2 = [entry for entry in path if r.match(entry)]

In [3]:
# Load the per-well read statistics of every selected run and tag each well
# with its own flow cell's mean read count and low-read cutoff.
all_fc = []
for run_dir in file_names2:
    # Only the first four columns of the stats file are used downstream.
    current = pd.read_csv(
        f"/ghds/cv19/analysis/{run_dir}/pool_stats.hdr.tsv", sep='\t'
    ).iloc[:, :4]
    mean = current['reads_total'].mean()
    std_dev = current['reads_total'].std()
    # A well counts as "low" when it is more than 2 standard deviations below
    # the mean of its own flow cell, so the cutoff is computed per file.
    all_fc.append(current.assign(mean=mean, lower_bound=mean - 2 * std_dev))

# ignore_index=True renumbers the rows 0..n-1 directly. The previous
# reset_index().drop(columns=['level_0']) only worked because the data already
# contains a column literally named "index".
all_flow_cells = pd.concat(all_fc, ignore_index=True)

# pos_pooling looks like "PRP200807A048:A1"; the well label starts at
# character 14. The row is its first character and the column is everything
# after it, so two-digit columns (e.g. "A10") are no longer truncated to "1".
well = all_flow_cells['pos_pooling'].str[14:]
all_flow_cells['well_position'] = well
all_flow_cells['row'] = well.str[0]
all_flow_cells['column'] = well.str[1:]

# The first six characters of runid are the run date (YYMMDD).
all_flow_cells['date'] = all_flow_cells['runid'].str[:6].astype('int')

# Wells dated 200925 are dropped because of their weird naming convention.
# A boolean filter (instead of an in-place drop) keeps the original row labels.
all_flow_cells = all_flow_cells[all_flow_cells['date'] != 200925].copy()
all_flow_cells

Unnamed: 0,runid,pos_pooling,index,reads_total,mean,lower_bound,well_position,row,column,date
0,200929_NB551639_0159_AH5FN7BGXG,PRP200807A048:A1,CCAGCCTG_AATTTGGC,5799672,7.087281e+06,3.357593e+06,A1,A,1,200929
1,200929_NB551639_0159_AH5FN7BGXG,PRP200807A048:A2,AGCTCGGC_GTGCAAGT,6176015,7.087281e+06,3.357593e+06,A2,A,2,200929
2,200929_NB551639_0159_AH5FN7BGXG,PRP200807A048:A3,ATACGCTC_ACATAGGA,8709182,7.087281e+06,3.357593e+06,A3,A,3,200929
3,200929_NB551639_0159_AH5FN7BGXG,PRP200807A048:A4,ATGACCTT_ATACTCGG,7912102,7.087281e+06,3.357593e+06,A4,A,4,200929
4,200929_NB551639_0159_AH5FN7BGXG,PRP200807A048:A5,CTTGGCAA_GTGCGCCC,5568258,7.087281e+06,3.357593e+06,A5,A,5,200929
...,...,...,...,...,...,...,...,...,...,...
1975,201027_NB552398_0034_AHHFJNBGXG,PRP200924A027:H2,ACAGCTCT_ACTGAGAA,11167511,7.827200e+06,3.500990e+06,H2,H,2,201027
1976,201027_NB552398_0034_AHHFJNBGXG,PRP200924A027:H3,CCTACTTA_GTATGGAG,11958874,7.827200e+06,3.500990e+06,H3,H,3,201027
1977,201027_NB552398_0034_AHHFJNBGXG,PRP200924A027:H4,ATAGCATG_GGAAACCC,8836723,7.827200e+06,3.500990e+06,H4,H,4,201027
1978,201027_NB552398_0034_AHHFJNBGXG,PRP200924A027:H5,CCTCGTCT_TTACGAAC,9688476,7.827200e+06,3.500990e+06,H5,H,5,201027


### Finding row, column, and well total counts for all runs

In [4]:
# Number of wells observed per plate row, plate column and well position,
# across every loaded run.
track_row, track_column, track_wellpos = (
    all_flow_cells[field].value_counts()
    for field in ('row', 'column', 'well_position')
)

### Finding row, column, and well total counts for low read runs

In [5]:
# Wells whose read count is below the cutoff of their own flow cell.
low_reads = all_flow_cells.query('reads_total < lower_bound')

# Same three tallies as for all wells, restricted to the low-read subset.
low_read_row, low_read_column, low_read_wellpos = (
    low_reads[field].value_counts()
    for field in ('row', 'column', 'well_position')
)

### Combine all-run and low-read counts to find percentage occurrence

In [6]:
def _low_read_share(all_counts: pd.Series, low_counts: pd.Series) -> pd.DataFrame:
    """Compare total and low-read counts for one grouping.

    Parameters
    ----------
    all_counts : counts per label over every well.
    low_counts : counts per label over the low-read wells only.

    Returns
    -------
    DataFrame indexed by label with columns ``all``, ``low_reads`` and
    ``percent`` (``low_reads / all * 100``). Labels missing from either
    input are dropped, so a label with no low-read wells does not appear.
    """
    # axis=1 aligns the two Series on their labels; keys names the columns.
    paired = pd.concat(
        [all_counts, low_counts], axis=1, keys=['all', 'low_reads']
    ).dropna()
    return paired.assign(percent=(paired['low_reads'] / paired['all']) * 100)


row_comp = _low_read_share(track_row, low_read_row)
column_comp = _low_read_share(track_column, low_read_column)
wellpos_comp = _low_read_share(track_wellpos, low_read_wellpos)


In [7]:
# Plate rows ranked from the highest to the lowest low-read percentage.
row_comp.sort_values('percent', ascending=False)

Unnamed: 0,all,low_reads,percent
F,218,13.0,5.963303
G,218,4.0,1.834862
E,218,4.0,1.834862
D,241,4.0,1.659751
B,241,3.0,1.244813
C,241,2.0,0.829876
A,241,1.0,0.414938


In [10]:
# Plate columns ranked from the highest to the lowest low-read percentage.
column_comp.sort_values('percent', ascending=False)

Unnamed: 0,all,low_reads,percent
8,16,1,6.25
5,156,7,4.487179
7,32,1,3.125
3,384,7,1.822917
1,496,9,1.814516
4,176,3,1.704545
6,136,1,0.735294
2,440,2,0.454545


In [9]:
# Well positions ranked from the highest to the lowest low-read percentage.
wellpos_comp.sort_values('percent', ascending=False)

Unnamed: 0,all,low_reads,percent
D8,4,1.0,25.0
B7,4,1.0,25.0
F5,17,4.0,23.529412
F3,48,5.0,10.416667
E5,17,1.0,5.882353
A6,17,1.0,5.882353
G1,62,3.0,4.83871
D5,22,1.0,4.545455
C4,22,1.0,4.545455
F4,22,1.0,4.545455
