In [1]:
# standard data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

# convenience
from tqdm import tqdm

In [2]:
def write_csv_to_wig_file(input_file, output_file, track_name, track_description, 
                          value_column='TISCounts', transform=None, 
                          input_dirpath='/lab/barcheese01/smaffa/filtered_tis_data',
                          output_dirpath='/lab/barcheese01/smaffa/igv_files'):
    if '.csv' in input_file:
        tis_df = pd.read_csv(os.path.join(input_dirpath, input_file))
    elif '.txt' in input_file:
        tis_df = pd.read_csv(os.path.join(input_dirpath, input_file), sep='\t')

    # trim the input table into the necessary columns for the output wig file
    df_slim = tis_df.loc[:, [value_column, 'GenomePos']]
    genome_annotations = df_slim['GenomePos'].str.split(':', expand=True)
    genome_annotations.columns = ['chr', 'span', 'strand']
    spans = genome_annotations['span'].str.split('-', expand=True)
    spans.columns = ['start', 'end']
    df_slim = pd.concat([df_slim, genome_annotations, spans], axis=1)
    df_slim['locus'] = df_slim.apply(lambda x: x['start'] if x['strand'] == '+' else x['end'] if x['strand'] == '-' else 'None', axis=1)

    if transform is not None:
        df_slim[value_column] = transform(df_slim[value_column])

    with open(os.path.join(output_dirpath, output_file), "w") as f:
        for chr, idx in df_slim.groupby('chr').groups.items():
            if 'chr' in chr:
                f.write(f'track type=wiggle_0 name={track_name} description={track_description}\n')
                f.write(f'variableStep chrom={chr} span=1\n')
                subset = df_slim.loc[idx, :].sort_values('locus')
                for locus, counts in zip(subset['locus'].tolist(), subset[value_column].tolist()):
                    if counts != 0:
                        f.write(f'{locus} {counts}\n')
    print(f'Saved file to {os.path.join(output_dirpath, output_file)}')


# Global wig files

In [7]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_predict_all.txt',
    output_file='HeLa_all_ribotish_counts.wig',
    track_name = 'HeLa_all_ribotish_counts',
    track_description = 'All ribotish TIS counts',
    input_dirpath='/lab/barcheese01/aTIS_data/ribotish'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_all_ribotish_counts.wig


In [8]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_predict_all.txt',
    output_file='HeLa_all_ribotish_qvals.wig',
    value_column='TISQvalue',
    transform=lambda x: (-np.log10(x)).replace([np.inf, -np.inf], 100).fillna(0),
    track_name = 'HeLa_all_ribotish_log10_qval',
    track_description = 'All ribotish TISDiff q-values',
    input_dirpath='/lab/barcheese01/aTIS_data/ribotish'
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_all_ribotish_qvals.wig


In [15]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_predict_all.txt',
    output_file='HeLa_all_ribotish_frameq.wig',
    value_column='FrameQvalue',
    transform=lambda x: (-np.log10(x)).replace([np.inf, -np.inf], 100).fillna(0),
    track_name = 'HeLa_all_ribotish_log10_frame_qval',
    track_description = 'All ribotish TISDiff q-values',
    input_dirpath='/lab/barcheese01/aTIS_data/ribotish'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_all_ribotish_frameq.wig


# Strategy-specific wig files

In [9]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_filtered.csv',
    output_file='HeLa_exclusion_window_30nt.wig',
    track_name = 'HeLa_exclusion_window_30nt',
    track_description = 'Filtering by iteratively highest readcount, then exclusion of TISs within 30nt downstream'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_exclusion_window_30nt.wig


In [10]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_filtered2.csv',
    output_file='HeLa_exclusion_window_30nt_mincount_5.wig',
    track_name = 'HeLa_exclusion_window_30nt_mincount_5',
    track_description = 'Filtering by iteratively highest readcount, then exclusion of TISs within 30nt downstream, with a minimum readcount of 5'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_exclusion_window_30nt_mincount_5.wig


In [11]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_filtered3.csv',
    output_file='HeLa_exclusion_window_45nt.wig',
    track_name = 'HeLa_exclusion_window_45nt',
    track_description = 'Filtering by iteratively highest readcount, then exclusion of TISs within 45nt downstream'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_exclusion_window_45nt.wig


In [12]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_alt_filtered.csv',
    output_file='HeLa_peak_finding_qval_increase_order2_mincount_5.wig',
    track_name = 'HeLa_peak_finding_qval_increase_order2_mincount_5',
    track_description = 'Filtering by peak finding using a minimum readcount of 5 and a TISDiff increase in significance by 10^2'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_peak_finding_qval_increase_order2_mincount_5.wig


In [13]:
write_csv_to_wig_file(
    input_file='HeLa_TIS_alt_filtered2.csv',
    output_file='HeLa_peak_finding_qval_increase_order3_mincount_5.wig',
    track_name = 'HeLa_peak_finding_qval_increase_order3_mincount_5',
    track_description = 'Filtering by peak finding using a minimum readcount of 5 and a TISDiff increase in significance by 10^3'
)

Saved file to /lab/barcheese01/smaffa/igv_files/HeLa_peak_finding_qval_increase_order3_mincount_5.wig


In [None]:
tis_df = pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_filtered.csv'))
alt_tis_df = pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_alt_filtered.csv'))

NameError: name 'pd' is not defined

In [15]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_filtered.csv')).shape

(24963, 19)

In [16]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_filtered2.csv')).shape

(17298, 19)

In [17]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_filtered3.csv')).shape

(15842, 19)

In [None]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_alt_filtered.csv')).assign(
    RecatTisType=lambda x: 
        
        'Annotated' if ('Annotated' in x['TisType']) or ('Known' in x['TisType']) else
        'Truncated' if 'Truncated' in x['TisType'] else
        'uORF' if "5' UTR" in x['TisType'] else
        'Extended' if 'Extended' in x['TisType'] else
        'Other',
        axis=1
).value_counts(['RecatTisType'])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [7]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_alt_filtered.csv')).groupby(['TisType']).nunique()['Symbol']

TisType
3'UTR                          2
3'UTR:CDSFrameOverlap          1
3'UTR:Known                    2
5'UTR                        379
5'UTR:CDSFrameOverlap          7
5'UTR:Known                    8
Annotated                   6414
Extended                     309
Extended:CDSFrameOverlap      12
Extended:Known                 9
Internal                      16
Internal:CDSFrameOverlap       2
Internal:Known                 1
Truncated                    664
Truncated:Known               32
Name: Symbol, dtype: int64

In [8]:
pd.read_csv(os.path.join('/lab/barcheese01/aTIS_data/ribotish/HeLa_TIS_filtered_predictor.txt'), sep='\t').groupby(['TisType']).nunique()['Symbol']

TisType
3'UTR                          9
3'UTR:CDSFrameOverlap        509
3'UTR:Known                   71
5'UTR                        447
5'UTR:CDSFrameOverlap         93
5'UTR:Known                   77
Annotated                   5956
Extended                    1825
Extended:CDSFrameOverlap     330
Extended:Known                98
Internal                      65
Internal:CDSFrameOverlap     154
Internal:Known                25
Novel                        329
Novel:CDSFrameOverlap       2100
Novel:Known                 1558
Truncated                   5029
Truncated:Known              508
Name: Symbol, dtype: int64

In [19]:
pd.read_csv(os.path.join('/lab/barcheese01/smaffa/filtered_tis_data', 'HeLa_TIS_alt_filtered2.csv')).shape

(10114, 21)

In [4]:
tis_df.groupby('Symbol').nunique()['GenomePos'].sort_values(ascending=False).head(5)

Symbol
AHNAK     35
MAP4      30
KYNU      28
POLR1A    27
ITPR3     26
Name: GenomePos, dtype: int64

In [5]:
alt_tis_df.groupby('Symbol').nunique()['GenomePos'].sort_values(ascending=False).head(5)

Symbol
LDHA    14
TFG     12
CLU     12
TMX2    11
ASPH    10
Name: GenomePos, dtype: int64

In [70]:
tis_df[tis_df['Symbol'] == 'MAP4'].sort_values('GenomePos')

Unnamed: 0,Gid,Tid,Symbol,StartCodon,Start,TisType,TISGroup,TISCounts,TISQvalue,GenomePos,Seq,AASeq,feature_type,strand,frame,gene_type,transcript_type,transcript_support_level,MANE_Select
16854,ENSG00000047849.23,ENST00000395734.8,MAP4,AAG,2648,Truncated,5,21,1.0,chr3:47852791-47871239:-,AAGTCTGTACCAGCTGACTTGAGTCGCCCAAAGAGCACCTCCACCA...,KSVPADLSRPKSTSTSSMKKTTTLSGTAPAAGVVPSRVKATPMPSR...,transcript,-,.,protein_coding,protein_coding,2.0,False
16850,ENSG00000047849.23,ENST00000395734.8,MAP4,AAG,2498,Truncated,5,32,1.0,chr3:47852791-47872019:-,AAGAGCACTCAGACTGTTGCAAAAACCACAACAGCTGCTGCTGTTG...,KSTQTVAKTTTAAAVASTGPSSRSPSTLLPKKPTAIKTEGKPAEVK...,transcript,-,.,protein_coding,protein_coding,2.0,False
16853,ENSG00000047849.23,ENST00000395734.8,MAP4,AAG,2225,Truncated,5,21,1.0,chr3:47852791-47875876:-,AAGACTTCAACATCGAAAGCCAAAACACAGCCCACTTCTCTCCCTA...,KTSTSKAKTQPTSLPKQPAPTTIGGLNKKPMSLASGLVPAAPPKRP...,transcript,-,.,protein_coding,protein_coding,2.0,False
16852,ENSG00000047849.23,ENST00000395734.8,MAP4,GTG,1802,Truncated,5,23,1.0,chr3:47852791-47916119:-,GTGACTCCAGCCAAAGATGTTCCACCACTCTCAGAAACAGAGGCAA...,VTPAKDVPPLSETEATPVPIKDMEIAQTQKGISEDSHLESLQDVGQ...,transcript,-,.,protein_coding,protein_coding,2.0,False
16851,ENSG00000047849.23,ENST00000395734.8,MAP4,AAG,1316,Truncated,5,23,1.0,chr3:47852791-47916605:-,AAGGATTTGGTATTACTCTCAGAAATAGAGGTGGCACAGGCTAATG...,KDLVLLSEIEVAQANDIISSTEISSAEKVALSSETEVALARDMTLP...,transcript,-,.,protein_coding,protein_coding,2.0,False
16849,ENSG00000047849.23,ENST00000395734.8,MAP4,ATG,95,Annotated,5,91,0.01346053,chr3:47852791-47998860:-,ATGGCTGACCTCAGTCTTGCAGATGCATTAACAGAACCATCTCCAG...,MADLSLADALTEPSPDIEGEIKRDFIATLEAEAFDDVVGETVGKTD...,transcript,-,.,protein_coding,protein_coding,2.0,False
24685,ENSG00000047849.23,ENST00000683076.1,MAP4,AAG,6726,Truncated,4,17,1.0,chr3:47852933-47855319:-,AAGGAGAAGGCCCAGGCCAAGGTGGGATCCCTCGATAATGTGGGCC...,KEKAQAKVGSLDNVGHLPAGGAVKTEGGGSEAPLCPGPPAGEEPAI...,transcript,-,.,protein_coding,protein_coding,,True
24678,ENSG00000047849.23,ENST00000683076.1,MAP4,AAG,6567,Truncated,4,37,0.2341482,chr3:47852933-47867281:-,AAGGACAATATTAAGCATGTCCCTGGAGGTGGTAATGTTCAGATTC...,KDNIKHVPGGGNVQIQNKKVDISKVSSKCGSKANIKHKPGGGDVKI...,transcript,-,.,protein_coding,protein_coding,,True
24687,ENSG00000047849.23,ENST00000683076.1,MAP4,AAG,6555,Truncated,4,13,1.0,chr3:47852933-47867293:-,AAGTGTGGTTCCAAGGACAATATTAAGCATGTCCCTGGAGGTGGTA...,KCGSKDNIKHVPGGGNVQIQNKKVDISKVSSKCGSKANIKHKPGGG...,transcript,-,.,protein_coding,protein_coding,,True
24682,ENSG00000047849.23,ENST00000683076.1,MAP4,AAG,6090,Truncated,4,21,1.0,chr3:47852933-47871239:-,AAGTCTGTACCAGCTGACTTGAGTCGCCCAAAGAGCACCTCCACCA...,KSVPADLSRPKSTSTSSMKKTTTLSGTAPAAGVVPSRVKATPMPSR...,transcript,-,.,protein_coding,protein_coding,,True


In [None]:
"""
IGV examples: 
DNM1L => chr12:32677451-32759074 (around start & gene body)
AHNAK => chr11:62516743-62536098 (majority of the gene body)
MAP4 => chr3:47853031-48088870 (majority of the gene body)
ASPH => chr8:61503358-61714371 (around start)

Validated real:
CDC20
AURKAIP1
TRMT1
UBE2M
"""

'\nIGV examples: \nDNM1L => chr12:32,677,451-32,759,074\nAHNAK => chr11:62,516,743-62,536,098\nMAP4 => chr3:47853031-48088870\nASPH => chr8:61503358-61714371\n'

In [14]:
alt_tis_df[alt_tis_df['Gid'] == 'ENSG00000198363.19'].sort_values('GenomePos')

Unnamed: 0,Gid,Tid,Symbol,StartCodon,Start,TisType,TISGroup,TISCounts,TISQvalue,count_diff,...,GenomePos,Seq,AASeq,feature_type,strand,frame,gene_type,transcript_type,transcript_support_level,MANE_Select
6126,ENSG00000198363.19,ENST00000379454.9,ASPH,AAG,233,Truncated,4,81,0.0001968915,81,...,chr8:61503358-61714359:-,AAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCAGCGGCTCCGGCA...,KNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRKGGLS...,transcript,-,.,protein_coding,protein_coding,1.0,True
6125,ENSG00000198363.19,ENST00000379454.9,ASPH,ATG,221,Annotated,4,334,8.257402e-23,334,...,chr8:61503358-61714371:-,ATGGCCCAGCGTAAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCA...,MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRK...,transcript,-,.,protein_coding,protein_coding,1.0,True
6127,ENSG00000198363.19,ENST00000379454.9,ASPH,CTG,176,Extended,4,151,1.867372e-09,151,...,chr8:61503358-61714416:-,CTGAAGGAGGTCCGCCAGCCCTCACCAGCCCCCGCGGACCGTGCAA...,LKEVRQPSPAPADRAMAQRKNAKSSGNSSSSGSGSGSTSAGSSSPG...,transcript,-,.,protein_coding,protein_coding,1.0,True
4470,ENSG00000198363.19,ENST00000356457.9,ASPH,ATG,225,Annotated,5,334,6.170442e-13,334,...,chr8:61626272-61714371:-,ATGGCCCAGCGTAAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCA...,MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRK...,transcript,-,.,protein_coding,protein_coding,1.0,False
7783,ENSG00000198363.19,ENST00000445642.6,ASPH,ATG,269,Annotated,5,334,6.170442e-13,334,...,chr8:61626272-61714371:-,ATGGCCCAGCGTAAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCA...,MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRK...,transcript,-,.,protein_coding,protein_coding,2.0,False
8660,ENSG00000198363.19,ENST00000518068.5,ASPH,ATG,197,Annotated,5,334,6.170442e-13,334,...,chr8:61626272-61714371:-,ATGGCCCAGCGTAAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCA...,MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRK...,transcript,-,.,protein_coding,protein_coding,1.0,False
4471,ENSG00000198363.19,ENST00000356457.9,ASPH,CTG,180,Extended,5,151,4.078146e-05,151,...,chr8:61626272-61714416:-,CTGAAGGAGGTCCGCCAGCCCTCACCAGCCCCCGCGGACCGTGCAA...,LKEVRQPSPAPADRAMAQRKNAKSSGNSSSSGSGSGSTSAGSSSPG...,transcript,-,.,protein_coding,protein_coding,1.0,False
7784,ENSG00000198363.19,ENST00000445642.6,ASPH,CTG,224,Extended,5,151,4.078146e-05,151,...,chr8:61626272-61714416:-,CTGAAGGAGGTCCGCCAGCCCTCACCAGCCCCCGCGGACCGTGCAA...,LKEVRQPSPAPADRAMAQRKNAKSSGNSSSSGSGSGSTSAGSSSPG...,transcript,-,.,protein_coding,protein_coding,2.0,False
8661,ENSG00000198363.19,ENST00000518068.5,ASPH,CTG,152,Extended,5,151,4.078146e-05,151,...,chr8:61626272-61714416:-,CTGAAGGAGGTCCGCCAGCCCTCACCAGCCCCCGCGGACCGTGCAA...,LKEVRQPSPAPADRAMAQRKNAKSSGNSSSSGSGSGSTSAGSSSPG...,transcript,-,.,protein_coding,protein_coding,1.0,False
8652,ENSG00000198363.19,ENST00000517856.5,ASPH,AAG,228,Truncated,2,81,5.606415e-11,81,...,chr8:61661906-61714359:-,AAGAATGCCAAGAGCAGCGGCAACAGCAGCAGCAGCGGCTCCGGCA...,KNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRKGGLS...,transcript,-,.,protein_coding,protein_coding,2.0,False


In [6]:
tis_df[tis_df['Symbol'] == 'TRNT1'].sort_values('GenomePos')

Unnamed: 0,Gid,Tid,Symbol,StartCodon,Start,TisType,TISGroup,TISCounts,TISQvalue,GenomePos,Seq,AASeq,feature_type,strand,frame,gene_type,transcript_type,transcript_support_level,MANE_Select
1933,ENSG00000072756.19,ENST00000251607.11,TRNT1,ATG,78,Annotated,1,20,0.003329,chr3:3129040-3148154:+,ATGCTGAGGTGCCTGTATCATTGGCACAGGCCAGTGCTGAACCGTA...,MLRCLYHWHRPVLNRRWSRLCLPKQYLFTMKLQSPEFQSLFTEGLK...,transcript,+,.,protein_coding,protein_coding,1.0,True
1934,ENSG00000072756.19,ENST00000251607.11,TRNT1,ATG,165,Truncated,1,20,0.003329,chr3:3129127-3148154:+,ATGAAGTTGCAGTCTCCCGAATTCCAGTCACTTTTCACAGAAGGAC...,MKLQSPEFQSLFTEGLKSLTELFVKENHELRIAGGAVRDLLNGVKP...,transcript,+,.,protein_coding,protein_coding,1.0,True


In [1]:
# standard data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

# convenience
from tqdm import tqdm

In [34]:
replicate_df = pd.read_csv('ribotish_replicate_manifest.csv')
sample_df = pd.read_csv('ribotish_sample_manifest.csv')
experiment_table = sample_df.merge(
    replicate_df[
        replicate_df['condition'] == 'TIS'
    ].groupby('sample').apply(lambda x: list(x['bam_qc_file'])).rename('bam_qc_file'), left_on='sample', right_index=True
).set_index('sample')

In [40]:
replicate_df.dropna(subset=['predict_file'])

Unnamed: 0,sample,replicate,condition,bam_qc_file,offset_file,predict_file,filtered_file,dropped_file
1,HeLa,rep1,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/He...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
3,HeLa,rep2,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/He...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
5,K562,rep1,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/K5...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
7,K562,rep2,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/K5...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
9,RPE1_Async,rep1,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
11,RPE1_Async,rep2,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
13,RPE1_Que,rep1,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
15,RPE1_Que,rep2,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
17,RPE1_Sen,rep1,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...
19,RPE1_Sen,rep2,TIS,/lab/cheeseman_sequencing/Jimmy_HTPdata/Riboso...,/lab/barcheese01/aTIS_data/ribosome_profiling/...,/lab/barcheese01/aTIS_data/ribotish/per_rep/RP...,/lab/barcheese01/smaffa/filtered_tis_data/per_...,/lab/barcheese01/smaffa/filtered_tis_data/per_...


In [25]:
sample_to_filtered_tis = dict()
for sample in experiment_table.index.tolist():
    pred_table = pd.read_csv(experiment_table.loc[sample, 'predict_file'], sep='\t', usecols=['Tid', 'GenomePos', 'Start', 'AASeq'])
    filt_table = pd.read_csv(experiment_table.loc[sample, 'filtered_file'])
    sample_to_filtered_tis[sample] = filt_table.merge(pred_table, left_on=['Tid', 'GenomePos', 'Start'], right_on=['Tid', 'GenomePos', 'Start'], how='left')

In [27]:
all_tis_df= pd.concat(sample_to_filtered_tis, axis=0).reset_index(names=['Sample', 'drop']).drop(['drop'], axis=1)

In [None]:
all_tis_df.to_csv('/lab/barcheese01/smaffa/filtered_tis_data/all_samples_filtered.csv', index=False)

In [33]:
experiment_table['dropped_file']

sample
HeLa          /lab/barcheese01/smaffa/latest_filtered_tis_da...
K562          /lab/barcheese01/smaffa/latest_filtered_tis_da...
RPE1_Async    /lab/barcheese01/smaffa/latest_filtered_tis_da...
RPE1_Que      /lab/barcheese01/smaffa/latest_filtered_tis_da...
RPE1_Sen      /lab/barcheese01/smaffa/latest_filtered_tis_da...
U2OS          /lab/barcheese01/smaffa/latest_filtered_tis_da...
Name: dropped_file, dtype: str