In [None]:
# NOTES
# Removed Il14H	Il14H	SRR5911768	Kern
# Removed CML52	CML52	SRR5911880	Shoot

In [1]:
import os
import pandas as pd

# Transcriptomic analysis of 3' RNA-seq data

The trannscriptomic 'cDNA' files produced for this analysis are based on:\
https://github.com/eporetsky/bioinformatics-assortment/blob/master/RNAseq/3prime/fast3p_extender.ipynb \

Briefly, I used the primary/canonical transcript GFF3 file to get the coordinates for the CDS and UTRs.\
For the 3' UTRs, the originals were kept as is unless they were shorter than 500bp, then they were\
extended to 500bp. The original/extended UTRs were combined with the last 500bp of the CDS sequence\
to generate the final transcript file that was used to generate an index file.\
I did not take into account overlapping extensions with genes. I might consider this in the future\
but if the 3' UTR of your gene of interest is very close to the 3' UTR of another gene (<1000bp)\
the interpretation might be wrong. I might provide some Kallisto pseudobams to assess results.

There are other tools that might provide more accurate results (https://github.com/ctlab/quant3p) \
but the following analysis was meant to keep methods simple, customizable and easily repeatable.

In [85]:
df = pd.read_csv("Kallisto_names.txt", header=0, sep="\t")
df.head()

Unnamed: 0,table,index,run_accession,Renamed
0,B73v4,B73v4,SRR5909740,Root
1,B73v4,B73v4,SRR5909736,Shoot
2,B73v4,B73v4,SRR5909632,Kern
3,B73v4,B73v4,SRR5909706,Base
4,B73v4,B73v4,SRR5909674,Tip


In [10]:
# Iterate over every row of the df and run kallisto for each row
# The output folder names will contain all the information to build final tables
# For now each folder is named as: table name, index name and condition name
# In the future my replace index name with column name and use index name internally
#for row in df.iterrows():
for row in df[df["table"]=="CML103"].iterrows():
    row = list(row[1])
    table, index, treatment, file_name = row[0], row[1], row[3], row[2]
    os.system("""kallisto quant --single -l 90 -s 1 -t 32 --plaintext \
                 -i Kallisto/{0}.idx -o kallisto_results/{1}_{2}_{3}/ {4}.fp.fastq.gz """.format(index, table, index, treatment, file_name))


[quant] fragment length distribution is truncated gaussian with mean = 90, sd = 1
[index] k-mer length: 31
[index] number of targets: 40,013
[index] number of k-mers: 40,291,562
[index] number of equivalence classes: 122,917
[quant] running in single-end mode
[quant] will process file 1: SRR5911415.fp.fastq.gz
[quant] finding pseudoalignments for the reads ... done
[quant] processed 4,041,021 reads, 3,197,124 reads pseudoaligned
[   em] quantifying the abundances ... done
[   em] the Expectation-Maximization algorithm ran for 553 rounds


[quant] fragment length distribution is truncated gaussian with mean = 90, sd = 1
[index] k-mer length: 31
[index] number of targets: 40,013
[index] number of k-mers: 40,291,562
[index] number of equivalence classes: 122,917
[quant] running in single-end mode
[quant] will process file 1: SRR5911414.fp.fastq.gz
[quant] finding pseudoalignments for the reads ... done
[quant] processed 4,784,000 reads, 3,807,933 reads pseudoaligned
[   em] quantifying t

['HP301',
 'Oh43',
 'Ms71',
 'NC350',
 'CML322',
 'Ki11',
 'Ki3',
 'P39',
 'Tx303',
 'B73',
 'M37W',
 'CML103',
 'CML333',
 'W22',
 'Oh7B',
 'Ky21',
 'NC358',
 'B73v4',
 'CML247',
 'M162W',
 'CML228',
 'B97',
 'CML277',
 'CML69']

In [92]:
NAM_list = ['B73', 'Il14H','HP301','Oh43','Ms71','NC350','CML322','Ki11','Ki3','P39','Tx303','CML52','M37W','CML103','CML333','Oh7B','Ky21','NC358','B73v4','CML247','M162W', 'CML228', 'B97','CML277','CML69']

In [93]:
# The tsv file contains the pre-defined table names
# Iterate over the unique table names and find them in the Kallisto output folder names
# Use folder name list and if table name is not in folder list item skip to next folder
# Start with iterating over unique table names because it is easier to build dataframe
complete_df = pd.DataFrame()
output_folders = os.listdir('kallisto_results')
#for table in set(list(df["table"])):
for table in NAM_list:
#for table in set(list(df["table"])):
    first = True # Build a dataframe using only the first dataset. All others will merge to this
    col_list = [] # After all results are merged, rename the columns

    # Iterate over all the folders in the Kallisto output list
    for folder_name in output_folders:
        
        # The first part of the folder name is the table name. I
        fname = folder_name.split("_")
        if fname[0] != table:
            continue
        
        # If first part of folder name matches table name then merge results 
        
        # Might change this scheme but for now if table and index name are equal then don't include the 
        # genotype in the final column name. Will probably add explicit column name in the future
        if fname[0] == fname[1]:
            col_list.append(fname[2])
        else:
            col_list.append(fname[1]+"_"+fname[2])
        
        # If 
        if first:
            tpm = pd.read_csv("kallisto_results/"+folder_name+"/abundance.tsv", sep="\t", header=0, index_col=0)["tpm"]
            first = False
        else:
            tpm = pd.merge(tpm, 
                       pd.read_csv("kallisto_results/"+folder_name+"/abundance.tsv", sep="\t", header=0, index_col=0)["tpm"], 
                       left_index=True, right_index=True)
    tpm.columns = col_list
    tpm.to_csv("TPM_tables/"+table+".tpm.tsv", sep="\t")
    complete_df = pd.concat([complete_df, tpm]) 


  tpm = pd.merge(tpm,


In [98]:
complete_df.drop(0, axis=1).to_csv("TPM_tables/PanNAM.tpm.tsv", sep="\t")