In [1]:
import pandas as pd
import numpy as np
import re
import argparse


def fix_names(countdata):
    '''
    This fixes the name specific issues in the NF54GFPxNHP4026 cross.
    The names have changed several times and been recorded in different
    formats, so the column labels are normalised here.

    Parameters:
        countdata: DataFrame whose COLUMN labels are sample names
                   (all labels must be strings).

    Returns:
        A copy of countdata with the column labels rewritten. The input
        frame is left untouched and the column dtypes are preserved
        (the labels are edited directly rather than transposing the
        table twice, which turned mixed-dtype tables into object dtype).
    '''
    # Applied strictly in this order; later rules depend on earlier ones
    # (e.g. '/' and '.' are removed before the 4026 / clone-name rules run).
    # Raw strings keep the regex escapes out of Python's own escape handling.
    substitutions = [
        (r'\/', ''),
        (r'ND5A5', 'AC075'),
        (r'ND6G8', 'AC125'),
        (r'N1', ''),
        (r'\.', ''),
        (r'_4026', '_NHP4026'),
        (r'^4026', 'NHP4026'),
        (r'2H9', 'AC030'),
        (r'6E5', 'AC033'),
    ]

    fixed_labels = countdata.columns
    for pattern, replacement in substitutions:
        fixed_labels = fixed_labels.str.replace(pattern, replacement, regex=True)

    renamed = countdata.copy()
    renamed.columns = fixed_labels
    return renamed
   


def pick_best_reps(countdata):
    '''
    Keep one technical replicate per strain/time point: the one with the
    most reads (the approach the original author attributes to the GTEx
    Consortium). Note that this contains a pile of regular expressions
    for name handling specific to the formats used in this cross.

    Parameters:
        countdata: DataFrame with genes as rows. The first six columns
                   are treated as annotation (this offset is hard-coded);
                   every later column is a sample of read counts whose
                   label is a string sample name.

    Returns:
        DataFrame with the columns named Geneid, Chr, Start, End, Strand
        and Length (where present) plus, for every distinct
        "strain_timepoint" key, the sample column with the largest total
        count. Original column order is kept. If there are no sample
        columns, only the annotation columns are returned.
    '''
    annotation_cols = ['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length']

    sample_names = list(countdata.columns[6:])
    if not sample_names:
        # Nothing to choose between; avoid building an empty summary frame.
        return countdata.loc[:, countdata.columns.isin(annotation_cols)]

    # Total reads per sample. to_numeric guards against object-dtype
    # columns (e.g. after a transpose round trip upstream).
    totals = pd.to_numeric(countdata.iloc[:, 6:].sum(axis=0))
    count_summary = pd.DataFrame({'Sample Name': sample_names,
                                  'Total Counts': totals.to_numpy()})

    # Collapse each sample name to strain_##, where ## is the sampling
    # time point. The rules are order-dependent and are kept exactly as
    # they were originally written.
    # NOTE(review): '_[d]+$' matches literal 'd' characters; it may have
    # been meant as '_[\d]+$'. Left unchanged because altering it would
    # change which samples are grouped together.
    # NOTE(review): the commas inside '[a,b,c]' / '[A,B,C]' are literal
    # members of the character class.
    substitutions = [
        (r'^GF_PL[\d]+[a,b,c]{0,1}_', ''),
        (r'[A,B,C]_', ''),
        (r'_[d]+$', ''),
        (r'_S.*', ''),
        (r'_[0-9]{3,4}$', ''),
        (r'hpi', ''),
        (r'_T', '_'),
    ]
    strain_and_time = count_summary['Sample Name']
    for pattern, replacement in substitutions:
        strain_and_time = strain_and_time.str.replace(pattern, replacement,
                                                      regex=True)
    count_summary['Strain and Time'] = strain_and_time

    # Row label of the highest-count replicate within each group
    # (ties resolve to the first occurrence).
    best_rows = (count_summary
                 .groupby('Strain and Time', sort=False)['Total Counts']
                 .idxmax())
    best_samplenames = annotation_cols + list(
        count_summary.loc[best_rows.to_numpy(), 'Sample Name'])

    return countdata.loc[:, countdata.columns.isin(best_samplenames)]




def split_times(rawcounts):
    '''
    Split a count table into one table per sampling time point.

    The study sampled at 4hpi, 30hpi and 44hpi, and those three time
    points are hard-coded here, so this is not suitable for all crosses.
    A sample column belongs to a time point when its label contains
    either the "T##_" or the "_##hpi" form of that time point.

    Parameters:
        rawcounts: DataFrame with leading annotation columns followed by
                   sample columns.

    Returns:
        A list [T4, T30, T44]. Each element holds the first five columns
        of rawcounts joined (on the row index) to the sample columns
        matched for that time point.
    '''
    time_patterns = ('T4_|_4hpi', 'T30_|_30hpi', 'T44_|_44hpi')

    per_timepoint = []
    for pattern in time_patterns:
        matched_samples = rawcounts.filter(regex=pattern)
        leading_columns = rawcounts.iloc[:, 0:5]
        per_timepoint.append(
            leading_columns.merge(matched_samples,
                                  left_index=True,
                                  right_index=True))

    return per_timepoint
    



def build_covariates(metadata):
    '''
    One-hot encode the plate each sample was run on.

    Samples were run across different plates, and plate is the only
    technical variable that can be accounted for directly (sampling
    batch is confounded with stage, so CRC and PEER have to deal with
    that). PEER needs the plate as one-hot encoded columns.

    Parameters:
        metadata: DataFrame with a 'PlateID' column.

    Returns:
        A new DataFrame: metadata joined on its index to one indicator
        column per distinct PlateID value, with the 'PlateID' column
        itself removed. The input frame is not modified.
    '''
    plate_indicators = pd.get_dummies(metadata['PlateID'])
    with_indicators = metadata.merge(plate_indicators,
                                     left_index=True,
                                     right_index=True)
    return with_indicators.drop(columns='PlateID')
    




def fix_vcf(vcf_file, progenydata, vcf_out):
    '''
    The vcf file contains TX versions of strain names; this
    function swaps them with the correct version.

    Parameters:
        vcf_file:    path of the VCF to read.
        progenydata: DataFrame whose second column holds the names found
                     in the VCF and whose first column holds the names
                     to write instead (both as strings).
        vcf_out:     path of the corrected VCF to write.

    Returns:
        None. The result is written to vcf_out.

    Notes:
        * Names are matched as literal text, not as regular expressions,
          so characters such as '.' in a name only match themselves.
        * progenydata is only read; its index is no longer overwritten.
        * Replacements are applied to every line, one name after another,
          anywhere the old name occurs as a substring.
          NOTE(review): a name that is a prefix of another name, or a new
          name that equals another old name, will interact - verify the
          naming scheme rules this out.
    '''
    # Map "name in VCF" -> "correct name"; later duplicates win.
    name_map = dict(zip(progenydata.iloc[:, 1], progenydata.iloc[:, 0]))

    # Context managers close both files even if a write/replace fails.
    with open(vcf_file, 'r') as vcf, open(vcf_out, 'w') as vcfout:
        for line in vcf:
            for old_name, new_name in name_map.items():
                line = line.replace(old_name, new_name)
            vcfout.write(line)

def strip_names(countdata):
    '''
    This function strips the long sample names from the
    count data down to their strain only. NOTE: You really
    don't want to use this until after you've built the metadata
    file, as you need the full sample names to get the correct
    Plate Number for batch correction.

    Parameters:
        countdata: DataFrame whose column labels are (string) sample
                   names.

    Returns:
        The same DataFrame object; its column labels are rewritten
        IN PLACE. Each label has a leading "GF<digits>_" prefix, any
        "PL<digits/lowercase>_" plate tag and any "A_"/"B_"/"C_" tag
        removed, and is then cut at the first remaining underscore.
        The column axis keeps its original name (previously it ended up
        named 0 as a by-product of the partition/to_frame round trip).
    '''
    # Raw strings so that \d reaches the regex engine untouched.
    labels = countdata.columns
    labels = labels.str.replace(r'^GF[\d]*_', '', regex=True)
    labels = labels.str.replace(r'PL[\da-z]*_', '', regex=True)
    labels = labels.str.replace(r'[A-C]{1}_', '', regex=True)
    # Keep only the text before the first underscore (the strain).
    labels = labels.str.split('_', n=1).str[0]

    countdata.columns = labels
    return countdata

def to_gct(file, expdata):
    '''
    Write expdata to a GCT-style, tab-separated text file.

    The file starts with two header lines - the literal version line
    "1.0" and a "<rows>\\t<columns>" line giving the shape of expdata -
    followed by the table itself (column header included, row index
    omitted).

    NOTE(review): the published GCT format uses a "#1.2" version line
    and counts only data columns; confirm the downstream tool accepts
    what is written here.

    Parameters:
        file:    path of the file to create (overwritten if it exists).
        expdata: DataFrame to write.

    Returns:
        None.
    '''
    n_rows, n_cols = expdata.shape

    # Header lines first; the context manager closes the handle even if
    # a write fails, so the append below never races an open handle.
    with open(file, 'w') as outfile:
        outfile.write('1.0\n')
        outfile.write(f'{n_rows}\t{n_cols}\n')

    expdata.to_csv(file,
                   mode='a',
                   header=True,
                   sep='\t',
                   encoding='utf-8',
                   index=False)

########################################################
#
# Main block- parses all files in one pass, if you're
# feeling lazy
#
########################################################





In [74]:
# Load the tab-separated raw count table.
counts = pd.read_csv(f'GENE2.count', sep = '\t')
# Normalise the sample names, then keep one replicate per strain/time point.
counts = fix_names(counts)
counts = pick_best_reps(counts)
# split_times returns [T4, T30, T44]; element 0 is the 4hpi table, so despite
# the plural name `timepoints` holds a single DataFrame here.
timepoints = split_times(counts)[0]
# Save the six leading (annotation) columns of the curated table.
probeinfo = counts.iloc[:,0:6]
probeinfo.to_csv(f'test_probeinfo.csv')

In [75]:
# Sample metadata: headerless, tab-separated, first column used as the row index.
metadata = pd.read_csv(f'GENE2_NAME.txt', sep = '\t', header = None, index_col = 0)
metadata.columns = ['PlateID', 'Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same']
# One-hot encode PlateID, then keep only the resulting plate indicator columns.
covariates = build_covariates(metadata)
covariates.drop(columns = ['Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same'], inplace = True)
# Remove the index name.
covariates.rename_axis(None, inplace = True)
# fix_names rewrites COLUMN labels, so transpose to put the sample names on
# the columns, fix them, and transpose back.
covariates = fix_names(covariates.T).T
# Bare expression: displays the frame when run as a notebook cell.
covariates

Unnamed: 0,PL01,PL02,PL03,PL04,PL05,PL05a,PL05b
AC030_T30_S97_L00,0,0,0,0,0,0,0
AC030_T44_S98_L00,0,0,0,0,0,0,0
AC030_T4_S96_L00,0,0,0,0,0,0,0
NHP4026_T30_S103_L00,0,0,0,0,0,0,0
NHP4026_T44_S104_L00,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
AC075_T44_S112_L00,0,0,0,0,0,0,0
AC075_T4_S110_L00,0,0,0,0,0,0,0
AC125_T30_S114_L00,0,0,0,0,0,0,0
AC125_T44_S115_L00,0,0,0,0,0,0,0


In [76]:
# NOTE: `timepoint` is the same object as `timepoints` (no copy is made), so
# the in-place drops and the renaming done by strip_names below also change
# `timepoints`.
timepoint = timepoints
# Covariate rows for the samples present in this time point's table; transposed
# so strip_names (which works on column labels) can shorten the sample names.
metasub = covariates[covariates.index.isin(timepoint.columns)].T
metasub = strip_names(metasub).T
# NOTE: we have RNAseq for this parasite, but no genome seq
metasub.drop(index = 'AC081', inplace = True)
#metasub.to_csv(f'{args.data_path}{timepoint_times[i]}batchcov.csv')
# Reduce the count table to the remaining leading column(s) plus samples.
timepoint.drop(columns = ['Start', 'End', 'Strand', 'Chr'], inplace = True)
timepoint = strip_names(timepoint)
timepoint.drop(columns = 'AC081', inplace = True)
#timepoint.to_csv(f'{args.data_path}{timepoint_times[i]}counts.txt',
#        sep = '\t',
#        index = False)
# Bare expression: displays the frame when run as a notebook cell.
metasub

Unnamed: 0_level_0,PL01,PL02,PL03,PL04,PL05,PL05a,PL05b
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AC033,0,0,0,0,0,0,0
AC004,0,1,0,0,0,0,0
AC006,0,1,0,0,0,0,0
AC007,0,1,0,0,0,0,0
AC008,0,1,0,0,0,0,0
AC025,0,1,0,0,0,0,0
AC027,0,1,0,0,0,0,0
AC028,0,1,0,0,0,0,0
AC034,0,1,0,0,0,0,0
AC049,0,1,0,0,0,0,0


In [10]:
import pandas as pd
import pfal_genomeandanno_prep as pfal
import os
import argparse
import numpy as np

def sample_and_geneqc(expdata, *, min_count=5, min_genes=3000,
                      min_sample_fraction=0.2):
    '''
    Remove poor samples and poor genes, and zero out low read counts.

    Steps (with the default thresholds):
      1. Any gene/sample count below 5 is set to 0.
      2. A gene is kept when it is non-zero in strictly more than 20% of
         ALL samples (the fraction is taken before samples are filtered).
      3. A sample is kept when strictly more than 3000 genes are
         non-zero in it (counted before genes are filtered).

    Parameters:
        expdata:             DataFrame of counts, genes as rows and
                             samples as columns.
        min_count:           counts below this value are zeroed.
        min_genes:           a sample needs more than this many detected
                             genes to be kept.
        min_sample_fraction: a gene needs to be detected in more than
                             this fraction of samples to be kept.

    Returns:
        The thresholded counts restricted to the kept genes and samples.
    '''
    # Zero out low counts first so they do not count as "detected" below.
    thresholded = expdata.mask(expdata < min_count, 0)

    # Number of samples each gene is detected in / genes detected per sample.
    samples_per_gene = pd.Series(np.count_nonzero(thresholded, axis=1),
                                 index=thresholded.index)
    genes_per_sample = pd.Series(np.count_nonzero(thresholded, axis=0),
                                 index=thresholded.columns)

    n_samples = genes_per_sample.size
    goodgenes = samples_per_gene[
        samples_per_gene / n_samples > min_sample_fraction]
    goodsamples = genes_per_sample[genes_per_sample > min_genes]

    return thresholded.loc[goodgenes.index, goodsamples.index]


In [11]:
# Read the tab-separated T4 count table, using its first column as the row index.
expdata = pd.read_csv('T4_counts.txt', sep = '\t', index_col = 0)
# Apply the low-count / poor-sample / poor-gene filters.
test = sample_and_geneqc(expdata)
# Bare expression: displays the frame when run as a notebook cell.
test

Unnamed: 0_level_0,AC033,AC004,AC006,AC008,AC027,AC028,AC034,AC049,AC050,AC056,...,AC103,AC109,AC118,AC038,AC082,AC030,AC032,AC074,AC130,NF54gfp
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
malmito_rna_LSUG:rRNA,0,7,5,0,0,0,6,0,0,5,...,5,5,15,0,6,5,12,0,0,10
malmito_SSUB:rRNA,0,5,0,0,0,0,0,0,0,0,...,0,8,5,0,0,0,9,0,0,0
malmito_rna_1:rRNA,0,23,24,7,18,9,16,0,7,13,...,20,22,25,21,21,29,88,21,16,26
malmito_rna_10:rRNA,0,23,38,14,26,10,40,6,23,31,...,29,22,29,47,48,38,130,31,29,44
mal_mito_1,18,11,21,12,16,11,25,14,23,18,...,16,11,40,13,30,19,10,8,5,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PF3D7_1478600,11,15,21,0,32,24,28,19,9,15,...,15,23,10,26,14,29,28,47,14,22
PF3D7_1478800,0,20,44,37,103,118,121,50,64,156,...,69,59,43,49,56,95,116,74,72,69
PF3D7_1478900,127,209,199,101,318,107,563,168,404,131,...,190,248,177,867,345,231,170,232,96,466
PF3D7_1479000,0,11,12,74,7,18,287,111,158,14,...,73,217,20,183,87,139,11,225,114,320
