In [1]:
#!/usr/bin/env python
# coding: utf-8



######################################################################
#
# Name: NF54xNHP4026 Specific Data Curation
# Author: Gabe Foster
# Date: 6/1/2020
# Purpose: This script performs the initial preprocessing of the 
# NF54HT-GFP-luc x NHP4026 transcriptional cross section- this
# is the curation that is unique to this cross
#
######################################################################

import pandas as pd
import numpy as np
import re
import argparse




In [2]:
def fix_names(countdata):
    '''
    Normalize the sample names used as column labels in the
    NF54GFPxNHP4026 cross. The names have changed several times and
    been recorded in different formats, so every known variant is
    rewritten to a single spelling here.

    Parameters
    ----------
    countdata : pandas.DataFrame
        Frame whose column labels are strings (sample names, plus any
        annotation columns).

    Returns
    -------
    pandas.DataFrame
        A copy of countdata with relabeled columns. The input frame is
        not modified, and the column dtypes are kept (the labels are
        rewritten directly rather than via a transpose round trip,
        which would turn a mixed-dtype frame into all-object columns).
    '''
    # Ordered (regex, replacement) pairs. Order matters: e.g. '/' must
    # be removed before the 'ND5A5' rule can see names written 'ND5/A5'.
    renames = [
        ('/', ''),
        ('ND5A5', 'AC075'),
        ('ND6G8', 'AC125'),
        ('N1', ''),
        (r'\.', ''),
        ('_4026', '_NHP4026'),
        ('^4026', 'NHP4026'),
        ('2H9', 'AC030'),
        ('6E5', 'AC033'),
    ]

    labels = countdata.columns
    for pattern, replacement in renames:
        labels = labels.str.replace(pattern, replacement, regex = True)

    renamed = countdata.copy()
    renamed.columns = labels
    return renamed

In [3]:
   


def pick_best_reps(countdata):
    '''
    Keep one technical replicate per strain/time point: the one with
    the most reads. The GTEx Consortium handles TECHNICAL replicates
    (not BIOLOGICAL replicates) the same way. The regular expressions
    below are specific to the sample-name formats used in this cross.

    Parameters
    ----------
    countdata : pandas.DataFrame
        Count table whose first six columns are gene annotation
        (Geneid, Chr, Start, End, Strand, Length) and whose remaining
        columns are per-sample counts labeled with string sample names.

    Returns
    -------
    pandas.DataFrame
        The columns of countdata (in their original order) that are
        either annotation columns or the highest-total replicate of
        their strain/time group. Ties keep the first replicate. If
        there are no sample columns, only annotation columns are
        returned.
    '''
    annotation_cols = ['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length']

    # Sample columns are addressed by position so that duplicated
    # labels are still summed one column at a time.
    sample_positions = range(len(annotation_cols), countdata.shape[1])
    if len(sample_positions) == 0:
        return countdata.loc[:, countdata.columns.isin(annotation_cols)]

    count_summary = pd.DataFrame({
        'Sample Name': [countdata.columns[pos] for pos in sample_positions],
        'Total Counts': [countdata.iloc[:, pos].sum() for pos in sample_positions],
    })

    # Reduce each sample name to strain_##, where ## is the sampling
    # time point. The formats are inconsistent, so the substitutions
    # are applied in this exact order.
    # NOTE(review): '_[d]+$' matches literal 'd' characters; '\d' was
    # probably intended. Left unchanged because '_[0-9]{3,4}$' below
    # already strips the numeric suffixes seen in this data set.
    name_cleanup = [
        (r'^GF_PL[\d]+[a,b,c]{0,1}_', ''),
        ('[A,B,C]_', ''),
        ('_[d]+$', ''),
        ('_S.*', ''),
        ('_[0-9]{3,4}$', ''),
        ('hpi', ''),
        ('_T', '_'),
    ]
    strain_time = count_summary['Sample Name']
    for pattern, replacement in name_cleanup:
        strain_time = strain_time.str.replace(pattern, replacement, regex = True)
    count_summary['Strain and Time'] = strain_time

    # idxmax() returns index labels, so look the winners up with .loc
    best_rows = (count_summary
                 .groupby('Strain and Time', sort = False)['Total Counts']
                 .idxmax())
    best_samplenames = annotation_cols + list(count_summary.loc[best_rows, 'Sample Name'])

    # Select by column label directly; no transpose, so dtypes survive.
    return countdata.loc[:, countdata.columns.isin(best_samplenames)]


In [78]:
def split_times(rawcounts):
    '''
    Split a full count table into one DataFrame per sampling time.

    This study sampled at 4hpi, 30hpi and 44hpi, and those three time
    points are hardcoded here, so this is not suitable for all crosses.
    Sample columns are assigned to a time point by matching their label
    against 'T<time>_' or '_<time>hpi'. Each returned frame starts with
    the first five columns of rawcounts, followed by the matching
    sample columns.

    Returns a list: [4hpi frame, 30hpi frame, 44hpi frame].
    '''
    time_patterns = ('T4_|_4hpi', 'T30_|_30hpi', 'T44_|_44hpi')
    leading_cols = rawcounts.iloc[:,0:5]

    per_time = []
    for pattern in time_patterns:
        matching = rawcounts.filter(regex=pattern)
        # Index-on-index merge puts the leading columns in front
        per_time.append(leading_cols.merge(matching,
                                           left_index = True,
                                           right_index = True))
    return per_time
    




In [5]:




def build_covariates(metadata):
    '''
    One-hot encode the plate each sample was run on.

    Our samples were run across different plates; this is the only
    real place technical variation can be accounted for. Sampling
    batches are confounded with stage, so CRC and PEER have to pull
    anything else out. PEER needs one-hot encoded plate data, which is
    built here.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Sample metadata containing a 'PlateID' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the 'PlateID' column removed and one 0/1
        indicator column per distinct plate appended on the right.
        The input frame is not modified.
    '''
    # Pin the dtype: newer pandas defaults to bool, which would be
    # written out as True/False instead of the 0/1 matrix needed here.
    encoding = pd.get_dummies(metadata['PlateID'], dtype = np.uint8)

    # encoding shares metadata's index, so place the columns side by
    # side instead of joining the frame to itself on the index.
    return pd.concat([metadata.drop(columns = 'PlateID'), encoding], axis = 1)
    



In [69]:




def fix_vcf(vcf_file, progenydata, vcf_out):
    '''
    Copy a vcf file, swapping the TX versions of the strain names for
    the correct versions.

    Parameters
    ----------
    vcf_file : str
        Path of the vcf file to read.
    progenydata : pandas.DataFrame
        Name table: the second column holds the name as written in the
        vcf, the first column holds the name it should be replaced by.
        Both are expected to be strings. The frame is not modified.
    vcf_out : str
        Path of the renamed vcf file to write (overwritten if present).

    Notes
    -----
    Names are replaced literally, on every line, one name after the
    other in table order.
    NOTE(review): a name that is a prefix/substring of another name
    will also be replaced inside the longer one - confirm the name
    table has no such pairs.
    '''
    # vcf name -> corrected name (a later duplicate key wins)
    progenydict = dict(zip(progenydata.iloc[:,1], progenydata.iloc[:,0]))

    # Context managers close both files even if a line fails mid-way
    with open(vcf_file, 'r') as vcf, open(vcf_out, 'w') as vcfout:
        for line in vcf:
            for old_name, new_name in progenydict.items():
                line = line.replace(old_name, new_name)
            vcfout.write(line)

def strip_names(countdata):
    '''
    Strip the long sample names in the column labels down to their
    strain only. NOTE: You really don't want to use this until after
    you've built the metadata file, as you need the full sample names
    to get the correct Plate Number for batch correction.

    Parameters
    ----------
    countdata : pandas.DataFrame
        Frame whose column labels are string sample names.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the columns are relabeled
        IN PLACE. Labels without an underscore (e.g. 'Geneid') are left
        as they are. The resulting columns Index is named 0.
    '''
    # Leading 'GF<digits>_', plate tag 'PL<digits/lowercase>_' and the
    # single-letter 'A_'/'B_'/'C_' marker are removed, in that order.
    prefix_patterns = [r'^GF[\d]*_', r'PL[\da-z]*_', '[A-C]{1}_']

    labels = countdata.columns
    for pattern in prefix_patterns:
        labels = labels.str.replace(pattern, '', regex = True)

    # Keep only the text before the first remaining underscore. The
    # Index name 0 is kept on purpose: it is what callers have always
    # received, and it ends up in files written from the transposed frame.
    countdata.columns = labels.str.partition('_').get_level_values(0).rename(0)

    return countdata

In [100]:
# Load the raw count table, unify the sample names, then keep the
# deepest technical replicate for every strain/time point.
counts = (
    pd.read_csv('GENE2.count', sep = '\t')
    .pipe(fix_names)
    .pipe(pick_best_reps)
)
counts

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,AC033_T30_S100_L00,AC033_T4_S99_L00,AC100_T44_S107_L00,A_NHP4026_30hpi_S116_L00,...,GF_PL05b_AC082_30hpi_1217,GF_PL05b_AC082_44hpi_1218,GF_PL05b_AC130_30hpi_1217,GF_PL05b_AC130_44hpi_1218,GF_PL05b_AC130_4hpi_1216,GF_PL05b_C_NF54gfp_30hpi_1217,GF_PL05b_C_NF54gfp_4hpi_1216,AC075_T4_S110_L00,AC125_T30_S114_L00,AC125_T44_S115_L00
0,malmito_rna_16:rRNA,M76611,3,33,+,31,0,0,1,0,...,0,0,0,4,0,0,0,0,0,0
1,malmito_rna_20:rRNA,M76611,34,71,+,38,0,0,0,0,...,3,0,0,5,1,0,0,0,0,0
2,malmito_rna_9:rRNA,M76611,72,125,+,54,0,0,0,0,...,3,2,0,1,0,0,0,0,0,0
3,malmito_rna_17:rRNA,M76611,126,165,+,40,0,0,0,0,...,0,2,0,0,0,0,1,0,0,0
4,malmito_rna_LSUC:rRNA,M76611,204,226,-,23,0,0,0,0,...,3,1,0,3,0,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5767,PF3D7_1479700,Pf3D7_14_v3;Pf3D7_14_v3,3276230;3276509,3276283;3277501,+;+,1047,0,0,0,3,...,3,0,5,1,0,3,1,0,3,0
5768,PF3D7_1479800,Pf3D7_14_v3;Pf3D7_14_v3,3279500;3279697,3279568;3280662,+;+,1035,2,0,2,0,...,0,0,0,0,0,2,1,0,0,0
5769,PF3D7_1479900,Pf3D7_14_v3;Pf3D7_14_v3,3282729;3282907,3282797;3283752,+;+,915,0,0,0,1,...,0,0,0,0,0,7,0,0,2,0
5770,PF3D7_1480000,Pf3D7_14_v3;Pf3D7_14_v3,3285900;3286035,3285953;3287003,+;+,1023,0,1,0,2,...,1,1,1,0,1,10,1,0,4,0


In [101]:
# Keep the six leading annotation columns (everything before the per-sample counts).
probeinfo = counts.iloc[:,0:6]
# Writing the file is disabled here: `args` is only created by the
# command-line block at the bottom of this file.
#probeinfo.to_csv(f'{args.data_path}{args.counts_file.split(".")[0]}_probeinfo.csv')
probeinfo

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length
0,malmito_rna_16:rRNA,M76611,3,33,+,31
1,malmito_rna_20:rRNA,M76611,34,71,+,38
2,malmito_rna_9:rRNA,M76611,72,125,+,54
3,malmito_rna_17:rRNA,M76611,126,165,+,40
4,malmito_rna_LSUC:rRNA,M76611,204,226,-,23
...,...,...,...,...,...,...
5767,PF3D7_1479700,Pf3D7_14_v3;Pf3D7_14_v3,3276230;3276509,3276283;3277501,+;+,1047
5768,PF3D7_1479800,Pf3D7_14_v3;Pf3D7_14_v3,3279500;3279697,3279568;3280662,+;+,1035
5769,PF3D7_1479900,Pf3D7_14_v3;Pf3D7_14_v3,3282729;3282907,3282797;3283752,+;+,915
5770,PF3D7_1480000,Pf3D7_14_v3;Pf3D7_14_v3,3285900;3286035,3285953;3287003,+;+,1023


In [102]:
# One frame per sampling time, in the order 4hpi, 30hpi, 44hpi.
timepoints = split_times(counts)


In [103]:
# The name file has no header row; its first column (the sample ID)
# becomes the index and the remaining six columns are labeled by hand.
metadata = pd.read_csv('GENE2_NAME.txt', sep = '\t', header = None, index_col = 0)
metadata.columns = ['PlateID', 'Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same']

# Only the one-hot plate columns are kept as covariates.
non_plate_cols = ['Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same']
covariates = build_covariates(metadata).drop(columns = non_plate_cols)

# fix_names works on column labels, so transpose to put the sample
# IDs there, rename, and transpose back.
covariates = fix_names(covariates.T).T


Unnamed: 0_level_0,PL01,PL02,PL03,PL04,PL05,PL05a,PL05b
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AC030_T30_S97_L00,0,0,0,0,0,0,0
AC030_T44_S98_L00,0,0,0,0,0,0,0
AC030_T4_S96_L00,0,0,0,0,0,0,0
NHP4026_T30_S103_L00,0,0,0,0,0,0,0
NHP4026_T44_S104_L00,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
AC075_T44_S112_L00,0,0,0,0,0,0,0
AC075_T4_S110_L00,0,0,0,0,0,0,0
AC125_T30_S114_L00,0,0,0,0,0,0,0
AC125_T44_S115_L00,0,0,0,0,0,0,0


In [104]:
 # Rebuild the per-time-point frames. NOTE(review): this repeats the
 # earlier split_times call - presumably because the export loop below
 # edits the frames held in `timepoints` in place; confirm.
 timepoints = split_times(counts)

In [105]:
# Write one covariate table and one count table per time point.
# File name prefixes are paired with the frames from split_times,
# which come back in the same 4hpi / 30hpi / 44hpi order.
timepoint_times = ['T4_', 'T30_', 'T44_']
for prefix, timepoint in zip(timepoint_times, timepoints):
    # Covariate rows for the samples present at this time point,
    # relabeled from full sample name to strain.
    metasub = covariates[covariates.index.isin(timepoint.columns)].T
    metasub = strip_names(metasub).T
    metasub.to_csv(f'{prefix}batchcov.csv')

    # drop() hands back a new frame, so the frames held in `timepoints`
    # are left untouched and this cell can be re-run without first
    # re-running split_times.
    timepoint = strip_names(timepoint.drop(columns = ['Start', 'End', 'Strand', 'Chr']))
    timepoint.to_csv(f'{prefix}counts.txt',
        sep = '\t',
        index = False)

    

In [106]:
# Show the covariate table left bound to `metasub` by the last pass of
# the export loop (the 'T44_' time point).
metasub


Unnamed: 0_level_0,PL01,PL02,PL03,PL04,PL05,PL05a,PL05b
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AC100,0,0,0,0,0,0,0
NHP4026,0,0,0,0,0,0,0
AC088,1,0,0,0,0,0,0
AC004,0,1,0,0,0,0,0
AC006,0,1,0,0,0,0,0
AC007,0,1,0,0,0,0,0
AC008,0,1,0,0,0,0,0
AC025,0,1,0,0,0,0,0
AC027,0,1,0,0,0,0,0
AC028,0,1,0,0,0,0,0


In [99]:
# Column labels of the frame currently bound to `timepoint`.
# NOTE(review): this cell's execution count is lower than the cells
# above it and its stored output still lists un-fixed names such as
# '2H9' and '4026', so the output looks stale - re-run to confirm.
timepoint.columns

Index(['Geneid', '2H9', '4026', '6E5', 'AC100N1', 'NHP4026', 'NF54gfp',
       'AC004', 'AC004', 'AC006',
       ...
       'AC030', 'AC032', 'AC038', 'AC074', 'AC075', 'AC082', 'AC130',
       'NF54gfp', 'ND5A5', 'ND6G8'],
      dtype='object', name=0, length=150)

In [None]:


########################################################
#
# Main block- parses all files in one pass, if you're
# feeling lazy
#
########################################################




if __name__=='__main__':
    parser = argparse.ArgumentParser(description='Curates data from the NF54GFP x NHP4026 Transcriptional Experiment for eQTL Analysis')
    parser.add_argument('counts_file', help = 'the *.count file provided to us after mapping')
    parser.add_argument('metadata', help = 'the *_name.txt file profixed to us after mapping')
    parser.add_argument('vcf_file', help = 'the *.vcf file from this genetic cross')
    parser.add_argument('vcf_out', help = 'file name for renamed vcf output')
    parser.add_argument('data_path', help = 'path for specific xls file for this cross only')
    args = parser.parse_args()

    # NOTE: data_path is prepended to every file name as-is, so it
    # must end with a path separator.

    # Read counts file, fix names, pick best reps, write out
    # probe metadata, and split out time points

    counts = pd.read_csv(f'{args.data_path}{args.counts_file}', sep = '\t')
    counts = fix_names(counts)
    counts = pick_best_reps(counts)

    probeinfo = counts.iloc[:,0:6]
    probeinfo.to_csv(f'{args.data_path}{args.counts_file.split(".")[0]}_probeinfo.csv')

    timepoints = split_times(counts)

    # Read in and correct metadata. The file has no header row (the
    # column names are assigned by hand), so header = None keeps the
    # first sample from being swallowed as a header. The sample ID is
    # the first column and becomes the index.

    metadata = pd.read_csv(f'{args.data_path}{args.metadata}',
                           sep = '\t',
                           header = None,
                           index_col = 0)
    metadata.columns = ['PlateID', 'Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same']
    covariates = build_covariates(metadata)
    covariates = covariates.drop(columns = ['Strain', 'Sampling Time', 'Sample Number', 'No Clue', 'Same'])

    # The count columns were renamed by fix_names above; give the
    # sample IDs the same treatment so the two can be matched up.
    # fix_names works on column labels, hence the transposes.
    covariates = fix_names(covariates.T).T

    # build correct, separate metadata files for time points,
    # and write out metadata and expression

    timepoint_times = ['T4_', 'T30_', 'T44_']
    for prefix, timepoint in zip(timepoint_times, timepoints):
        # One-hot plate rows for the samples present at this time point
        metasub = covariates[covariates.index.isin(timepoint.columns)].T
        metasub = strip_names(metasub).T
        metasub.to_csv(f'{args.data_path}{prefix}batchcov.csv')

        # drop() returns a new frame; the frames in `timepoints` stay intact
        timepoint = strip_names(timepoint.drop(columns = ['Start', 'End', 'Strand', 'Chr']))
        timepoint.to_csv(f'{args.data_path}{prefix}counts.txt',
                         sep = '\t',
                         index = False)

    # Fix names in vcf file
    # NOTE(review): the row positions 55-57 below are tied to the
    # layout of this one spreadsheet - confirm if the file changes.

    progenydata = pd.read_excel(f'{args.data_path}NF54gfpluc NHP4026 progeny in map 6-25-18 .xlsx',
                            usecols = ['freezerPro ID', 'Map 7/5/18'])
    progenydata = progenydata.drop(progenydata.index[57])
    progenydata.iloc[55,0] = 'NHP4026'
    progenydata.iloc[56,0] = 'NF54gfp'
    fix_vcf(f'{args.data_path}{args.vcf_file}', progenydata, f'{args.data_path}{args.vcf_out}')
    




