* Introduction

We generated some transfection results, which are currently in a hand-written Excel file (uploaded to Google Docs).
I need to reformat them to match the schema agreed upon by PSU and the DCC.

In [1]:
import pandas
import numpy
import gcat
import os
import collections
import pyBigWig

In [2]:
# autoSql description of the bed9+6 layout; handed to bedToBigBed via -as=.
# Every field comment must be closed with a plain ASCII double quote.
autosql = '''table transfection
"class for transfection data"
(
string chrom; "GRCm38/mm10"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Name given to a region"
uint score; "Rounded average of determinations"
char[1] strand; ". for unknown"
uint thickStart; "Start of where display should be thick"
uint thickEnd; "End of where display should be thick"
uint reserved; "Used as itemRgb; color by activityBin"
string predictionCat; "prediction category"
uint count; "count of determinations"
string determ; "comma separated list of determinations as fold change relative to activity from parental vector"
float aveDeterm; "average of determinations"
float stdDeterm; "standard deviation of determinations"
string activityBin; "classification of activity; Enhancer, NotEnhancer, Threshold"
)
'''
# Write through a context manager so the schema is flushed and the handle
# closed before an external tool reads the file.
with open('/tmp/transfection.as', 'wt') as outstream:
    outstream.write(autosql)

837

In [3]:
# Fetch the hand-curated spreadsheet through gcat, asking for a pandas object.
# NOTE(review): presumably a DataFrame with one row per tested element — it is
# indexed by column name and merged further down.
sheet = gcat.get_file('Wold_transfection_data01152017', fmt='pandas')

In [4]:
# Display the sheet to check which columns were loaded.
sheet

Unnamed: 0,#ID,Type,CHR(mm10),Start(mm10),Stop(mm10),STDEV,C2C12 Myoblast Bio Rep1,STDEV.1,C2C12 Myoblast Bio Rep2,STDEV.2,...,STDEV.3,C2C12 Myocyte Bio Rep2,STDEV.4,10T1/2 Bio Rep1,STDEV.5,10T1/2 Bio Rep2,STDEV.6,10T1/2 Mock Differentiated Bio Rep1,STDEV.7,10T1/2 Mock Differentiated Bio Rep2
0,BJW1000A,TEST ELEMENT*NESTED,chr1,134244219,134245778,0.359053,2.497751,,,2.132855,...,,,0.872601,1.755624,,,0.285142,1.258984,,
1,BJW1000B,TEST ELEMENT,chr1,134244219,134245778,0.476389,1.653045,0.198390,2.108818,1.773110,...,3.993534,11.324074,0.711102,2.082971,0.679628,2.688467,4.512507,8.622666,0.526813,1.275765
2,BJW1001,TEST ELEMENT*NESTED,chr1,134244847,134245778,0.045775,0.325529,0.058186,0.211114,4.876986,...,2.544488,14.994512,0.067867,0.265127,0.0605345,0.397979,0.084999,0.595979,0.091783,0.666264
3,BJW1002,TEST ELEMENT*TSS,chr1,134192922,134193474,0.097593,0.795625,0.074187,0.946431,0.501093,...,0.283474,1.075926,0.029145,0.603520,0.185391,1.031936,0.167062,1.264036,0.108998,1.449387
4,BJW1003,TEST ELEMENT*NESTED,chr1,134195091,134196873,0.222086,2.738967,,,3.394812,...,,,0.791119,2.942621,,,0.437904,2.595539,,
5,BJW1004,TEST ELEMENT,chr1,134195091,134196873,0.175631,0.894279,0.103021,0.490867,3.631281,...,1.574816,21.121844,0.227619,0.538779,0.0252848,0.842880,0.116144,0.798558,0.127061,1.128835
6,BJW1005,TEST ELEMENT*NESTED,chr1,134195091,134196479,0.117763,0.403462,0.037817,0.412533,2.220133,...,2.090177,14.550494,0.115628,0.333443,0.0743763,0.956223,0.142184,0.661988,0.276872,1.170382
7,BJW1006,TEST ELEMENT*NESTED,chr1,134195091,134195506,0.105161,1.710481,0.094450,0.630778,0.840585,...,0.553585,2.617453,0.286922,1.578218,0.616976,3.236429,0.099618,1.606222,0.186260,1.825527
8,BJW1007,TEST ELEMENT*NESTED,chr1,134195091,134195589,0.377482,1.691410,,,0.481375,...,,,0.389050,2.354799,,,0.615788,1.914250,,
9,BJW1008,TEST ELEMENT*NESTED,chr1,134195643,134196479,0.256856,1.114423,0.307082,1.681287,2.291935,...,0.847015,7.754665,0.142365,1.094444,0.850404,2.566563,0.081946,1.488998,0.363837,1.575713


In [5]:
# Load the merged per-replicate measurements; the first row holds the column names.
transfection = pandas.read_csv('transfection-merged.csv', header=0)

In [6]:
# (rows, columns) of the merged table.
transfection.shape

(370, 49)

In [7]:
# IDs present in the spreadsheet but missing from the merged CSV.
set(sheet['#ID']) - set(transfection['#ID'])

{'WOLD_LR_104'}

In [8]:
# IDs present in the merged CSV but missing from the spreadsheet.
set(transfection['#ID']) - set(sheet['#ID'])

{'OFF_09', 'WOLD2014JAN_070'}

In [9]:
# Attach the coordinates and element type from the spreadsheet to the
# per-replicate measurements; only IDs found in both tables survive.
coordinate_columns = ['#ID', 'Type', 'CHR(mm10)', 'Start(mm10)', 'Stop(mm10)']
replicates = sheet[coordinate_columns].merge(transfection, on='#ID', how='inner')

* Make sure ID is actually unique

In [10]:
# Any ID listed here occurs more than once in the sheet; an empty list means
# the IDs are unique.
[
    (element_id, occurrences)
    for element_id, occurrences in collections.Counter(sheet['#ID']).items()
    if occurrences > 1
]

[]

* Add a strand column

In [11]:
def list_determs(row):
    """Return the non-null entries of ``row`` as strings, in their original order."""
    present = []
    for value in row:
        if pandas.isnull(value):
            continue
        present.append(str(value))
    return present

def format_determ(row):
    """Join the non-null entries of ``row`` into one comma separated string."""
    separator = ','
    determinations = list_determs(row)
    return separator.join(determinations)

def nan_round_to_int(x):
    """Round ``x`` to zero decimals, passing null values through untouched.

    Despite the name the result is whatever ``numpy.round`` returns (a float
    for float input); callers cast to ``int`` after dropping nulls.
    """
    return x if pandas.isnull(x) else numpy.round(x, 0)

def makebed(sheet, rep_prefix, threshold):
    """Build the 15 column (bed9+6) table for one biological replicate.

    Parameters
    ----------
    sheet : DataFrame
        Must provide '#ID', 'Type', 'CHR(mm10)', 'Start(mm10)', 'Stop(mm10)'
        and, for the replicate, '<rep_prefix>-ASSAY', '<rep_prefix>-STDEV' and
        '<rep_prefix>-Tech_Rep_1' .. '<rep_prefix>-Tech_Rep_4'.
    rep_prefix : str
        Column name prefix selecting the replicate.
    threshold : float
        An element is labelled 'Enhancer' (red) when its '-ASSAY' value is
        >= threshold, otherwise 'NotEnhancer' (blue).

    Returns
    -------
    DataFrame
        A new frame in bed column order. Rows with a null in any column are
        dropped, 'score' (ten times the '-ASSAY' value, rounded) is cast to
        int, and rows are sorted by chrom then chromStart.
    """
    field_order = [
        'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
        'thickStart', 'thickEnd', 'reserved', 'predictionCat', 'count',
        'determ', 'aveDeterm', 'stdDeterm', 'activityBin',
    ]
    mean_col = rep_prefix + '-ASSAY'
    tech_cols = [rep_prefix + '-Tech_Rep_' + number for number in ['1', '2', '3', '4']]

    label_for = {True: 'Enhancer', False: 'NotEnhancer'}
    rgb_for = {True: '255,0,0', False: '0,0,255'}
    # One threshold test per row, shared by the colour and the label columns.
    is_active = [float(mean) >= threshold for mean in sheet[mean_col]]

    table = pandas.DataFrame(columns=field_order)
    table['chrom'] = sheet['CHR(mm10)']
    table['chromStart'] = sheet['Start(mm10)']
    table['chromEnd'] = sheet['Stop(mm10)']
    table['name'] = sheet['#ID']
    table['score'] = [nan_round_to_int(float(mean * 10)) for mean in sheet[mean_col]]
    table['strand'] = '.'
    # The thick region spans the whole element.
    table['thickStart'] = sheet['Start(mm10)']
    table['thickEnd'] = sheet['Stop(mm10)']
    table['reserved'] = [rgb_for[flag] for flag in is_active]
    table['predictionCat'] = [kind.replace(' ', '_') for kind in sheet['Type']]

    tech_reps = sheet[tech_cols]
    table['count'] = tech_reps.apply(lambda reps: len(list_determs(reps)), axis=1)
    table['determ'] = tech_reps.apply(format_determ, axis=1)
    table['aveDeterm'] = sheet[mean_col]
    table['stdDeterm'] = sheet[rep_prefix + '-STDEV']
    table['activityBin'] = [label_for[flag] for flag in is_active]

    # Elements without a complete measurement for this replicate are left out.
    complete = table.dropna()
    integer_scores = [int(score) for score in complete['score']]
    return complete.assign(score=integer_scores).sort_values(['chrom', 'chromStart'])
    

In [12]:
# Output bed file name -> (column prefix of the replicate, enhancer threshold).
bed_file_info = {
    'c2c12_myoblast_biorep1.bed': ('C2C12_Myoblast_Rep1', 2.75),
    'c2c12_myoblast_biorep2.bed': ('C2C12_Myoblast_Rep2', 2.75),
    'c2c12_myocyte_biorep1.bed': ('C2C12_Myocyte_Rep1', 3.0),
    'c2c12_myocyte_biorep2.bed': ('C2C12_Myocyte_Rep2', 3.0),
    '10T_half_biorep1.bed': ('10T1/2_Rep1', 2.75),
    '10T_half_biorep2.bed': ('10T1/2_Rep2', 2.75),
    '10T_half_mock_diff_biorep1.bed': ('10T1/2_MockDiff_Rep1', 2.75),
    '10T_half_mock_diff_biorep2.bed': ('10T1/2_MockDiff_Rep2', 2.75),
}

# Build one frame per replicate, keep it for later cells and write it as a
# headerless tab separated bed file under /tmp.
bedframes = {}
for filename, (rep_prefix, threshold) in bed_file_info.items():
    bed = makebed(replicates, rep_prefix, threshold)
    bedframes[filename] = bed
    destination = os.path.join('/tmp', filename)
    bed.to_csv(destination, sep='\t', header=False, index=False)

Convert the BED files to bigBed (run in the directory that holds the .bed files, transfection.as and mm10.sizes):

```for a in *.bed; do echo $a ;  /woldlab/castor/proj/programs/x86_64/bedToBigBed -as=transfection.as -type=bed9+6 $a mm10.sizes $(basename $a .bed).bigBed ; done```


In [13]:
import collections

# Tally how often each integer score occurs across all replicate frames.
scores = collections.Counter()
for filename, frame in bedframes.items():
    scores.update(frame.score)
#scores

In [14]:
# NOTE(review): only referenced by the commented-out scoreMax settings further
# down; where the value 11 comes from is not shown here — confirm before reuse.
maxscore = 11

# Make wiggles

In [15]:
import pyBigWig

In [16]:
def remove_collisions(bed):
    """Drop regions that start inside the region listed just before them.

    ``bed`` is scanned in row order. A row collides when it is on the same
    chromosome as the previous row and its chromStart lies within
    [previous chromStart, previous chromEnd]. Every row carrying a colliding
    name is removed and the scan is repeated until a pass finds nothing new.
    Each collision is printed as a '|' separated line.

    Returns ``(filtered, collisions, collidedwith)``: the surviving rows, the
    set of removed names, and the set of names they collided with.
    """
    collisions = set()
    collidedwith = set()

    while True:
        known_before_pass = len(collisions)
        filtered = bed[~bed['name'].isin(collisions)]
        previous = None
        for _, row in filtered[['chrom', 'chromStart', 'chromEnd', 'name']].iterrows():
            on_same_chrom = previous is not None and previous.chrom == row.chrom
            if on_same_chrom and previous.chromStart <= row.chromStart <= previous.chromEnd:
                print('|', previous.chrom, '|', previous.chromStart, '|', previous.chromEnd, '|',
                      previous['name'], '|', row.chromStart, '|', row.chromEnd, '|', row['name'], '|', )
                collisions.add(row['name'])
                collidedwith.add(previous['name'])
            # The comparison is always against the immediately preceding row,
            # whether or not that row was itself flagged.
            previous = row
        if len(collisions) == known_before_pass:
            break

    return filtered, collisions, collidedwith

def fix_overlaps(bed):
    """Return a copy of ``bed`` with hand-picked overlapping regions trimmed.

    For each listed name, chromStart and thickStart are moved to the given
    position (one past the end of the neighbouring region in the table
    below). The input frame is not modified.

    A name is adjusted only when it matches exactly one row; the old and new
    start are printed. When a name matches no row or several rows, nothing is
    changed and the matching index plus the name are printed instead.
    """
    # Overlaps this table was written against, in the format printed by
    # remove_collisions:
    #| chr1 | 134275871 | 134276983 | UP_15 | 134276734 | 134277378 | GS_Myog_41 |
    #| chr1 | 134284098 | 134285495 | GS_Myog_39 | 134285494 | 134286451 | UP_16 |
    #| chr3 | 88138276 | 88138625 | NML_8 | 88138602 | 88139038 | NML_10 |
    #| chr6 | 108668244 | 108668981 | WOLD_LR_018 | 108668244 | 108668804 | WOLD_LR_027 |
    #| chr7 | 46313331 | 46314018 | NEG_22 | 46313389 | 46314120 | WOLD_LR_045 |
    #| chr7 | 46337340 | 46338183 | NEG_20 | 46338108 | 46338476 | GS_Myod_07 |
    #| chr8 | 123882187 | 123882932 | WOLD_LR_008 | 123882909 | 123883847 | UP_13 |
    #| chr8 | 126583914 | 126584586 | WOLD_LR_002 | 126583914 | 126584586 | WOLD_LR_002 |
    bed = bed.copy()
    for name, new_start in [
        ('GS_Myog_41', 134276984),
        ('UP_16', 134285496),
        ('NML_10', 88138626),
    #    ('WOLD_LR_027', 108668982),
    #    ('WOLD_LR_045', 46314019),
        ('GS_Myod_07', 46338184),
    #    ('UP_13', 123882933),
    #    ('WOLD_LR_002', 126584587)
    ]:
        bed_index = bed[bed['name'] == name].index
        # The original tested len(bed_index == 1), which is the length of a
        # boolean array and therefore true for any non-empty match.
        if len(bed_index) == 1:
            i = bed_index[0]
            old_start = bed.loc[i, 'chromStart']
            assert bed.loc[i, 'name'] == name
            bed.loc[i, 'chromStart'] = new_start
            bed.loc[i, 'thickStart'] = new_start
            print(old_start, new_start)
        else:
            print(bed_index, name)
    return bed
    


In [17]:
def read_sizes(filename):
    """Read a whitespace separated chromosome sizes file.

    Each non-blank line supplies a name in its first field and an integer
    length in its second; further fields are ignored. Blank lines are
    skipped. Returns a list of ``(name, length)`` tuples in file order.
    """
    sizes = []
    # The context manager closes the handle even if a line fails to parse.
    with open(filename, 'rt') as instream:
        for line in instream:
            records = line.split()
            if not records:
                continue
            sizes.append((records[0], int(records[1])))
    return sizes
                     
# Chromosomes are written in this order; regions on any other chromosome
# (for example chrY or chrM) are not written.
_BIGWIG_CHROM_ORDER = [
    'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
    'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
    'chrX',
]


def _write_bigwig(name, bed, signal):
    """Shared body of make_bigwig and make_error_bigwig.

    Keeps the TEST_ELEMENT / CONTROL_ELEMENT rows of ``bed``, applies
    fix_overlaps and remove_collisions, then writes one bigWig interval per
    surviving row to ``name``. ``signal`` maps the per-chromosome frame to
    the values to store. Returns ``(filtered, collisions)``.
    """
    sizes = read_sizes('mm10.sizes')
    nomarked = bed[bed['predictionCat'].isin(('TEST_ELEMENT', 'CONTROL_ELEMENT'))]
    fixed = fix_overlaps(nomarked)
    filtered, collisions, collidedwith = remove_collisions(fixed)

    bw = pyBigWig.open(name, 'w')
    bw.addHeader(sizes, maxZooms=5)
    for chrom in _BIGWIG_CHROM_ORDER:
        on_chrom = filtered[filtered['chrom'] == chrom]
        if on_chrom.empty:
            # Nothing to add; avoid handing pyBigWig empty lists.
            continue
        bw.addEntries(
            list(on_chrom['chrom']),
            list(on_chrom['chromStart']),
            ends=list(on_chrom['chromEnd']),
            values=list(signal(on_chrom)))
    bw.close()
    return filtered, collisions


def make_bigwig(name, bed):
    """Write the average determination (aveDeterm) of each region to bigWig ``name``.

    Returns ``(filtered, collisions)``: the rows that were written and the
    names dropped by remove_collisions.
    """
    return _write_bigwig(name, bed, lambda frame: frame['aveDeterm'])


def make_error_bigwig(name, bed):
    """Write aveDeterm minus stdDeterm, floored at zero, to bigWig ``name``.

    Returns ``(filtered, collisions)`` exactly like make_bigwig.
    """
    # Series.clip(lower=0) replaces clip_lower(0), which current pandas no
    # longer provides.
    return _write_bigwig(
        name, bed,
        lambda frame: (frame['aveDeterm'] - frame['stdDeterm']).clip(lower=0))

def make_bedgraph(name, bed, l=None):
    """Write the TEST_ELEMENT / CONTROL_ELEMENT rows of ``bed`` as a bedGraph.

    Rows removed by remove_collisions are left out. Each remaining row becomes
    one tab separated line: chrom, chromStart, chromEnd, aveDeterm.

    ``l`` optionally limits the number of lines written; ``None`` (or 0) means
    no limit. Returns None.
    """
    candidates = bed[bed['predictionCat'].isin(('TEST_ELEMENT', 'CONTROL_ELEMENT'))]
    filtered, collisions, collidedwith = remove_collisions(candidates)
    with open(name, 'wt') as outstream:
        # Count written rows explicitly: the frame's index labels are not a
        # row counter (the frame is sorted and filtered), and the original
        # "l >= i" comparison was inverted for a limit.
        for written, (_, row) in enumerate(filtered.iterrows(), start=1):
            cols = [row['chrom'], str(row['chromStart']), str(row['chromEnd']), str(row['aveDeterm'])]
            outstream.write('\t'.join(cols))
            outstream.write('\n')
            if l and written >= l:
                return
        

In [18]:
# Write a signal bigWig and an "error" (mean minus std) bigWig per replicate.
for filename in bedframes:
    base, ext = os.path.splitext(filename)
    # Use this replicate's own frame. The original passed the global `bed`
    # left over from the bed-writing loop, so every file received the data of
    # the last replicate.
    replicate_bed = bedframes[filename]
    bwname = os.path.join('/tmp', base + '.bw')
    f = make_bigwig(bwname, replicate_bed)
    bwerror = os.path.join('/tmp', base + '-error.bw')
    f = make_error_bigwig(bwerror, replicate_bed)


134276734 134276984
134285494 134285496
88138602 88138626
46338108 46338184
| chr6 | 108668244 | 108668981 | WOLD_LR_018 | 108668244 | 108668804 | WOLD_LR_027 |
| chr7 | 46313331 | 46314018 | NEG_22 | 46313389 | 46314120 | WOLD_LR_045 |
| chr8 | 123882187 | 123882932 | WOLD_LR_008 | 123882909 | 123883847 | UP_13 |
| chr8 | 126583914 | 126584586 | WOLD_LR_002 | 126583914 | 126584586 | WOLD_LR_002 |
134276734 134276984
134285494 134285496
88138602 88138626
46338108 46338184
| chr6 | 108668244 | 108668981 | WOLD_LR_018 | 108668244 | 108668804 | WOLD_LR_027 |
| chr7 | 46313331 | 46314018 | NEG_22 | 46313389 | 46314120 | WOLD_LR_045 |
| chr8 | 123882187 | 123882932 | WOLD_LR_008 | 123882909 | 123883847 | UP_13 |
| chr8 | 126583914 | 126584586 | WOLD_LR_002 | 126583914 | 126584586 | WOLD_LR_002 |
134276734 134276984
134285494 134285496
88138602 88138626
46338108 46338184
| chr6 | 108668244 | 108668981 | WOLD_LR_018 | 108668244 | 108668804 | WOLD_LR_027 |
| chr7 | 46313331 | 46314018 | NEG_22

In [19]:
import sys

# Make the local trackhub checkout importable, adding it at most once.
TH_DIR = os.path.expanduser('~/proj/trackhub')
if sys.path.count(TH_DIR) == 0:
    sys.path.append(TH_DIR)

In [20]:
from trackhub import AggregateTrack, CompositeTrack, SuperTrack, Track, ViewTrack, SubGroupDefinition, default_hub

import os
from urllib.parse import urljoin

In [21]:
# Public base URL of the hub files and the assembly directory beneath it.
URLBASE = 'https://woldlab.caltech.edu/~diane/encode3-transfection/'
GENOME = 'mm10'

# Subgroup tags per bed file. The values must match the keys of the
# SubGroupDefinition mappings declared later in this cell.
bed_attributes = {
    'c2c12_myoblast_biorep1.bed': dict(cell_line='c2', state='myoblat'),
    'c2c12_myoblast_biorep2.bed': dict(cell_line='c2', state='myoblat'),
    'c2c12_myocyte_biorep1.bed': dict(cell_line='c2', state='myocyte'),
    'c2c12_myocyte_biorep2.bed': dict(cell_line='c2', state='myocyte'),
    '10T_half_biorep1.bed': dict(cell_line='ten', state='diff'),
    '10T_half_biorep2.bed': dict(cell_line='ten', state='diff'),
    '10T_half_mock_diff_biorep1.bed': dict(cell_line='ten', state='mock'),
    '10T_half_mock_diff_biorep2.bed': dict(cell_line='ten', state='mock'),
}

# Create the hub skeleton. The assembly comes from GENOME so the hub and the
# track URLs built below cannot drift apart.
hub, genomes_file, genome, trackdb = default_hub(
    hub_name='transfection',
    short_label='Transfection',
    long_label='ENCODE3 Transfection Validations',
    genome=GENOME,
    email='diane@caltech.edu',
)

# Super track grouping the PSU transfection data.
psusuper = SuperTrack(
    name="psutrans",
    short_label="PSU trans",
    long_label="PSU Transfection tracks",
    visibility='dense',
)

# Single bigBed track served from a fixed URL (not built from URLBASE).
track1 = Track(
    name="psu",
    url='https://woldlab.caltech.edu/~diane/ENCFF926ITQ.bigBed',
    tracktype='bigBed 9',
    short_label='psutrans',
    long_label='PSU Transfection',
    visibility='dense',
    itemRgb='on',
    exonNumbers="off",
    # Score based shading is disabled; the settings are kept for reference.
    #spectrum='on',
    #scoreMin=0,
    #scoreMax=maxscore,
    )

# Attach the track to its super track and register the super track.
psusuper.add_track(track1)
trackdb.add_tracks(psusuper)

# Super track that holds every Wold lab track created in this cell.
woldsuper = SuperTrack(
    name="woldsuper",
    short_label="Wold-lab trans",
    long_label="Wold Lab Transfection tracks",
    visibility='dense',
)
trackdb.add_tracks(woldsuper)

# Composite for the per-replicate bigBed tracks; it is given subgroups and a
# view further down.
bedcomposite = CompositeTrack(
    name="woldtrans_bed",
    short_label="Transfection beds",
    long_label="Transfection track",
    tracktype="bigBed 9",
    dragAndDrop='subtracks',
    visibility='dense',
)
woldsuper.add_track(bedcomposite)

# Subgroup definitions for the bed composite. The mapping keys are the tags
# used in bed_attributes; the values are the labels shown to the user.
subgroups = [
    SubGroupDefinition(
        name="cell_line",
        label="Cell Line",
        mapping=dict(ten="10T 1/2", c2="C2C12")),

    SubGroupDefinition(
        name="state",
        label="Cell State",
        mapping=dict(diff="Differentiated", 
                     mock="Mock Differentiated",
                     # The tag keeps its historical spelling because
                     # bed_attributes refers to it; only the displayed label
                     # is corrected.
                     myoblat="Myoblast",
                     myocyte="Myocyte",
                    )),
]
bedcomposite.add_subgroups(subgroups)

# View that receives the per-replicate bigBed tracks of the composite.
bed_view = ViewTrack(
    name="bedViewTrack",
    view="Bed",
    visibility="full",
    tracktype="bigBed 9",
    short_label="Transfection",
    long_label="Transfection Beds")
bedcomposite.add_view(bed_view)

# One bigBed track per replicate, tagged with its cell line and state.
for filename, (rep_prefix, threshold) in bed_file_info.items():
    basename, _ = os.path.splitext(filename)
    track = Track(
        name=basename,
        short_label = rep_prefix,
        long_label = rep_prefix,
        tracktype = 'bigBed 9',
        # Build the URL with urljoin rather than os.path.join, which is meant
        # for filesystem paths and uses the platform's separator.
        url = urljoin(URLBASE, GENOME + '/' + basename + '.bigBed'),
        visibility='dense',
        itemRgb='on',
        exonNumbers='off',
        #spectrum='on',
        #scoreMin=0,
        #scoreMax=maxscore,
        subgroups=dict(
            cell_line=bed_attributes[filename]['cell_line'],
            state=bed_attributes[filename]['state']))
    bed_view.add_tracks(track)
    
# View intended for the bigWig signal tracks.
# NOTE(review): in the code shown it is never attached to a composite and its
# only use (wig_view.add_tracks) is commented out below — confirm whether it
# is still needed.
wig_view = ViewTrack(
    name="wigViewTrack",
    view="Wig",
    visibility="squish",
    tracktype="bigWig",
    short_label="signal",
    long_label="Transfection Signal")

# For every replicate, overlay the signal bigWig and the "error" bigWig
# (mean minus std) in a transparentOverlay aggregate track.
for filename in bedframes:
    basename, ext = os.path.splitext(filename)
    # Take the label and threshold of *this* file. The original reused
    # `rep_prefix` left over from the previous loop and `base` left over from
    # the bigWig-writing cell, so every track got the last file's label and URL.
    rep_prefix, threshold = bed_file_info[filename]
    aveDeterm = bedframes[filename]['aveDeterm']
    min_determ = numpy.floor(aveDeterm.min())
    max_determ = numpy.ceil(aveDeterm.max())
    # Upper view limit: the 80th percentile of the averages, rounded up.
    most = numpy.ceil(aveDeterm.quantile([.8])[.8])

    bwname = urljoin(URLBASE, GENOME + '/' + basename + '.bw')
    aggregate = AggregateTrack(
        name=basename+'_agg',
        tracktype='bigWig {} {}'.format(min_determ, max_determ),
        short_label=basename,
        long_label=basename + 'aggregate',
        visibility='full',        
        aggregate='transparentOverlay',
    )
    bw_track = Track(
        name=basename+'_signal',
        short_label = rep_prefix + '_signal',
        long_label = rep_prefix+ ' signal',
        tracktype='bigWig {} {}'.format(min_determ, max_determ),
        viewLimits='{}:{}'.format(min_determ, most),
        autoScale='off',
        # Horizontal marker at the larger of the view limit and the threshold.
        yLineMark = max([most, threshold]),
        yLineOnOff = 'on',
        url = bwname,
        visibility='full',
        itemRgb='on',
        #color='222,235,247',
        color='147,202,225',
        exonNumbers='off',
        #spectrum='on',
        subgroups=dict(
            cell_line=bed_attributes[filename]['cell_line'],
            state=bed_attributes[filename]['state']))
    
    bwerr = urljoin(URLBASE, GENOME + '/' + basename + '-error.bw')
    bwerr_track = Track(
        name=basename+'_error',
        short_label = rep_prefix + '_error',
        long_label = rep_prefix+ ' error',
        tracktype='bigWig {} {}'.format(min_determ, max_determ),
        viewLimits='{}:{}'.format(min_determ, most),
        autoScale='off',
        url = bwerr,
        visibility='full',
        itemRgb='on',
        color='49,130,189',
        exonNumbers='off',
        #spectrum='on',
        subgroups=dict(
            cell_line=bed_attributes[filename]['cell_line'],
            state=bed_attributes[filename]['state']))

    aggregate.add_subtrack(bw_track)
    aggregate.add_subtrack(bwerr_track)
    #wig_view.add_tracks([aggregate])
    woldsuper.add_track(aggregate)

## Add extra track info

In [22]:
# Extra DNAse tracks are described one per row in the 'DNAse' worksheet; each
# row's columns are passed to Track as keyword arguments.
# `sheet_name` is the current spelling of the keyword; `sheetname` was
# deprecated and later removed from pandas.read_excel.
dnase = pandas.read_excel('transfection-additional-tracks.xlsx', sheet_name='DNAse')
dnase_super = SuperTrack(
    name="dnase_super",
    short_label="DNAse",
    long_label="DNAse",
    visibility='pack',
)
for i, row in dnase.iterrows():
    dnase_track = Track(**row)
    dnase_super.add_track(dnase_track)
trackdb.add_tracks(dnase_super)

In [23]:
# Super track grouping the H3K27ac region and signal composites added below.
h3k27_super = SuperTrack(
    name="H3K27_super",
    short_label="H3K27ac",
    long_label="H3K27ac",
    visibility='pack',
)
trackdb.add_tracks(h3k27_super)

# Composite of H3K27ac region (bigBed) tracks.
h3k27_bed = CompositeTrack(
    name="h3k27_bed",
    short_label="H3K27ac regions",
    long_label="H3K27ac regions",
    tracktype="bigBed 9",
    dragAndDrop='subtracks',
    visibility='dense',
)
h3k27_super.add_track(h3k27_bed)

# One track per worksheet row; the row's columns become Track keyword
# arguments. `sheet_name` replaces the removed `sheetname` keyword.
h3k27_regions = pandas.read_excel('transfection-additional-tracks.xlsx', sheet_name='H3K27ac regions')
for i, row in h3k27_regions.iterrows():
    h3k27_track = Track(**row, exonNumbers='off')
    h3k27_bed.add_subtrack(h3k27_track)

# Composite of H3K27ac signal (bigWig) tracks.
h3k27_signal = CompositeTrack(
    name="h3k27_wig",
    short_label="H3K27ac signal",
    long_label="H3K27ac signal",
    tracktype="bigWig",
    dragAndDrop='subtracks',
    visibility='dense',
)
h3k27_super.add_track(h3k27_signal)

# Note: `h3k27_regions` is rebound here to the rows of the *signal* worksheet.
# `sheet_name` replaces the removed `sheetname` keyword of pandas.read_excel.
h3k27_regions = pandas.read_excel('transfection-additional-tracks.xlsx', sheet_name='H3K27ac signal')
for i, row in h3k27_regions.iterrows():
    h3k27_track = Track(**row, exonNumbers='off')
    h3k27_signal.add_subtrack(h3k27_track)


In [24]:
# Serialise the assembled trackDb (its str() form) for the hub.
with open('/tmp/trackDb.txt', 'wt') as outstream:
    print(trackdb, file=outstream, end='')