# Introduction

Barbara would like to be able to see the expression level of the overlapping regions, Gigio suggested some of the bar chart plots supported by the browser might work nicely. https://genome.ucsc.edu/goldenpath/help/barChart.html

I modified <a href="http://localhost:8888/notebooks/format%20ENCODE3%20Woldlab%20transfection%20data.ipynb">format ENCODE3 Woldlab transfection data.ipynb</a> to output the merged set of locations from the google doc and the transfection-merged.csv file into transfection-annotated.csv

In [1]:
import pandas
import numpy
import collections
import itertools
import os
import pyBigWig
from urllib.parse import urljoin

# Find collisions

In [2]:
transfection = pandas.read_csv('transfection-annotated.csv')

Is ID unique? (yes)

In [3]:
len(set(transfection['#ID'])), transfection['#ID'].shape

(367, (367,))

In [4]:
transfection.columns

Index(['#ID', 'Type', 'CHR(mm10)', 'Start(mm10)', 'Stop(mm10)',
       'C2C12_Myoblast_Rep1-Tech_Rep_1', 'C2C12_Myoblast_Rep1-Tech_Rep_2',
       'C2C12_Myoblast_Rep1-Tech_Rep_3', 'C2C12_Myoblast_Rep1-Tech_Rep_4',
       '10T1/2_Rep1-Tech_Rep_1', '10T1/2_Rep1-Tech_Rep_2',
       '10T1/2_Rep1-Tech_Rep_3', '10T1/2_Rep1-Tech_Rep_4',
       'C2C12_Myocyte_Rep1-Tech_Rep_1', 'C2C12_Myocyte_Rep1-Tech_Rep_2',
       'C2C12_Myocyte_Rep1-Tech_Rep_3', 'C2C12_Myocyte_Rep1-Tech_Rep_4',
       '10T1/2_MockDiff_Rep1-Tech_Rep_1', '10T1/2_MockDiff_Rep1-Tech_Rep_2',
       '10T1/2_MockDiff_Rep1-Tech_Rep_3', '10T1/2_MockDiff_Rep1-Tech_Rep_4',
       'C2C12_Myoblast_Rep2-Tech_Rep_1', 'C2C12_Myoblast_Rep2-Tech_Rep_2',
       'C2C12_Myoblast_Rep2-Tech_Rep_3', 'C2C12_Myoblast_Rep2-Tech_Rep_4',
       '10T1/2_Rep2-Tech_Rep_1', '10T1/2_Rep2-Tech_Rep_2',
       '10T1/2_Rep2-Tech_Rep_3', '10T1/2_Rep2-Tech_Rep_4',
       'C2C12_Myocyte_Rep2-Tech_Rep_1', 'C2C12_Myocyte_Rep2-Tech_Rep_2',
       'C2C12_Myocyte_Rep2-

In [5]:
Segment = collections.namedtuple('Segment', ['start', 'stop'])

In [6]:
def build_basepair_index(transfection):
    locations = {}
    for i, (name, chrom, start, stop) in transfection[['#ID', 'CHR(mm10)', 'Start(mm10)', 'Stop(mm10)']].iterrows():
        region = locations.setdefault(chrom, {})
        for base in range(start, stop + 1):
            region.setdefault(base, []).append(name)
    return locations

In [7]:
def merge_locations(locations):
    start = None
    stop = None
    prev = None
    names = set()
    regions = {}
    for chrom in locations:
        for base in sorted(locations[chrom]):
            if start is None:
                start = base
                names.update(locations[chrom][base])
            elif prev + 1 != base:
                stop = prev
                #print(start, stop, names)
                regions.setdefault(chrom, {})[Segment(start, stop)] = names
                start = None
                stop = None
                names = set()
            prev = base
    return regions



In [8]:
regions = merge_locations(build_basepair_index(transfection))

We need to make bed files that look like this.
<pre>table bigBarChart
"bigBarChart bar graph display"  
    ( 
    string chrom;               "Reference sequence chromosome or scaffold"
    uint chromStart;            "Start position in chromosome"
    uint chromEnd;              "End position in chromosome"
    string name;                "Name or ID of item"
    uint score;                 "Score (0-1000)"
    char[1] strand;             "'+','-' or '.'. Indicates whether the query aligns to the + or - strand on the reference"
    string name2;               "Alternate name of item"
    uint expCount;              "Number of bar graphs in display, must be <= 100"
    float[expCount] expScores;  "Comma separated list of category values."
    bigint _dataOffset;         "Offset of sample data in data matrix file, for boxplot on details page, optional only for barChart format"
    int _dataLen;               "Length of sample data row in data matrix file, optional only for barChart format"
    )</pre>

In [9]:
def get_rep_names():
    return ['-Tech_Rep_1', '-Tech_Rep_2', '-Tech_Rep_3', '-Tech_Rep_4']

def filter_table(table, limit):
    if limit is not None:
        result = table[table['#ID'].isin(limit)].copy()
    else:
        result = table.copy()
    result.sort_values('#ID', inplace=True)
    return result

In [10]:
def compute_experiment_score(table, experiment, limit=None):
    rep_names = get_rep_names()
    table = filter_table(table, limit)
    tech_reps = table[[experiment + r for r in rep_names]]
    result = pandas.DataFrame({
        '#ID':table['#ID'], 
        experiment: tech_reps.mean(axis=1),
    })
    
    return result

In [11]:
def format_region(chromosome, segment):
    return '{}-{}-{}'.format(chromosome, segment.start, segment.stop)

In [12]:
def make_overlapped_region_bed(transfection, experiment, chrom, segment, names):
    average = compute_experiment_score(transfection, experiment, names)
    max_name = average.max()['#ID']
    expCount = average.shape[0]
    expScores = ','.join([str(x) for x in average[experiment].values])
    return(['\t'.join([
        chrom,
        str(segment.start),
        str(segment.stop),
        format_region(chrom, segment),
        '999',
        '.',
        max_name,
        str(expCount),
        expScores,
        '0',
        '0',
    ])])

In [13]:
def combine_exp_matrix_multi_index(index):
    return [x[1] + '_' + x[0] for x in index]

def create_exp_matrix(table, experiment, chrom, segment, limit=None):
    rep_names = get_rep_names()
    filtered_table = filter_table(table, limit)
    tech_reps = filtered_table[['#ID'] + [experiment + r for r in rep_names]]
    flat = tech_reps.set_index('#ID').unstack()
    flat.name = format_region(chrom, segment)
    return flat

def save_exp_matrix(filename, exp_matrix):
    result = exp_matrix.dropna()
    result.index = combine_exp_matrix_multi_index(result.index)
    result = result.to_frame().T
    result.to_csv(filename, sep='\t', index_label='Name')
    
def save_categories(filename, exp_matrix):
    columns = combine_exp_matrix_multi_index(exp_matrix.index)
    
    with open(filename, 'wt') as outstream:
        for column_name, (tech_rep, construct) in zip(columns, exp_matrix.index):
            outstream.write('{}\t{}\n'.format(column_name, construct))    

In [14]:
def build_track(table, experiment, chromosome, segment, names, TMPDIR='/tmp'):
    GENOME='mm10/'
    URLBASE = 'https://woldlab.caltech.edu/~diane/encode3-transfection-overlap/' + GENOME
    
    matrix_ext = '.Matrix.txt'
    sample_ext = '.Sample.txt'

    basename = '-'.join([experiment, chromosome, str(segment.start), str(segment.stop)])
    basename = basename.replace('1/2', 'half')
    
    with open(os.path.join(TMPDIR, basename + '.bed'), 'wt') as outstream:
        for row in make_overlapped_region_bed(table, experiment, chromosome, segment, names):
            outstream.write(row)
            outstream.write(os.linesep)
            
    exp_matrix = create_exp_matrix(table, experiment, chromosome, segment, names)
    save_exp_matrix(os.path.join(TMPDIR, basename + matrix_ext), exp_matrix)
    save_categories(os.path.join(TMPDIR, basename + sample_ext), exp_matrix)

    template = """
track {experiment}_{name}
type bigBarChart
visibility full
shortLabel {short_label}
longLabel {long_label}
barChartBars {categories}
barChartMetric median
barChartLabel Constructs
barChartUnit activity
bigDataUrl {data_url}
barChartMatrixUrl {matrix_url}
barChartSampleUrl {sample_url}
parent transfection_overlaps
"""        
#subGroups region={short_label} experiment={experiment}
    
    trackdb = template.format(
        experiment=experiment.replace('1/2', 'half'),
        name=sorted(names)[0],
        short_label=format_region(chromosome, segment),
        long_label=basename,
        categories=' '.join(names),
        data_url=urljoin(URLBASE, basename + '.bb'),
        matrix_url=urljoin(URLBASE, basename + matrix_ext),
        sample_url=urljoin(URLBASE, basename + sample_ext)
    )

    return trackdb

In [15]:
def make_hub(table, regions):
    TMPDIR = '/tmp/transfection/'
    experiments = [
        'C2C12_Myoblast_Rep1',
        'C2C12_Myoblast_Rep2',
        'C2C12_Myocyte_Rep1',
        'C2C12_Myocyte_Rep2',
        '10T1/2_Rep1',
        '10T1/2_Rep2',
        '10T1/2_MockDiff_Rep1',
        '10T1/2_MockDiff_Rep2',
    ]

    composite = """
track transfection_overlaps
compositeTrack on
type bigBarChart
shortLabel transfection overlaps
longLabel Transfection overlapping regions
subGroup1 regions {regions}
subGroup2 experiment {experiments} 
dimensions dimX=regions dimY=experiment
sortOrder regions=+ experiment=+
"""        

    supertrack = """
track transfection_overlaps
superTrack on show
shortLabel transfection overlaps
longLabel Transfection overlapping regions
"""     
    block = [supertrack]
    overlapping_regions = []
    for chrom in regions:
        for s in regions[chrom]:
            if len(regions[chrom][s]) > 1:
                overlapping_regions.append(format_region(chrom, s))
                for experiment in experiments:
                    score = compute_experiment_score(transfection, experiment, regions[chrom][s]).dropna()
                    names = list(score['#ID'])
                    if len(names) > 0:
                        block.append(build_track(table, experiment, chrom, s, names, TMPDIR))
    #block.insert(0, composite.format(
    #    regions=' '.join(overlapping_regions), 
    #    experiments=' '.join(experiments)))
    with open(os.path.join(TMPDIR, 'trackDb.txt'), 'wt') as outstream:
        for b in block:
            outstream.write(b)
make_hub(transfection, regions)

In [18]:
overlapping = set()
for chrom in regions:
    for s in regions[chrom]:
        if len(regions[chrom][s]) > 1:
            print(chrom, s.start, s.stop, regions[chrom][s])
            overlapping.update(regions[chrom][s])
print(len(overlapping))


chr1 91413902 91414660 {'WOLD_LR_014', 'NML_12'}
chr1 92787054 92787878 {'WOLD_LR_013', 'WOLD_LR_086'}
chr1 134071266 134073689 {'GS_Myog_09', 'GS_Myog_07'}
chr1 134082949 134083616 {'GS_Myog_04', 'GS_Myog_06'}
chr1 134091601 134093367 {'GS_Myog_01', 'GS_Myog_02'}
chr1 134195092 134196873 {'BJW1004', 'BJW1003', 'BJW1007', 'BJW1006', 'BJW1005'}
chr1 134201956 134203654 {'BJW1012', 'BJW1011'}
chr1 134244220 134245912 {'BJW1000B', 'BJW1000A'}
chr1 134275872 134277378 {'UP_15', 'GS_Myog_40'}
chr1 134282949 134286451 {'BJW1016', 'GS_Myog_38'}
chr1 134288925 134290101 {'GS_Myog_31', 'GS_Myog_30'}
chr1 134320414 134321433 {'WOLD_LR_103', 'UP_20'}
chr7 46300939 46301614 {'WOLD_LR_098', 'WOLD_LR_094'}
chr7 46314527 46317458 {'WOLD2014JAN_047', 'WOLD_LR_095'}
chr7 46352697 46354168 {'WOLD_LR_089', 'FT_7'}
chr6 108668245 108668981 {'WOLD_LR_027', 'WOLD_LR_018'}
chr8 34679747 34680516 {'DWR_7', 'DWR_2'}
chr12 25097312 25099472 {'WOLD2014JAN_073', 'WOLD2014JAN_048'}
39


In [17]:
compute_experiment_score(transfection, 'C2C12_Myoblast_Rep2', ['WOLD_LR_027', 'WOLD_LR_018']).dropna()

Unnamed: 0,#ID,C2C12_Myoblast_Rep2
