# Computational Genomics - Project 2
# Our solution - detection of TADs
## Authors: Kacper Grzymkowski, Mikołaj Malec, Piotr Marciniak

In [1]:
import hicstraw
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np

In [2]:
# Names of the chromosomes that get analysed: the autosomes "1".."22" plus "X".
# Chromosomes of the .hic file whose name is not in this list are skipped.
stuff = [str(number) for number in range(1, 23)] + ["X"]

In [3]:
# Settings used by the runs that follow, until a later cell reassigns them.
# Threshold, in bins, on the gradient of the sorted bin coordinates; rows above it
# are treated as jumps that split a community into separate runs.
MAX_JUMP_BINS = 50
# A run must contain more than this many rows to be kept as a TAD.
TAD_SIZE_MIN_INTERACTIONS = 50
# Resolution in base pairs: requested from the .hic file by process_one_at_time and
# used by the trim functions to convert coordinate differences into bins.
BIN_SIZE = 5000

In [4]:
def cd_tad(data, threshold=0, bin_size=None):
    """Run Louvain community detection on every selected chromosome of a Hi-C file.

    For each chromosome whose name is listed in ``stuff``, the intra-chromosomal
    "observed" records (normalisation "NONE", unit "BP") are loaded and turned
    into a weighted undirected graph: ``binX``/``binY`` are the nodes and
    ``counts`` is the edge weight.  The graph is then partitioned with
    ``nx.community.louvain_communities``.

    Parameters
    ----------
    data : hicstraw.HiCFile
        Opened Hi-C file (anything offering ``getChromosomes`` and
        ``getMatrixZoomData`` the same way works).
    threshold : number, default 0
        Only records with ``counts > threshold`` become edges.
    bin_size : int, optional
        Resolution requested from the file.  When omitted, the module-level
        ``BIN_SIZE`` is used so that the extraction resolution always matches
        the one the trim functions divide by (previously a literal 5000).

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns ``index`` (row position inside the
        chromosome's own frame, produced by ``reset_index()``), ``chr``,
        ``bin`` and ``community`` (``"<chr>_<community number>"``).

    Raises
    ------
    ValueError
        If none of the chromosomes in ``data`` is listed in ``stuff``.

    Notes
    -----
    ``louvain_communities`` is called without a seed, so the partition is not
    guaranteed to be identical between runs.
    """
    if bin_size is None:
        bin_size = BIN_SIZE

    # Fixed column list so that a chromosome without any edge still yields a
    # frame with the expected schema.
    columns = ["chr", "bin", "community"]
    frames = []
    for chrom in data.getChromosomes():
        if chrom.name not in stuff:
            continue
        print(f"Processing chr{chrom.name}")
        mzd = data.getMatrixZoomData(chrom.name, chrom.name, "observed", "NONE", "BP", bin_size)
        records = mzd.getRecords(0, chrom.length, 0, chrom.length)
        graph = nx.Graph()
        graph.add_weighted_edges_from(
            [(r.binX, r.binY, r.counts) for r in records if r.counts > threshold]
        )
        communities = nx.community.louvain_communities(graph)
        rows = [
            {"chr": chrom.name, "bin": bin_start, "community": f"{chrom.name}_{comm_i}"}
            for comm_i, members in enumerate(communities)
            for bin_start in members
        ]
        frames.append(pd.DataFrame(rows, columns=columns))

    if not frames:
        # pd.concat([]) would raise a ValueError as well, but with a message
        # that does not point at the actual cause.
        raise ValueError(
            "none of the chromosomes in the Hi-C file matches the names in `stuff`"
        )
    return pd.concat(frames).reset_index()

In [5]:
def trim_communities_to_tads_window(df, *, bin_size=None, max_jump_bins=None, min_size=None):
    """Label every row of one community with the id of the TAD it belongs to.

    The rows are sorted by ``bin`` and split into runs wherever the gradient
    of the bin coordinate (expressed in bins) exceeds ``max_jump_bins``.
    ``np.gradient`` uses central differences for interior rows, so the value
    at a row is the mean of the gaps on both of its sides.  Runs with more
    than ``min_size`` rows are labelled; all other rows keep ``tad == -1``.

    Compared with the earlier version:

    * the input frame is no longer modified (a ``tad`` column used to be
      written into it);
    * the run before the first jump and the run after the last jump are
      candidates too.  Previously only runs lying between two jumps were
      looked at, so a community without any jump could never yield a TAD.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single community; must contain a numeric ``bin`` column.
    bin_size, max_jump_bins, min_size : number, optional (keyword-only)
        Overrides for the module-level ``BIN_SIZE``, ``MAX_JUMP_BINS`` and
        ``TAD_SIZE_MIN_INTERACTIONS``, which are used when omitted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra integer ``tad`` column.  Communities with
        fewer than 10 rows are returned in their original order with
        ``tad == -1`` everywhere; larger ones are sorted by ``bin`` and get a
        fresh 0..n-1 index.  TAD ids are the positions of the runs in the
        list of candidate runs, so they are unique within the community but
        not necessarily consecutive.
    """
    if bin_size is None:
        bin_size = BIN_SIZE
    if max_jump_bins is None:
        max_jump_bins = MAX_JUMP_BINS
    if min_size is None:
        min_size = TAD_SIZE_MIN_INTERACTIONS

    # assign() builds a new frame, leaving the caller's (group) frame untouched.
    df = df.assign(tad=-1)
    if len(df) < 10:
        return df

    df = df.sort_values("bin").reset_index(drop=True)
    gradient = np.gradient(df["bin"].to_numpy()) / bin_size
    jumps, = np.where(gradient > max_jump_bins)
    # Add the first and last row as boundaries so that the leading and the
    # trailing run are considered as well; unique() drops a boundary that
    # coincides with a detected jump.
    bounds = np.unique(np.concatenate(([0], jumps, [len(df) - 1])))
    for seg_id, (start, end) in enumerate(zip(bounds[:-1], bounds[1:])):
        # Index is 0..n-1 after reset_index, so .loc[start:end] covers
        # end - start + 1 rows (label slicing is inclusive).
        if end - start + 1 > min_size:
            df.loc[start:end, "tad"] = seg_id
    return df

In [6]:
def trim_communities_to_tads(df, *, bin_size=None, max_jump_bins=None, min_size=None):
    """Return the start/end coordinates of the TADs found in one community.

    The ``bin`` values are sorted and split into runs wherever the gradient of
    the coordinate (expressed in bins) exceeds ``max_jump_bins``.
    ``np.gradient`` uses central differences for interior rows, so the value
    at a row is the mean of the gaps on both of its sides.  Every run with
    more than ``min_size`` rows is reported as one TAD.

    Compared with the earlier version:

    * the input frame is no longer modified (a ``tad`` column used to be
      written into it although it never reached the result);
    * the run before the first jump and the run after the last jump are
      candidates too.  Previously only runs lying between two jumps were
      looked at, so a community without any jump could never yield a TAD;
    * the result always has the columns ``start`` and ``end``, also when no
      run qualifies.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single community; must contain a numeric ``bin`` column.
    bin_size, max_jump_bins, min_size : number, optional (keyword-only)
        Overrides for the module-level ``BIN_SIZE``, ``MAX_JUMP_BINS`` and
        ``TAD_SIZE_MIN_INTERACTIONS``, which are used when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per TAD with the columns ``start`` and ``end`` (first and
        last ``bin`` value of the run).  Empty for communities with fewer
        than 10 rows or without a sufficiently long run.
    """
    if bin_size is None:
        bin_size = BIN_SIZE
    if max_jump_bins is None:
        max_jump_bins = MAX_JUMP_BINS
    if min_size is None:
        min_size = TAD_SIZE_MIN_INTERACTIONS

    bins = np.sort(df["bin"].to_numpy())
    starts, ends = [], []
    if len(bins) >= 10:
        gradient = np.gradient(bins) / bin_size
        jumps, = np.where(gradient > max_jump_bins)
        # Add the first and last position as boundaries so that the leading
        # and the trailing run are considered as well; unique() drops a
        # boundary that coincides with a detected jump.
        bounds = np.unique(np.concatenate(([0], jumps, [len(bins) - 1])))
        for start, end in zip(bounds[:-1], bounds[1:]):
            # Both boundaries belong to the run, hence the + 1.
            if end - start + 1 > min_size:
                starts.append(bins[start])
                ends.append(bins[end])

    # Typed arrays keep the schema (and dtype) stable for empty results.
    return pd.DataFrame(
        {
            "start": np.array(starts, dtype=bins.dtype),
            "end": np.array(ends, dtype=bins.dtype),
        }
    )

In [7]:
def process(name):
    """Run the whole-file TAD pipeline for ``data/<name>.hic``.

    Community detection is done for all selected chromosomes at once, then
    three CSV files are written to ``data/intermediates``: the raw community
    assignment, the per-row TAD labels and the TAD start/end table.
    """
    hic_file = hicstraw.HiCFile(f"data/{name}.hic")
    communities = cd_tad(hic_file)
    communities.to_csv(f"data/intermediates/{name}_CD.csv")

    # Per-row TAD labels, one community at a time.
    labelled = communities.groupby("community").apply(
        trim_communities_to_tads_window, include_groups=False
    )
    labelled.to_csv(f"data/intermediates/{name}_CD_window.csv")

    # Start/end coordinates of the TADs, one community at a time.
    tad_table = communities.groupby("community").apply(
        trim_communities_to_tads, include_groups=False
    )
    tad_table.to_csv(f"data/intermediates/{name}_CD_tads.csv")

In [8]:
%%time
# Whole-file pipeline on data/GM12878.hic; results go to data/intermediates/GM12878_CD*.csv.
process("GM12878")

Processing chr1
Processing chr2
Processing chr3
Processing chr4
Processing chr5
Processing chr6
Processing chr7
Processing chr8
Processing chr9
Processing chr10
Processing chr11
Processing chr12
Processing chr13
Processing chr14
Processing chr15
Processing chr16
Processing chr17
Processing chr18
Processing chr19
Processing chr20
Processing chr21
Processing chr22
Processing chrX
CPU times: user 1min 52s, sys: 333 ms, total: 1min 52s
Wall time: 1min 52s


In [9]:
%%time
# Whole-file pipeline on data/ENCFF629KXF.hic; results go to data/intermediates/ENCFF629KXF_CD*.csv.
process("ENCFF629KXF")

Processing chr1
Processing chr2
Processing chr3
Processing chr4
Processing chr5
Processing chr6
Processing chr7
Processing chr8
Processing chr9
Processing chr10
Processing chr11
Processing chr12
Processing chr13
Processing chr14
Processing chr15
Processing chr16
Processing chr17
Processing chr18
Processing chr19
Processing chr20
Processing chr21
Processing chr22
Processing chrX
CPU times: user 4min 8s, sys: 580 ms, total: 4min 9s
Wall time: 4min 9s


In [10]:
def process_one_at_time(name, threshold=0):
    """Run the TAD pipeline for ``data/<name>.hic`` one chromosome at a time.

    For every chromosome listed in ``stuff`` the records are loaded at
    ``BIN_SIZE`` resolution, Louvain communities are detected, and three CSV
    files are written to ``data/intermediates`` before the next chromosome is
    read.  Large intermediates are deleted as soon as they are no longer
    needed.

    A chromosome without any record above ``threshold`` now produces CSV
    files without data rows; previously the missing ``community`` column made
    ``groupby`` raise ``KeyError``.

    Parameters
    ----------
    name : str
        Base name of the ``.hic`` file inside ``data``.
    threshold : number, default 0
        Only records with ``counts > threshold`` become graph edges.
    """
    # Fixed column list so that an empty chromosome still has a "community"
    # column to group by.
    columns = ["chr", "bin", "community"]
    data = hicstraw.HiCFile(f"data/{name}.hic")
    for chrom in data.getChromosomes():
        if chrom.name not in stuff:
            continue
        print(f"Processing chr{chrom.name}")
        mzd = data.getMatrixZoomData(chrom.name, chrom.name, "observed", "NONE", "BP", BIN_SIZE)
        record_list = mzd.getRecords(0, chrom.length, 0, chrom.length)
        # end='\r' lets the next message overwrite this one on the same line.
        print("Loaded records", end='\r')
        graph = nx.Graph()
        graph.add_weighted_edges_from(
            [(r.binX, r.binY, r.counts) for r in record_list if r.counts > threshold]
        )
        del record_list
        print("Loaded graph", end='\r')
        communities = nx.community.louvain_communities(graph)
        del graph
        rows = [
            {"chr": chrom.name, "bin": bin_start, "community": f"{chrom.name}_{comm_i}"}
            for comm_i, members in enumerate(communities)
            for bin_start in members
        ]
        print("CD done", end='\r')
        df = pd.DataFrame(rows, columns=columns)
        del rows
        # NOTE(review): the three file names below do not follow one pattern
        # ("_chr<n>_CD", "_<n>_CD_window", "_chr<n>CD_tads").  They are kept
        # exactly as they were because other code may read these paths.
        df.to_csv(f"data/intermediates/{name}_chr{chrom.name}_CD.csv")
        df.groupby("community").apply(
            trim_communities_to_tads_window, include_groups=False
        ).to_csv(f"data/intermediates/{name}_{chrom.name}_CD_window.csv")
        df.groupby("community").apply(
            trim_communities_to_tads, include_groups=False
        ).to_csv(f"data/intermediates/{name}_chr{chrom.name}CD_tads.csv")

In [11]:
%%time
# Chromosome-by-chromosome pipeline on data/inter_30.hic; CSVs are written per chromosome.
process_one_at_time("inter_30")

Processing chr1
Processing chr2
Processing chr3
Processing chr4
Processing chr5
Processing chr6
Processing chr7
Processing chr8
Processing chr9
Processing chr10
Processing chr11
Processing chr12
Processing chr13
Processing chr14
Processing chr15
Processing chr16
Processing chr17
Processing chr18
Processing chr19
Processing chr20
Processing chr21
Processing chr22
Processing chrX
CPU times: user 31min 46s, sys: 7.83 s, total: 31min 53s
Wall time: 31min 53s


In [12]:
# Settings for the 4DNFI1UEG1HD dataset processed in the next cell.
# These assignments replace the values set earlier in the notebook.
MAX_JUMP_BINS = 10  # gradient threshold, in bins, that splits a community into runs
TAD_SIZE_MIN_INTERACTIONS = 10  # a run needs more than this many rows to count as a TAD
BIN_SIZE = 25000  # resolution in base pairs requested from the .hic file

In [13]:
%%time
# Chromosome-by-chromosome pipeline on data/4DNFI1UEG1HD.hic, using the settings assigned just above.
process_one_at_time("4DNFI1UEG1HD")

Processing chr1
Processing chr2
Processing chr3
Processing chr4
Processing chr5
Processing chr6
Processing chr7
Processing chr8
Processing chr9
Processing chr10
Processing chr11
Processing chr12
Processing chr13
Processing chr14
Processing chr15
Processing chr16
Processing chr17
Processing chr18
Processing chr19
Processing chr20
Processing chr21
Processing chr22
Processing chrX
CPU times: user 1h 16min 24s, sys: 23.8 s, total: 1h 16min 48s
Wall time: 1h 16min 48s
