# Obtain clean reference gene
Use CNV individual level data (del and dup) and gene data to obtain individual (real, column) by gene (real, row) matrix (ideally two bed files, bedtools) - reference X matrix

In [1]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")

## Import genes

In [2]:
ref_gene = pd.read_table(f"{cwd}/refGene.txt.gz", compression="gzip", sep = "\t", header = None, usecols = (1,2,4,5,12), names = ["tx_name", "chrom", "start", "end", "gene_name"])

In [3]:
ref_gene.shape

(74873, 5)

In [4]:
ref_gene = ref_gene[ref_gene["chrom"].isin(["chr"+str(x+1) for x in range(22)] + ["chrX"])]

In [5]:
ref_gene.shape

(71184, 5)

In [6]:
ref_gene["CHR"] = ref_gene["chrom"].apply(lambda x: int(x.split("chr")[1]) if x.split("chr")[1] != "X" else 23)

In [7]:
ref_gene = ref_gene.drop_duplicates(subset = ("CHR", "start", "end"))

In [8]:
ref_gene = ref_gene.sort_values(by = ["CHR", "start", "end"])[["CHR", "start", "end", "gene_name"]]

In [9]:
ref_gene.shape

(45789, 4)

In [10]:
ref_gene.head()

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121
1,1,11873,14409,DDX11L1
42692,1,14361,29370,WASH7P
3,1,17368,17436,MIR6859-1
7,1,30365,30503,MIR1302-2


In [219]:
## not correct
# start = ref_gene[["CHR", "gene_name", "start"]].groupby(["CHR", "gene_name"]).min().reset_index(drop = False)
# end = ref_gene[["CHR", "gene_name", "end"]].groupby(["CHR", "gene_name"]).max().reset_index(drop = False)

In [65]:
# ref_gene[ref_gene["gene_name"] == "OR4F16"]

In [11]:
ref_gene[ref_gene["gene_name"] == "TTLL10"]

Unnamed: 0,CHR,start,end,gene_name
21769,1,1109285,1133313,TTLL10
55,1,1115076,1121243,TTLL10


In [12]:
ref_gene[ref_gene["gene_name"] == "LINC01128"]

Unnamed: 0,CHR,start,end,gene_name
25,1,762970,778984,LINC01128
26,1,762970,794826,LINC01128
30,1,763177,794826,LINC01128


In [32]:
ref_gene[ref_gene["gene_name"] == "CAMTA1"]

Unnamed: 0,CHR,start,end,gene_name
291,1,6845383,6932107,CAMTA1
294,1,6845383,6948261,CAMTA1
298,1,6845383,7829766,CAMTA1
7246,1,6845513,7829766,CAMTA1
305,1,7729789,7829766,CAMTA1
309,1,7729803,7829766,CAMTA1
310,1,7729824,7829766,CAMTA1
311,1,7740147,7829766,CAMTA1
312,1,7764864,7829766,CAMTA1
314,1,7764897,7829766,CAMTA1


In [49]:
ref_gene["match"] = (ref_gene.CHR != ref_gene.CHR.shift()) | ((ref_gene.start >= ref_gene.end.shift()) & (ref_gene.gene_name == ref_gene.gene_name.shift())) | \
                    ((ref_gene.start >= ref_gene.end.shift()) & (ref_gene.gene_name != ref_gene.gene_name.shift()))

In [50]:
num = 1
gene = list()
for i in range(23):
    tmp = ref_gene[ref_gene["CHR"] == i+1]
    for index, row in tmp.iterrows():
        if row["match"] == True:
            gene.append("gene" + str(num))
            num += 1
        else:
            gene.append("gene" + str(num-1))
ref_gene["gene"] = gene

In [81]:
ref_gene_new = pd.DataFrame(columns = ["CHR", "start", "end", "gene"])
gene_list = list(unique_everseen(gene))

In [82]:
for gn in gene_list:
    tmp = ref_gene[ref_gene["gene"] == gn]
    if tmp.shape[0] == 1:
        ref_gene_new = ref_gene_new.append(tmp[["CHR", "start", "end", "gene"]])
    else:
        ref_gene_new = ref_gene_new.append({"CHR": tmp["CHR"].tolist()[0], "start": min(tmp["start"]), "end": max(tmp["end"]), "gene": gn}, ignore_index=True)

In [83]:
ref_gene_new.head()

Unnamed: 0,CHR,start,end,gene
0,1,11868,29370,gene1
1,1,30365,30503,gene2
2,1,34610,36081,gene3
3,1,69090,70008,gene4
4,1,134772,140566,gene5


In [84]:
ref_gene_new.shape

(23343, 4)

In [86]:
ref_gene_new.to_csv(f"{cwd}/refGene.clean.gz", compression = "gzip", sep = "\t", index = False, header = True)