# Obtain clean reference gene
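Use individual-level CNV data (deletions and duplications) together with gene data to build the reference X matrix: real genes (rows) by real individuals (columns). Ideally this is done from two BED files with bedtools. This notebook prepares the cleaned gene reference for that step.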

In [1]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
# Directory that holds the input and output data files; "~" is expanded to the user's home.
# NOTE(review): despite its name this is a data directory, not the process working directory.
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")

## Import genes

In [2]:
# Load the gzipped, tab-separated refGene table (no header row).  Only the
# columns at positions 1, 2, 4, 5 and 12 are read, under the names given below.
ref_gene = pd.read_csv(
    f"{cwd}/refGene.txt.gz",
    sep="\t",
    compression="gzip",
    header=None,
    usecols=[1, 2, 4, 5, 12],
    names=["tx_name", "chrom", "start", "end", "gene_name"],
)

In [3]:
# Show every raw row whose gene name contains "OR4F16".
ref_gene.loc[ref_gene["gene_name"].str.contains("OR4F16")]

Unnamed: 0,tx_name,chrom,start,end,gene_name
6597,NM_001005277,chr1,367658,368597,OR4F16
38096,NM_001005277,chr5,180794287,180795226,OR4F16
39136,NM_001005277,chr1,621095,622034,OR4F16


In [4]:
# Preview the first rows of the table as loaded.
ref_gene.head()

Unnamed: 0,tx_name,chrom,start,end,gene_name
0,NR_148357,chr1,11868,14362,LOC102725121
1,NR_046018,chr1,11873,14409,DDX11L1
2,NM_012154,chr8,141530254,141645732,AGO2
3,NR_106918,chr1,17368,17436,MIR6859-1
4,NR_107062,chr1,17368,17436,MIR6859-2


In [5]:
# (rows, columns) of the table before any filtering.
ref_gene.shape

(78288, 5)

In [6]:
# Keep only chr1..chr22 and chrX; rows with any other chrom label are dropped.
kept_chroms = [f"chr{n}" for n in range(1, 23)]
kept_chroms.append("chrX")
ref_gene = ref_gene.loc[ref_gene["chrom"].isin(kept_chroms)]

In [7]:
# (rows, columns) after restricting to chr1-22 and chrX.
ref_gene.shape

(74551, 5)

In [8]:
# Add a numeric chromosome code: "chr1".."chr22" -> 1..22 and "chrX" -> 23.
# assign() returns a new frame, so no column is written onto the filtered
# slice produced by the previous cell (which raises SettingWithCopyWarning).
# The vectorised string ops replace a per-row lambda that split each label twice.
# astype(int) still fails loudly on any label other than a number or "X".
chrom_suffix = ref_gene["chrom"].str.slice(start=len("chr"))
ref_gene = ref_gene.assign(CHR=chrom_suffix.replace({"X": "23"}).astype(int))

In [9]:
# (rows, columns) after adding the CHR column; the column count should be one higher.
ref_gene.shape

(74551, 6)

In [10]:
# Order rows by chromosome code, then start, then end, and keep only the
# columns used from here on (the original "chrom" label column is dropped).
sort_keys = ["CHR", "start", "end"]
kept_columns = ["CHR", "start", "end", "gene_name", "tx_name"]
ref_gene = ref_gene.sort_values(by=sort_keys).loc[:, kept_columns]

## Drop rows that share the same chr, start, end and exon/gene name (44322 rows left)

In [11]:
# Remove rows that repeat the same (CHR, start, end, gene_name); the first
# occurrence in the current row order is the one that stays.
ref_gene = ref_gene.drop_duplicates(
    subset=["CHR", "start", "end", "gene_name"],
    keep="first",
)

In [12]:
# (rows, columns) after dropping rows duplicated on (CHR, start, end, gene_name).
ref_gene.shape

(44322, 5)

In [22]:
# Show the rows whose gene name contains "OR4F", to see which names share coordinates.
ref_gene.loc[ref_gene["gene_name"].str.contains("OR4F")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
70411,1,69090,70008,OR4F5,NM_001005484
6597,1,367658,368597,OR4F16,NM_001005277
23110,1,367658,368597,OR4F29,NM_001005221
65910,1,367658,368597,OR4F3,NM_001005224
23111,1,621095,622034,OR4F29,NM_001005221
39136,1,621095,622034,OR4F16,NM_001005277
53605,1,621095,622034,OR4F3,NM_001005224
8216,5,180794287,180795226,OR4F3,NM_001005224
23112,5,180794287,180795226,OR4F29,NM_001005221
38096,5,180794287,180795226,OR4F16,NM_001005277


In [23]:
# Rows whose start coordinate is 180794287, to see which gene names share that locus.
ref_gene.loc[ref_gene["start"].eq(180794287)]

Unnamed: 0,CHR,start,end,gene_name,tx_name
8216,5,180794287,180795226,OR4F3,NM_001005224
23112,5,180794287,180795226,OR4F29,NM_001005221
38096,5,180794287,180795226,OR4F16,NM_001005277


In [24]:
# Rows whose start coordinate is 621095, to see which gene names share that locus.
ref_gene.loc[ref_gene["start"].eq(621095)]

Unnamed: 0,CHR,start,end,gene_name,tx_name
23111,1,621095,622034,OR4F29,NM_001005221
39136,1,621095,622034,OR4F16,NM_001005277
53605,1,621095,622034,OR4F3,NM_001005224


## Drop rows that share the same chr, start and end, keeping the first gene name listed (43442 rows left)

In [25]:
# Remove rows that repeat the same (CHR, start, end) regardless of gene name;
# only the first row in the current order survives for each coordinate triple.
ref_gene = ref_gene.drop_duplicates(
    subset=["CHR", "start", "end"],
    keep="first",
)

In [26]:
# (rows, columns) after dropping rows duplicated on (CHR, start, end).
ref_gene.shape

(43442, 5)

In [27]:
# Re-check the "OR4F16" rows after the coordinate-level de-duplication.
ref_gene.loc[ref_gene["gene_name"].str.contains("OR4F16")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
6597,1,367658,368597,OR4F16,NM_001005277


In [28]:
# Re-check which row remains at start 180794287 after the coordinate-level de-duplication.
ref_gene.loc[ref_gene["start"].eq(180794287)]

Unnamed: 0,CHR,start,end,gene_name,tx_name
8216,5,180794287,180795226,OR4F3,NM_001005224


In [13]:
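## not correct: min(start)/max(end) per (CHR, gene_name) would presumably span every locus that shares a name on one chromosome (e.g. OR4F16 at chr1:367658 and chr1:621095)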
# start = ref_gene[["CHR", "gene_name", "start"]].groupby(["CHR", "gene_name"]).min().reset_index(drop = False)
# end = ref_gene[["CHR", "gene_name", "end"]].groupby(["CHR", "gene_name"]).max().reset_index(drop = False)

In [30]:
# Example gene with several transcripts: list the TTLL10 rows.
ref_gene.loc[ref_gene["gene_name"].eq("TTLL10")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
10588,1,1109259,1133315,TTLL10,NM_001130045
62676,1,1115060,1121243,TTLL10,NM_153254


In [31]:
# Example gene with several transcripts: list the LINC01128 rows.
ref_gene.loc[ref_gene["gene_name"].eq("LINC01128")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
24,1,762970,778984,LINC01128,NR_047526
25,1,762970,794826,LINC01128,NR_047519
29,1,763177,794826,LINC01128,NR_047525


In [32]:
# Example gene with several transcripts: list the CAMTA1 rows.
ref_gene.loc[ref_gene["gene_name"].eq("CAMTA1")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
24151,1,6845383,6948261,CAMTA1,NR_146202
27295,1,6845513,6932074,CAMTA1,NR_146204
9741,1,6845513,6948261,CAMTA1,NR_146203
13648,1,6845513,7829766,CAMTA1,NM_001349610
57536,1,7729789,7829766,CAMTA1,NM_001349623
7467,1,7729803,7829766,CAMTA1,NM_001349625
63410,1,7729824,7829766,CAMTA1,NM_001349617
63406,1,7740147,7829766,CAMTA1,NM_001349613
15217,1,7764795,7829766,CAMTA1,NM_001349619


In [33]:
# Earlier attempt, kept for reference (a new group opens on a chromosome change
# or when the row starts at or after the previous row's end):
# ref_gene["match"] = (ref_gene.CHR != ref_gene.CHR.shift()) | ((ref_gene.start >= ref_gene.end.shift()) & (ref_gene.gene_name == ref_gene.gene_name.shift())) | \
#                     ((ref_gene.start >= ref_gene.end.shift()) & (ref_gene.gene_name != ref_gene.gene_name.shift()))

# Flag the rows that open a new gene group: the chromosome differs from the row
# above, or the gene name differs from the row above while this row's end is
# >= the previous row's start.  The very first row compares against NaN and is
# therefore flagged through the chromosome test.
# NOTE(review): rows are sorted by (CHR, start, end) here, so the
# "end >= previous start" test looks always true within a chromosome - confirm
# whether a comparison against the previous *end* was intended.
chrom_changed = ref_gene["CHR"].ne(ref_gene["CHR"].shift())
name_changed = ref_gene["gene_name"].ne(ref_gene["gene_name"].shift())
reaches_prev_start = ref_gene["end"].ge(ref_gene["start"].shift())
ref_gene["match"] = chrom_changed | (reaches_prev_start & name_changed)

In [34]:
# Preview the table with the new boolean "match" column.
ref_gene.head()

Unnamed: 0,CHR,start,end,gene_name,tx_name,match
0,1,11868,14362,LOC102725121,NR_148357,True
1,1,11873,14409,DDX11L1,NR_046018,True
41329,1,14361,29370,WASH7P,NR_024540,True
3,1,17368,17436,MIR6859-1,NR_106918,True
7,1,30365,30503,MIR1302-2,NR_036051,True


In [35]:
# Tally the True/False values of the "match" column.
Counter(ref_gene["match"])

Counter({True: 28957, False: 14485})

In [36]:
# Give every row a group id ("gene1", "gene2", ...): the running count of
# "match" flags, so a flagged row opens a new id and an unflagged row reuses
# the id of the row above it.
# The cumulative sum follows the frame's own row order.  It replaces a
# per-chromosome iterrows() loop whose positional list only lined up with the
# rows because the frame happened to be sorted by CHR, and which was far slower.
group_number = ref_gene["match"].cumsum()
gene = ("gene" + group_number.astype(str)).tolist()  # plain list; reused by a later cell
ref_gene["gene"] = gene

In [38]:
# Start from an empty output table with the four columns of the merged result.
output_columns = ["CHR", "start", "end", "gene_name"]
ref_gene_new = pd.DataFrame(columns=output_columns)
# Distinct group ids in order of first appearance.
gene_list = list(dict.fromkeys(gene))

In [39]:
# Collapse each group id to one record: its chromosome, the smallest start and
# the largest end over its rows, and the gene name of its first row.
# A single groupby pass replaces the row-by-row DataFrame.append loop, which
# re-scanned ref_gene for every id, grew the frame quadratically, left a mixed
# index behind, and no longer exists in pandas >= 2.0.
# sort=False keeps the ids in order of first appearance.
ref_gene_new = (
    ref_gene.groupby("gene", sort=False)
    .agg({"CHR": "first", "start": "min", "end": "max", "gene_name": "first"})
    .reset_index(drop=True)[["CHR", "start", "end", "gene_name"]]
)

In [40]:
# (rows, columns) of the merged table.
ref_gene_new.shape

(28957, 4)

In [41]:
# Gene names of the merged table as a plain Python list, in row order.
gene_name = list(ref_gene_new["gene_name"])

In [42]:
# Make the gene names unique: a name that occurs more than once gets a ":<k>"
# suffix, k being its 1-based occurrence number in row order; a name that
# occurs once is left unchanged.
# Two Counters replace the list.count() calls made for every element, which
# made the loop quadratic in the number of names.
name_totals = Counter(gene_name)
occurrence = Counter()
gene_name_new = []
for name in gene_name:
    occurrence[name] += 1
    if name_totals[name] > 1:
        gene_name_new.append(name + ":" + str(occurrence[name]))
    else:
        gene_name_new.append(name)

In [43]:
# Swap in the de-duplicated names (the list is matched to the rows by position).
ref_gene_new = ref_gene_new.assign(gene_name=gene_name_new)

In [44]:
# Preview the merged table after the gene names were made unique.
ref_gene_new.head()

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121:1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,MIR6859-1:1
4,1,30365,30503,MIR1302-2:1


In [46]:
# Spot-check the suffixing on a repeated name: rows whose name contains "MIR6859-1".
ref_gene_new.loc[ref_gene_new["gene_name"].str.contains("MIR6859-1")]

Unnamed: 0,CHR,start,end,gene_name
3,1,17368,17436,MIR6859-1:1
20802,15,102513726,102513794,MIR6859-1:2
20806,16,67051,67119,MIR6859-1:3


In [50]:
## 20200316
# Write the merged table as a gzipped, tab-separated file without the index.
# The four columns are written under the header names CHR, start, end, gene.
clean_path = f"{cwd}/refGene.clean.gz"
ref_gene_new.to_csv(
    clean_path,
    sep="\t",
    index=False,
    header=["CHR", "start", "end", "gene"],
    compression="gzip",
)