# Obtain clean reference gene
Use individual-level CNV data (deletions and duplications) together with gene data to build a gene (real, row) by individual (real, column) matrix, i.e. the reference X matrix. Ideally this is done from two BED files with bedtools.

In [None]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")

## Import original genes/exons

In [2]:
# refGene columns to keep, keyed by their positional index in the file.
refgene_columns = {1: "tx_name", 2: "chrom", 4: "start", 5: "end", 12: "gene_name"}
ref_gene = pd.read_csv(
    f"{cwd}/refGene.txt.gz",
    compression="gzip",
    sep="\t",
    header=None,
    usecols=list(refgene_columns),
    names=list(refgene_columns.values()),
)

In [3]:
ref_gene.head()

Unnamed: 0,tx_name,chrom,start,end,gene_name
0,NR_148357,chr1,11868,14362,LOC102725121
1,NR_046018,chr1,11873,14409,DDX11L1
2,NM_012154,chr8,141530254,141645732,AGO2
3,NR_106918,chr1,17368,17436,MIR6859-1
4,NR_107062,chr1,17368,17436,MIR6859-2


In [4]:
ref_gene.shape

(78288, 5)

In [5]:
ref_gene[ref_gene["gene_name"].str.contains("GSTT2")]

Unnamed: 0,tx_name,chrom,start,end,gene_name
93,NR_126445,chr22,24322324,24326106,GSTT2
1279,NR_126445,chr22,24299600,24303382,GSTT2
7664,NM_001302670,chr22,24322218,24326106,GSTT2
8447,NM_001080843,chr22,24299600,24303382,GSTT2B
20162,NM_000854,chr22,24322218,24326106,GSTT2
23076,NM_001363804,chr22,24322313,24326106,GSTT2B
25695,NM_001363804,chr22,24299600,24303393,GSTT2B
39248,NM_001080843,chr22,24322324,24326106,GSTT2B
62166,NR_126442,chr22,24322218,24326106,GSTT2
62167,NR_126446,chr22,24322218,24326106,GSTT2


## Remove chromosome Y, M and unmatched

In [6]:
# Keep only records located on chr1..chr22 or chrX.
kept_chroms = [f"chr{number}" for number in range(1, 23)]
kept_chroms.append("chrX")
ref_gene = ref_gene[ref_gene["chrom"].isin(kept_chroms)]

In [7]:
ref_gene.shape

(74551, 5)

In [8]:
# Numeric chromosome code: the number after "chr" for autosomes, 23 for chrX.
# `assign` builds a new frame instead of writing a column into the filtered
# slice, which is what raises pandas' SettingWithCopyWarning.
ref_gene = ref_gene.assign(
    CHR=ref_gene["chrom"].map(
        lambda chrom: 23 if chrom.split("chr")[1] == "X" else int(chrom.split("chr")[1])
    )
)

In [9]:
ref_gene.shape

(74551, 6)

In [10]:
# Order records so that all records of one gene on one chromosome are adjacent
# and ascending by position, then keep the working columns in a fixed order.
sort_keys = ["CHR", "gene_name", "start", "end"]
kept_columns = ["CHR", "start", "end", "gene_name", "tx_name"]
ref_gene = ref_gene.sort_values(by=sort_keys).loc[:, kept_columns]

## Drop genes that share same chr, start, end and exon/gene name (44322 left)

In [11]:
# Keep the first record of every (CHR, start, end, gene_name) combination.
dedup_keys = ["CHR", "start", "end", "gene_name"]
ref_gene = ref_gene.loc[~ref_gene.duplicated(subset=dedup_keys)]

In [12]:
ref_gene.shape

(44322, 5)

In [14]:
ref_gene[ref_gene["gene_name"].str.contains("GSTT2")]

Unnamed: 0,CHR,start,end,gene_name,tx_name
1279,22,24299600,24303382,GSTT2,NR_126445
7664,22,24322218,24326106,GSTT2,NM_001302670
93,22,24322324,24326106,GSTT2,NR_126445
8447,22,24299600,24303382,GSTT2B,NM_001080843
25695,22,24299600,24303393,GSTT2B,NM_001363804
23076,22,24322313,24326106,GSTT2B,NM_001363804
39248,22,24322324,24326106,GSTT2B,NM_001080843


## Mark the genes and adjust gene names
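1. If the chromosome of a row differs from that of the previous row, mark the row as a new gene;
2. If the gene name of a row is the same as in the previous row and its start position >= the previous end position (no overlap), mark the row as a new gene;
3. If the gene name of a row differs from that of the previous row, mark the row as a new gene.

A row that meets none of these conditions overlaps the previous record of the same gene and is merged into it (minimum start, maximum end).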

In [15]:
def merge_overlapping_transcripts(genes):
    """Collapse overlapping records of the same gene into single intervals.

    Records are grouped by (CHR, gene_name). Within a group, taken in order of
    start position, a record whose start is strictly below the largest end seen
    so far overlaps the current interval and is folded into it (minimum start,
    maximum end). A record whose start is >= that end opens a new interval, so
    one gene name may yield several disjoint intervals on a chromosome.

    This is the fixed point the previous repeat-until-stable loop converged to,
    computed in a single vectorised pass.

    Parameters
    ----------
    genes : pandas.DataFrame
        Must contain the columns CHR, start, end and gene_name. Any other
        column (e.g. tx_name) is dropped.

    Returns
    -------
    pandas.DataFrame
        Columns CHR, start, end, gene_name, ordered by CHR, gene_name, start,
        with a fresh 0..n-1 index.
    """
    group_keys = ["CHR", "gene_name"]
    # Sorting here is a no-op for input that is already ordered, but makes the
    # result independent of the order in which the caller supplies the rows.
    ordered = genes.sort_values(group_keys + ["start", "end"]).reset_index(drop=True)

    # Largest end among the earlier records of the same gene; NaN for the first
    # record of each (CHR, gene_name) group.
    running_end = ordered.groupby(group_keys, sort=False)["end"].cummax()
    previous_end = running_end.groupby(
        [ordered[key] for key in group_keys], sort=False
    ).shift()

    # Touching intervals (start == previous end) are kept separate, as before.
    opens_interval = previous_end.isna() | (ordered["start"] >= previous_end)
    interval_id = opens_interval.cumsum().rename("interval_id")

    merged = ordered.groupby(interval_id, sort=True).agg(
        {"CHR": "first", "start": "min", "end": "max", "gene_name": "first"}
    )
    return merged[["CHR", "start", "end", "gene_name"]].reset_index(drop=True)


ref_gene = merge_overlapping_transcripts(ref_gene)
# After merging, every row starts its own gene interval. The all-True `match`
# column is kept so cells that read it keep working.
ref_gene["match"] = True

In [16]:
ref_gene.head()

Unnamed: 0,CHR,start,end,gene_name,match
0,1,33772366,33786699,A3GALT2,True
1,1,12776112,12788727,AADACL3,True
2,1,12704565,12727097,AADACL4,True
3,1,94458389,94586704,ABCA4,True
4,1,229652328,229694454,ABCB10,True


In [17]:
Counter(ref_gene["match"])

Counter({True: 29337})

## Rename duplicated genes

In [19]:
gene_name = ref_gene["gene_name"].tolist()

In [20]:
# Names that occur more than once get an "@k" suffix, where k is the 1-based
# order of appearance; names that occur once are left unchanged.
# Two Counter passes replace the per-element list.count() calls, which made
# this loop quadratic in the number of genes.
name_totals = Counter(gene_name)
name_seen = Counter()
gene_name_new = []
for name in gene_name:
    name_seen[name] += 1
    if name_totals[name] > 1:
        gene_name_new.append(name + "@" + str(name_seen[name]))
    else:
        gene_name_new.append(name)

In [21]:
ref_gene["gene_name"] = gene_name_new

In [22]:
# SQL run against the `ref_gene` frame: rows sharing the same (CHR, start, end)
# are collapsed into one row whose gene_name is the comma-separated
# concatenation of the gene names found at that interval.
# NOTE(review): assuming pandasql's default SQLite backend, the order of the
# names inside group_concat is not guaranteed -- confirm before relying on it.
query = '''
select CHR, start, end, group_concat(gene_name) as gene_name
from ref_gene
group by CHR, start, end
'''

In [23]:
ref_gene = sqldf(query)

In [24]:
ref_gene.shape

(28488, 4)

In [25]:
ref_gene[ref_gene["gene_name"].str.contains("OR4F16")]

Unnamed: 0,CHR,start,end,gene_name
9,1,367658,368597,"OR4F16@1,OR4F29@1,OR4F3@1"
12,1,621095,622034,"OR4F16@2,OR4F29@2,OR4F3@2"
8646,5,180794287,180795226,"OR4F16@3,OR4F29@3,OR4F3@3"


In [26]:
# Put the comma-separated gene names of every interval in sorted order.
# Joining the sorted parts directly avoids going through the list repr and
# stripping characters, which would corrupt any name that itself contains a
# quote, space or bracket. It also avoids writing by position while iterating.
ref_gene["gene_name"] = ref_gene["gene_name"].map(
    lambda names: ",".join(sorted(names.split(",")))
)

In [27]:
ref_gene.head()

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121@1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"


In [28]:
ref_gene[ref_gene["gene_name"].str.contains("OR4F16")]

Unnamed: 0,CHR,start,end,gene_name
9,1,367658,368597,"OR4F16@1,OR4F29@1,OR4F3@1"
12,1,621095,622034,"OR4F16@2,OR4F29@2,OR4F3@2"
8646,5,180794287,180795226,"OR4F16@3,OR4F29@3,OR4F3@3"


In [29]:
ref_gene[ref_gene["gene_name"].str.contains("MIR6859-1")]

Unnamed: 0,CHR,start,end,gene_name
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
20455,15,102513726,102513794,"MIR6859-1@2,MIR6859-2@2,MIR6859-3@2,MIR6859-4@2"
20459,16,67051,67119,"MIR6859-1@3,MIR6859-2@3,MIR6859-3@3,MIR6859-4@3"


## Drop genes that share same chr, start and end (43442 left)

In [26]:
# ref_gene = ref_gene.drop_duplicates(subset = ("CHR", "start", "end"))
# ref_gene.shape

(43442, 5)

In [13]:
## not correct
# start = ref_gene[["CHR", "gene_name", "start"]].groupby(["CHR", "gene_name"]).min().reset_index(drop = False)
# end = ref_gene[["CHR", "gene_name", "end"]].groupby(["CHR", "gene_name"]).max().reset_index(drop = False)

In [30]:
ref_gene[ref_gene["gene_name"].str.contains("TTLL10")]

Unnamed: 0,CHR,start,end,gene_name
35,1,1109259,1133315,TTLL10


In [31]:
ref_gene[ref_gene["gene_name"].str.contains("LINC01128")]

Unnamed: 0,CHR,start,end,gene_name
17,1,762970,794826,LINC01128


In [32]:
ref_gene[ref_gene["gene_name"].str.contains("CAMTA1")]

Unnamed: 0,CHR,start,end,gene_name
149,1,6843951,6845040,CAMTA1-DT
150,1,6845383,7829766,CAMTA1
151,1,7442547,7449814,CAMTA1-AS2


In [34]:
# Turn the numeric code 23 back into the label "X"; other codes are unchanged.
ref_gene["CHR"] = ref_gene["CHR"].map(lambda code: "X" if code == 23 else code)

In [35]:
ref_gene.tail()

Unnamed: 0,CHR,start,end,gene_name
28483,X,154842225,155012119,SPRY3
28484,X,155111007,155173433,VAMP7
28485,X,155227360,155240482,IL9R
28486,X,155244228,155246495,WASIR1
28487,X,155255322,155257848,DDX11L16


In [36]:
ref_gene[ref_gene["gene_name"].str.contains("MIR6859-1")]

Unnamed: 0,CHR,start,end,gene_name
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
20455,15,102513726,102513794,"MIR6859-1@2,MIR6859-2@2,MIR6859-3@2,MIR6859-4@2"
20459,16,67051,67119,"MIR6859-1@3,MIR6859-2@3,MIR6859-3@3,MIR6859-4@3"


In [55]:
ref_gene[ref_gene["gene_name"].str.contains("LOC102725121")]

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121@1
20457,15,102516807,102519301,LOC102725121@2


In [38]:
ref_gene[ref_gene["gene_name"].str.contains("MIR1302-10")]

Unnamed: 0,CHR,start,end,gene_name
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"
12533,9,30143,30281,"MIR1302-10@2,MIR1302-11@2,MIR1302-2@2,MIR1302-9@2"
20453,15,102500661,102500799,"MIR1302-10@3,MIR1302-11@3,MIR1302-2@3,MIR1302-9@3"
23662,19,71972,72110,"MIR1302-10@4,MIR1302-11@4,MIR1302-2@4,MIR1302-9@4"


In [39]:
# Final ordering of the cleaned table.
# NOTE(review): CHR holds both integers and the string "X" at this point;
# confirm the installed pandas version sorts such a mixed column as intended.
final_order = ["CHR", "start", "end", "gene_name"]
ref_gene = ref_gene.sort_values(by=final_order)

In [40]:
## 20200317
# Write the cleaned table as gzip-compressed TSV; the gene_name column is
# written under the header "gene".
clean_path = f"{cwd}/refGene.clean.gz"
ref_gene.to_csv(
    clean_path,
    sep="\t",
    index=False,
    header=["CHR", "start", "end", "gene"],
    compression="gzip",
)

## Check for validity

In [41]:
ref_gene.drop_duplicates(subset = ("CHR", "start", "end")).shape

(28488, 4)

In [42]:
# Rows of the freshly built table that are not on chromosome X.
not_on_x = ref_gene["CHR"] != "X"
tmp1 = ref_gene[not_on_x]

In [43]:
tmp1.head()

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121@1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"


In [44]:
# Load a previously saved table for comparison; its header row is replaced by
# the column names used in this notebook.
comparison_path = f"{cwd}/refGene.clean.20200317_3.gz"
tmp2 = pd.read_csv(
    comparison_path,
    sep="\t",
    header=0,
    names=["CHR", "start", "end", "gene_name"],
)

In [45]:
tmp2.head()

Unnamed: 0,CHR,start,end,gene_name
0,1,11868,14362,LOC102725121@1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"


In [46]:
# Drop the chromosome X rows from the comparison table.
tmp2 = tmp2.loc[tmp2["CHR"] != "X"]

In [47]:
# Cast the chromosome column to int. `assign` returns a new frame instead of
# writing into the filtered slice, which avoids pandas' SettingWithCopyWarning.
tmp2 = tmp2.assign(CHR=tmp2["CHR"].astype(int))

In [48]:
# Outer-join the two tables on the interval coordinates and keep the intervals
# that exist in tmp1 but have no counterpart in tmp2.
interval_join = tmp1.merge(tmp2, how="outer", on=["CHR", "start", "end"], indicator=True)
df1 = interval_join[interval_join["_merge"] == "left_only"]

In [49]:
df1.head(60)

Unnamed: 0,CHR,start,end,gene_name_x,gene_name_y,_merge
1041,1.0,65886334,66107242,LEPR,,left_only
1246,1.0,97543299,98386615,DPYD,,left_only
1509,1.0,144146811,148346791,NBPF19,,left_only
1564,1.0,146644325,146767447,CHD1L,,left_only
1914,1.0,157090982,157108383,ETV3,,left_only
1955,1.0,159367254,159410757,OR10J1,,left_only
3496,2.0,85829983,85876407,USP39,,left_only
3599,2.0,98081675,98094824,LOC100506076@2,,left_only
3615,2.0,99613723,99771429,TSGA10,,left_only
3783,2.0,114726898,114785659,LINC01191,,left_only


In [50]:
tmp1[tmp1["gene_name"].str.contains("AREG")]

Unnamed: 0,CHR,start,end,gene_name
6716,4,75310852,75320721,AREG@1
6717,4,75480628,75490480,AREG@2


In [51]:
tmp2[tmp2["gene_name"].str.contains("AREG")]

Unnamed: 0,CHR,start,end,gene_name
6843,4,75310852,75490480,AREG@1


In [52]:
tmp1.iloc[3598:3602]

Unnamed: 0,CHR,start,end,gene_name
3598,2,97949671,97957533,LOC100506123@1
3599,2,98081675,98094824,LOC100506076@2
3600,2,98081774,98089620,LOC100506123@2
3601,2,98121260,98206428,ANKRD36B


In [53]:
tmp1[tmp1["end"] == 25223870]

Unnamed: 0,CHR,start,end,gene_name
19495,15,25068754,25223870,SNRPN
19497,15,25200039,25223870,SNURF


In [54]:
ref_gene[ref_gene["gene_name"].str.contains("SNORD103A@")]

Unnamed: 0,CHR,start,end,gene_name
587,1,31408535,31408623,"SNORD103A@1,SNORD103B@1"
588,1,31421964,31422052,"SNORD103A@2,SNORD103B@2"
