# Obtain X matrix
row: individual

column: gene

## Import phenotype

In [1]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
# Base directory of the input/output data files; "~" is expanded to the user's home directory.
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")

In [2]:
# Load the whitespace-delimited phenotype table (first line is the header),
# keeping only columns 0, 2, 3 and 4.
# The separator regex is written as a raw string: "\s" inside a normal string
# literal is an invalid escape sequence and raises a warning on recent Pythons.
pheno1 = pd.read_table(f"{cwd}/swcnv/swcnv.pheno", header=0, sep=r"\s+", usecols=[0, 2, 3, 4])

In [3]:
# Dimensions (rows, columns) of the phenotype table.
pheno1.shape

(12406, 4)

In [4]:
# Tally the values found in each of the three phenotype columns.
tuple(Counter(pheno1[col]) for col in ("BPSCZ", "SCZ", "BP"))

(Counter({1: 6256, 2: 6150}),
 Counter({1: 6256, 2: 4978, -9: 1172}),
 Counter({1: 6256, -9: 4978, 2: 1172}))

In [5]:
# Load the whitespace-delimited per-individual CNV summary (first line is the
# header), keeping only columns 0 and 2.
# The separator regex is written as a raw string: "\s" inside a normal string
# literal is an invalid escape sequence and raises a warning on recent Pythons.
pheno2 = pd.read_table(f"{cwd}/swcnv/swcnv.qc6.cnv.indiv", header=0, sep=r"\s+", usecols=[0, 2])

In [6]:
# Dimensions (rows, columns) of the per-individual table.
pheno2.shape

(10636, 2)

In [7]:
# Keep only the part of each FID that precedes the first underscore.
pheno2["FID"] = [fid.partition("_")[0] for fid in pheno2["FID"]]

In [8]:
# Outer-join the two phenotype tables on FID so that individuals present in
# either table are kept.
pheno = pheno1.merge(pheno2, how="outer", on="FID")

In [9]:
# PHENO takes the SCZ value when it is present and falls back to PHE when SCZ
# is missing (a missing value stringifies to "nan").
pheno["PHENO"] = [
    int(phe) if str(scz) == "nan" else int(scz)
    for scz, phe in zip(pheno["SCZ"], pheno["PHE"])
]

In [10]:
# Preview the first rows of the merged phenotype table.
pheno.head()

Unnamed: 0,FID,BPSCZ,SCZ,BP,PHE,PHENO
0,PT-1RTW,1.0,1.0,1.0,1.0,1
1,PT-1RTX,1.0,1.0,1.0,1.0,1
2,PT-1RTY,1.0,1.0,1.0,1.0,1
3,PT-1RTZ,2.0,2.0,-9.0,2.0,2
4,PT-1RU1,1.0,1.0,1.0,1.0,1


In [11]:
# Count how many individuals carry each PHENO value.
Counter(pheno["PHENO"])

Counter({1: 6355, 2: 5097, -9: 1172})

## Import individual CNV
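TYPE: 1 denotes a deletion and 3 denotes a duplication. Other TYPE values (e.g. 4) also occur in the data; only types 1 and 3 are used below.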

In [12]:
# Load the whitespace-delimited CNV call table (first line is the header),
# keeping only columns 0, 2, 3, 4 and 5.
# The separator regex is written as a raw string: "\s" inside a normal string
# literal is an invalid escape sequence and raises a warning on recent Pythons.
cnv = pd.read_table(f"{cwd}/swcnv/swcnv.qc6.cnv", header=0, sep=r"\s+", usecols=[0, 2, 3, 4, 5])

In [13]:
# Dimensions (rows, columns) of the CNV call table.
cnv.shape

(9723, 5)

In [14]:
# Keep only the part of each FID that precedes the first underscore.
cnv["FID"] = [fid.partition("_")[0] for fid in cnv["FID"]]

In [15]:
# Attach each individual's PHENO to its CNV calls; the left join keeps every
# CNV row.
cnv = cnv.merge(pheno[["FID", "PHENO"]], how="left", on="FID")

In [16]:
# Preview the first rows of the CNV table with PHENO attached.
cnv.head()

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
0,PT-8K8C,8,30371917,30849740,3,1
1,PT-8UHJ,7,143218408,143532998,3,1
2,PT-8UHU,2,117775371,117925547,3,1
3,PT-8UHU,2,230714025,230902238,3,1
4,PT-8UHU,15,24578499,24778891,4,1


In [17]:
# Count how many CNV rows carry each PHENO value.
Counter(cnv["PHENO"])

Counter({1: 5200, 2: 4523})

In [18]:
# Dimensions after dropping rows that repeat the same FID, CHR, BP2, TYPE and PHENO.
# NOTE(review): BP1 is not part of the subset — confirm that is intended.
cnv.drop_duplicates(subset = ["FID", "CHR", "BP2", "TYPE", "PHENO"]).shape

(9723, 6)

In [19]:
# Split the CNV calls by TYPE: 1 goes to the deletion table, 3 to the
# duplication table.
# NOTE(review): rows with any other TYPE value end up in neither table —
# confirm this is intended.
dele = cnv.loc[cnv["TYPE"].eq(1)]
dup = cnv.loc[cnv["TYPE"].eq(3)]

In [20]:
# Order both tables by chromosome, then start position, then end position.
dele, dup = (frame.sort_values(by=["CHR", "BP1", "BP2"]) for frame in (dele, dup))

In [21]:
# Preview the first rows of the sorted duplication table.
dup.head()

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
6229,PT-ERQ6,1,768448,894573,3,2
746,PT-8UXN,1,824136,1017216,3,2
336,PT-8TWI,1,1804302,2047584,3,1
5258,PT-CDFW,1,1804302,2047584,3,2
3231,PT-9ZDV,1,2251160,2982621,3,1


In [21]:
# dele[dele["FID"].isin(list(set(dele[dele.duplicated(subset = ["FID", "CHR"])]["FID"].tolist())))]

## Import reference genes

In [22]:
# Load the gzip-compressed, tab-separated reference gene table (first line is
# the header).
ref_gene = pd.read_csv(f"{cwd}/refGene.clean.gz", sep="\t", header=0, compression="gzip")

In [23]:
# Preview the first rows of the reference gene table.
ref_gene.head()

Unnamed: 0,CHR,start,end,gene
0,1,11868,14362,LOC102725121@1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"


In [24]:
# Dimensions (rows, columns) of the reference gene table.
ref_gene.shape

(28488, 4)

In [157]:
def get_Xmatrix(data):
    """Build the phenotype vector and the individual-by-gene CNV indicator matrix.

    Every CNV row in ``data`` is joined (through ``sqldf``) with the rows of
    the module-level ``ref_gene`` table that are on the same chromosome and
    whose [start, end] interval overlaps the CNV's [BP1, BP2] interval.

    Parameters
    ----------
    data : pandas.DataFrame
        CNV calls. The columns FID, CHR, BP1, BP2 and PHENO are read.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``y``: a single PHENO column holding, for each individual, the PHENO
        value of its first row in ``data`` minus 1.
        ``X``: one column per distinct gene name in ``ref_gene``; a cell is 1
        when at least one CNV of that individual overlaps the gene, else 0.
        Both frames are indexed by FID in order of first appearance in ``data``.
    """
    # sqldf is called without an explicit environment, so the table names in
    # the query must match the variable names `data` and `ref_gene`, and the
    # call must stay directly in this function body.
    query = """
        SELECT data.FID, data.CHR, data.BP1, data.BP2, data.PHENO, ref_gene.gene, ref_gene.start, ref_gene.end
        FROM data LEFT JOIN ref_gene
        WHERE data.CHR == ref_gene.CHR
        AND (
        (data.BP1 >= ref_gene.start AND data.BP1 <= ref_gene.end)
        OR
        (data.BP2 >= ref_gene.start AND data.BP2 <= ref_gene.end)
        OR
        (data.BP1 <= ref_gene.start AND data.BP2 >= ref_gene.end)
        )
    """
    res = sqldf(query)
    ## res
    # 	FID	    CHR	BP1	    BP2	  PHENO	gene	start	end
    # 0	PT-L1HP	1	1106784	1220136	2	TTLL10	1109259	1133315
    # 1	PT-L1HP	1	1106784	1220136	2	TNFRSF18	1138887	1141972
    # 2	PT-L1HP	1	1106784	1220136	2	TNFRSF4	1146719	1149533

    # Row and column labels, each de-duplicated in order of first appearance.
    # The same `genes` list drives both the row values and the column labels
    # so their lengths always agree.
    samples = list(unique_everseen(data["FID"]))
    genes = list(unique_everseen(ref_gene["gene"]))

    # First PHENO value seen for each individual (one pass over `data`).
    first_pheno = {}
    for fid, value in zip(data["FID"].tolist(), data["PHENO"].tolist()):
        first_pheno.setdefault(fid, value)

    # Set of overlapped genes per individual (one pass over `res`); sets make
    # the per-gene membership test below constant-time.
    hit_genes = {}
    for fid, gene in zip(res["FID"].tolist(), res["gene"].tolist()):
        hit_genes.setdefault(fid, set()).add(gene)

    no_hits = frozenset()
    fill = []
    for sample in samples:
        hits = hit_genes.get(sample, no_hits)
        # phenotype (shifted by -1) followed by the 0/1 status of every gene
        fill.append([first_pheno[sample] - 1] + [1 if gene in hits else 0 for gene in genes])

    X = pd.DataFrame(data=fill, index=samples, columns=["PHENO"] + genes)
    return X[["PHENO"]], X.iloc[:, 1:]

In [177]:
# Preview the CNV-gene overlap table.
# NOTE(review): in the visible code `res` is only assigned inside get_Xmatrix;
# this cell needs a module-level `res` created elsewhere — confirm.
res.head()

Unnamed: 0,FID,CHR,BP1,BP2,PHENO,gene,start,end
0,PT-L1HP,1,1106784,1220136,2,TTLL10,1109259,1133315
1,PT-L1HP,1,1106784,1220136,2,TNFRSF18,1138887,1141972
2,PT-L1HP,1,1106784,1220136,2,TNFRSF4,1146719,1149533
3,PT-L1HP,1,1106784,1220136,2,SDF4,1152287,1167447
4,PT-L1HP,1,1106784,1220136,2,B3GALT6,1167616,1170420


In [178]:
# Dimensions (rows, columns) of the CNV-gene overlap table.
# NOTE(review): in the visible code `res` is only assigned inside get_Xmatrix;
# this cell needs a module-level `res` created elsewhere — confirm.
res.shape

(10328, 8)

In [158]:
# Build the phenotype frame and the gene indicator matrix for the deletion calls.
ydel, Xdel = get_Xmatrix(dele)

In [159]:
# Preview the deletion indicator matrix (rows: individuals, columns: genes).
Xdel.head()

Unnamed: 0,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,B3GALT6,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
PT-L1HP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-BQF5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [160]:
# Preview the deletion phenotype frame.
ydel.head()

Unnamed: 0,PHENO
PT-L1HP,1
PT-8TK1,0
PT-8UY5,0
PT-BQF5,1
PT-8WFK,0


In [161]:
# Build the phenotype frame and the gene indicator matrix for the duplication calls.
ydup, Xdup = get_Xmatrix(dup)

In [162]:
# Dimensions (individuals, genes) of the duplication indicator matrix.
Xdup.shape

(4235, 28488)

In [163]:
# Save both X matrices as gzip-compressed, tab-separated files, including the
# header row (gene names) and the index column (individual IDs).
Xdel.to_csv(f"{cwd}/deletion.X.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)
Xdup.to_csv(f"{cwd}/duplication.X.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)

In [180]:
# Save both X matrices as gzip-compressed, tab-separated files containing only
# the values (no header row, no index column).
Xdel.to_csv(f"{cwd}/deletion.X.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)
Xdup.to_csv(f"{cwd}/duplication.X.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)

In [164]:
# Save both phenotype frames as gzip-compressed, tab-separated files, including
# the header row and the index column (individual IDs).
ydel.to_csv(f"{cwd}/deletion.y.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)
ydup.to_csv(f"{cwd}/duplication.y.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)

In [179]:
# Save both phenotype frames as gzip-compressed, tab-separated files containing
# only the values (no header row, no index column).
ydel.to_csv(f"{cwd}/deletion.y.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)
ydup.to_csv(f"{cwd}/duplication.y.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)

## Read X and y
Set `index_col=0` so that the individual IDs become the row index.

In [171]:
# Read the deletion phenotype file back; the first line is the header and the
# first column becomes the index.
tmp_y = pd.read_table(f"{cwd}/deletion.y.colrownames.gz", sep="\t", header=0, index_col=0)

In [172]:
# Preview the phenotype frame that was read back from disk.
tmp_y.head()

Unnamed: 0,PHENO
PT-L1HP,1
PT-8TK1,0
PT-8UY5,0
PT-BQF5,1
PT-8WFK,0


In [173]:
# Read the deletion X matrix back; the first line is the header and the first
# column becomes the index.
tmp_X = pd.read_table(f"{cwd}/deletion.X.colrownames.gz", sep="\t", header=0, index_col=0)

In [174]:
# Preview the X matrix that was read back from disk.
tmp_X.head()

Unnamed: 0,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,B3GALT6,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
PT-L1HP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-BQF5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
