# Obtain X matrix and y
- deletion
- duplication
- both deletion and duplication

column: genes

row: individuals

## Import phenotype

In [None]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")

In [None]:
# Raw string for the separator regex: "\s" is not a valid escape in a normal string literal.
pheno1 = pd.read_table(f"{cwd}/swcnv/swcnv.pheno", header = 0, sep = r"\s+", usecols = [0,2,3,4])

In [None]:
pheno1.head()

In [None]:
pheno1.shape

BPSCZ: either BP or SCZ
- $6256$ controls
- $4978$ SCZ cases
- $1172$ BP cases
- $4978$ + $1172$ = $6150$ BPSCZ cases

In [5]:
Counter(pheno1["BPSCZ"]), Counter(pheno1["SCZ"]), Counter(pheno1["BP"])

(Counter({1: 6256, 2: 6150}),
 Counter({1: 6256, 2: 4978, -9: 1172}),
 Counter({1: 6256, -9: 4978, 2: 1172}))

In [6]:
# Raw string for the separator regex: "\s" is not a valid escape in a normal string literal.
# Only two columns are loaded (FID and PHE, per the head() output below); NSEG is not read here.
pheno2 = pd.read_table(f"{cwd}/swcnv/swcnv.qc6.cnv.indiv", header = 0, sep = r"\s+", usecols = [0,2])

"NSEG": number of CNV in each sample

In [7]:
pheno2.head()

Unnamed: 0,FID,PHE
0,PT-8K6O,1
1,PT-8K8C,1
2,PT-8UHJ,1
3,PT-8UHU,1
4,PT-8K6N,1


In [8]:
pheno2.shape

(10636, 2)

In [9]:
Counter(pheno2["PHE"])

Counter({1: 5917, 2: 4719})

In [10]:
# Keep only the part of each FID that precedes the first underscore (vectorised string op).
pheno2["FID"] = pheno2["FID"].str.split("_").str[0]

In [11]:
pheno = pd.merge(pheno1, pheno2, how = "outer", on = "FID")

For some individuals the phenotype in `swcnv.qc6.cnv.indiv` (`PHE`) is not NA, but `SCZ` from `swcnv.pheno` is NA after the merge.

In [12]:
# FIDs of the rows whose SCZ value is missing and therefore differs from PHE.
inds = pheno.loc[pheno["SCZ"].isna() & pheno["SCZ"].ne(pheno["PHE"]), "FID"].tolist()

In [13]:
len(inds)

218

In [14]:
pheno[(pheno["SCZ"] != pheno["PHE"]) & ((pheno["PHE"] == 2) | (pheno["PHE"] == 1))]

Unnamed: 0,FID,BPSCZ,SCZ,BP,PHE
12406,PT-8K6O,,,,1.0
12407,PT-8K8C,,,,1.0
12408,PT-8K6N,,,,1.0
12409,PT-8K7F,,,,1.0
12410,PT-8K71,,,,1.0
...,...,...,...,...,...
12619,PT-CP1D,,,,2.0
12620,PT-CP33,,,,2.0
12621,PT-COOS,,,,2.0
12622,PT-CDKY,,,,2.0


In [15]:
# Use SCZ where it is present and fall back to PHE where SCZ is missing, then store as integers.
# (Still raises if both values are missing for a row, as the row-wise version did.)
pheno["PHENO"] = pheno["SCZ"].fillna(pheno["PHE"]).astype(int)

In [16]:
pheno.head()

Unnamed: 0,FID,BPSCZ,SCZ,BP,PHE,PHENO
0,PT-1RTW,1.0,1.0,1.0,1.0,1
1,PT-1RTX,1.0,1.0,1.0,1.0,1
2,PT-1RTY,1.0,1.0,1.0,1.0,1
3,PT-1RTZ,2.0,2.0,-9.0,2.0,2
4,PT-1RU1,1.0,1.0,1.0,1.0,1


In [17]:
pheno.shape

(12624, 6)

There are $5097$ cases and $6355$ controls; the remaining $1172$ individuals carry the code $-9$.

In [18]:
Counter(pheno["PHENO"])

Counter({1: 6355, 2: 5097, -9: 1172})

## Import individual CNV
TYPE: 1 is deletion, 3 is duplication

In [19]:
# Raw string for the separator regex: "\s" is not a valid escape in a normal string literal.
cnv = pd.read_table(f"{cwd}/swcnv/swcnv.qc6.cnv", header = 0, sep = r"\s+", usecols = [0,2,3,4,5])

In [20]:
Counter(cnv["TYPE"])

Counter({3: 5826, 4: 170, 1: 3714, 0: 13})

In [21]:
cnv.head()

Unnamed: 0,FID,CHR,BP1,BP2,TYPE
0,PT-8K8C,8,30371917,30849740,3
1,PT-8UHJ,7,143218408,143532998,3
2,PT-8UHU,2,117775371,117925547,3
3,PT-8UHU,2,230714025,230902238,3
4,PT-8UHU,15,24578499,24778891,4


In [22]:
# Keep only the part of each FID that precedes the first underscore (vectorised string op).
cnv["FID"] = cnv["FID"].str.split("_").str[0]

In [23]:
# Attach each individual's PHENO to its CNV rows.
# Any PHENO column left by a previous run of this cell is dropped first, so re-running the
# cell does not produce suffixed duplicates (PHENO_x / PHENO_y).
cnv = pd.merge(cnv.drop(columns = "PHENO", errors = "ignore"), pheno[["FID", "PHENO"]], how = "left", on = "FID")

In [24]:
cnv.head()

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
0,PT-8K8C,8,30371917,30849740,3,1
1,PT-8UHJ,7,143218408,143532998,3,1
2,PT-8UHU,2,117775371,117925547,3,1
3,PT-8UHU,2,230714025,230902238,3,1
4,PT-8UHU,15,24578499,24778891,4,1


In [25]:
cnv[cnv["FID"] == "PT-CDMY"]

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
9721,PT-CDMY,1,16869363,16986851,3,2
9722,PT-CDMY,22,16077141,16399151,3,2


In [26]:
Counter(cnv["TYPE"])

Counter({3: 5826, 4: 170, 1: 3714, 0: 13})

In [27]:
Counter(cnv["PHENO"])

Counter({1: 5200, 2: 4523})

In [28]:
cnv.drop_duplicates(subset = ["FID", "CHR", "BP2", "TYPE", "PHENO"]).shape

(9723, 6)

In [29]:
# Split the CNV calls by TYPE: 1 -> deletions, 3 -> duplications.
# NOTE(review): rows with TYPE 0 or 4 (both present in the Counter output above) match
# neither filter and are therefore left out of both tables — confirm this is intended.
dele = cnv[cnv["TYPE"] == 1]
dup = cnv[cnv["TYPE"] == 3]

In [30]:
dele = dele.sort_values(by = ["CHR", "BP1", "BP2"])
dup = dup.sort_values(by = ["CHR", "BP1", "BP2"])

In [32]:
# Renumber the sorted rows 0..n-1; the previous row labels are discarded.
dele = dele.reset_index(drop = True)
dup = dup.reset_index(drop = True)

In [33]:
dele.head()

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
0,PT-L1HP,1,1106784,1220136,1,2
1,PT-L1HP,1,1307872,1450947,1,2
2,PT-8TK1,1,7656503,7952404,1,1
3,PT-8UY5,1,12841928,12998268,1,1
4,PT-BQF5,1,12841928,12998268,1,2


In [22]:
# dele[dele["FID"].isin(list(set(dele[dele.duplicated(subset = ["FID", "CHR"])]["FID"].tolist())))]

## Import reference genes

In [34]:
ref_gene = pd.read_table(f"{cwd}/refGene.clean.gz", compression = "gzip", sep = "\t", header = 0)

In [35]:
ref_gene.head()

Unnamed: 0,CHR,start,end,gene
0,1,11868,14362,LOC102725121@1
1,1,11873,14409,DDX11L1
2,1,14361,29370,WASH7P
3,1,17368,17436,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1"
4,1,30365,30503,"MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1"


In [36]:
ref_gene.shape

(28488, 4)

In [41]:
def get_Xmatrix(data, genes = None):
    """Build the gene-by-individual CNV indicator matrix X and the phenotype table y.

    A gene is flagged 1 for an individual when at least one of that individual's
    CNV rows is on the same chromosome as the gene and
      * BP1 lies inside [start, end], or
      * BP2 lies inside [start, end], or
      * the CNV spans the whole gene (BP1 <= start and BP2 >= end);
    otherwise the entry is 0.

    Parameters
    ----------
    data : pandas.DataFrame
        CNV rows with columns "FID", "CHR", "BP1", "BP2" and "PHENO".
    genes : pandas.DataFrame, optional
        Gene intervals with columns "CHR", "start", "end" and "gene".
        When omitted, the notebook-level `ref_gene` table is used.

    Returns
    -------
    (y, X) : tuple of pandas.DataFrame
        y has the single column "PHENO", holding the PHENO value of each
        individual's first row in `data` minus 1.
        X has one 0/1 column per distinct gene name, in order of first appearance.
        Both are indexed by FID, in order of first appearance in `data`.
    """
    if genes is None:
        genes = ref_gene

    # One row per individual / one column per gene name, both in first-seen order.
    first_rows = data.drop_duplicates(subset = "FID")
    samples = first_rows["FID"].tolist()
    gene_names = list(dict.fromkeys(genes["gene"]))
    sample_pos = pd.Index(samples).get_indexer(data["FID"])
    gene_pos = pd.Index(gene_names).get_indexer(genes["gene"])

    cnv_chr, gene_chr = data["CHR"], genes["CHR"]
    if not (pd.api.types.is_numeric_dtype(cnv_chr) and pd.api.types.is_numeric_dtype(gene_chr)):
        # At least one side holds non-numeric labels: compare the text forms so 1 matches "1".
        cnv_chr, gene_chr = cnv_chr.astype(str), gene_chr.astype(str)
    cnv_chr, gene_chr = cnv_chr.to_numpy(), gene_chr.to_numpy()

    bp1, bp2 = data["BP1"].to_numpy(), data["BP2"].to_numpy()
    start, end = genes["start"].to_numpy(), genes["end"].to_numpy()

    indicator = np.zeros((len(samples), len(gene_names)), dtype = int)
    # Work one chromosome at a time so only a (CNVs x genes) boolean block for
    # that chromosome is held in memory, rather than all CNV/gene pairs at once.
    for chrom in pd.unique(cnv_chr):
        cnv_rows = np.flatnonzero(cnv_chr == chrom)
        gene_rows = np.flatnonzero(gene_chr == chrom)
        if len(cnv_rows) == 0 or len(gene_rows) == 0:
            continue
        b1 = bp1[cnv_rows][:, None]
        b2 = bp2[cnv_rows][:, None]
        s = start[gene_rows][None, :]
        e = end[gene_rows][None, :]
        hit = (
            ((b1 >= s) & (b1 <= e))
            | ((b2 >= s) & (b2 <= e))
            | ((b1 <= s) & (b2 >= e))
        )
        hit_cnv, hit_gene = np.nonzero(hit)
        indicator[sample_pos[cnv_rows[hit_cnv]], gene_pos[gene_rows[hit_gene]]] = 1

    # Shift PHENO by -1 (e.g. 1/2 becomes 0/1).
    y = pd.DataFrame({"PHENO": first_rows["PHENO"].to_numpy() - 1}, index = samples)
    X = pd.DataFrame(indicator, index = samples, columns = gene_names)
    return y, X

In [47]:
ydel, Xdel = get_Xmatrix(dele)

In [49]:
Xdel.shape

(3086, 28488)

In [31]:
ydup, Xdup = get_Xmatrix(dup)

In [32]:
Xdup.shape

(4235, 28488)

Individuals (cases and controls) that carry only duplications

In [34]:
# IDs present in Xdup but absent from Xdel, in Xdup order.
# One vectorised membership test replaces rebuilding list(Xdel.index) for every element.
dupind = Xdup.index[~Xdup.index.isin(Xdel.index)].tolist()
len(dupind)

2964

Individuals (cases and controls) that carry only deletions

In [36]:
# IDs present in Xdel but absent from Xdup, in Xdel order.
# One vectorised membership test replaces rebuilding list(Xdup.index) for every element.
delind = Xdel.index[~Xdel.index.isin(Xdup.index)].tolist()
len(delind)

1815

Individuals that carry both deletions and duplications

In [44]:
# IDs in Xdel that are not deletion-only, i.e. the ones that also appear in Xdup (Xdel order).
deldup = Xdel.index[~Xdel.index.isin(delind)].tolist()
len(deldup)

1271

In [111]:
ydeldup = pd.merge(ydel.reset_index(), ydup.reset_index(), how = "outer", on = ["index", "PHENO"])

In [112]:
ydeldup.head()

Unnamed: 0,index,PHENO
0,PT-L1HP,1
1,PT-8TK1,0
2,PT-8UY5,0
3,PT-BQF5,1
4,PT-8WFK,0


In [45]:
Xdel_com = Xdel.loc[deldup]
Xdup_com = Xdup.loc[deldup]

In [47]:
Xdel_com.shape

(1271, 28488)

In [92]:
# Rows of each matrix whose ID is not in `deldup`, selected with a boolean mask
# instead of a per-element list scan.
Xdel_ind = Xdel.loc[~Xdel.index.isin(deldup)]
Xdup_ind = Xdup.loc[~Xdup.index.isin(deldup)]

In [100]:
# Element-wise OR of the two matrices: 1 where either one is non-zero, else 0.
# Rows and columns are paired by position, as the zip-based loop did.
# `dat` is a 2-D integer array, which pd.DataFrame accepts in the same way as a list of rows.
dat = ((Xdel_com.to_numpy() != 0) | (Xdup_com.to_numpy() != 0)).astype(int)

In [104]:
Xdeldup_com = pd.DataFrame(dat, index = list(Xdel_com.index), columns = list(Xdel_com.columns))
Xdeldup_com.head()

Unnamed: 0,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,B3GALT6,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
PT-L1HP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-BQF5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Individuals that have both a deletion and a duplication at the same gene location.

In [185]:
# Per-individual gene counts, computed once up front instead of re-summing each row inside the loop.
n_either = Xdeldup_com.sum(axis = 1)
n_del = Xdel_com.sum(axis = 1)
n_dup = Xdup_com.sum(axis = 1)
tmp = list()
for x in list(Xdeldup_com.index):
    # For 0/1 indicators the union count is smaller than del + dup only when
    # at least one gene is flagged in both matrices.
    if n_either.loc[x] != n_del.loc[x] + n_dup.loc[x]:
        in_both = (Xdeldup_com.loc[x] != 0) & (Xdel_com.loc[x] != 0) & (Xdup_com.loc[x] != 0)
        gene = list(in_both.index[in_both.to_numpy()])
        # .item() yields plain Python numbers so the list displays as before.
        tmp.append([x, n_either.loc[x].item(), n_del.loc[x].item(), n_dup.loc[x].item(), gene])

Sample index, # of genes interrupted by either a deletion or a duplication, # of genes interrupted by a deletion, # of genes interrupted by a duplication, gene(s) interrupted by both a deletion and a duplication

In [187]:
tmp

[['PT-BPV3', 28, 22, 7, ['NBPF19']],
 ['PT-8W4D', 39, 4, 36, ['NBPF19']],
 ['PT-BRMG', 30, 3, 28, ['NBPF19']],
 ['PT-297E', 25, 18, 8, ['OPCML']]]

In [162]:
# Stack the three groups of rows, then move the sample IDs from the index into an "index" column.
Xdeldup = pd.concat([Xdeldup_com, Xdel_ind, Xdup_ind]).reset_index()
Xdeldup.head()

Unnamed: 0,index,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
0,PT-L1HP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,PT-BQF5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [143]:
Xydeldup = pd.merge(ydeldup, Xdeldup, how = "inner", on = ["index"])
Xydeldup.head()

Unnamed: 0,index,PHENO,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
0,PT-L1HP,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,PT-BQF5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [147]:
# Phenotype column only, indexed by sample ID; the index name is set to an empty string.
ydeldup_genename = Xydeldup[["index", "PHENO"]].set_index("index").rename_axis("")

In [148]:
ydeldup_genename.head()

Unnamed: 0,PHENO
,
PT-L1HP,1.0
PT-8TK1,0.0
PT-8UY5,0.0
PT-BQF5,1.0
PT-8WFK,0.0


In [154]:
# Gene columns only, indexed by sample ID; the index name is set to an empty string.
Xdeldup_genename = Xydeldup.drop(columns = ["PHENO"]).set_index("index").rename_axis("")

In [161]:
Xdeldup_genename.head()

Unnamed: 0,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,B3GALT6,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
PT-L1HP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PT-8TK1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PT-8UY5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PT-BQF5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PT-8WFK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
# Combined deletion/duplication X and y, written with the header row and the sample-ID index.
Xdeldup_genename.to_csv(f"{cwd}/del_dup.X.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)
ydeldup_genename.to_csv(f"{cwd}/del_dup.y.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)

In [157]:
# Combined deletion/duplication X and y, written with the header row but without the index.
Xdeldup_genename.to_csv(f"{cwd}/del_dup.X.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)
ydeldup_genename.to_csv(f"{cwd}/del_dup.y.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)

In [158]:
# Combined deletion/duplication X and y, written as bare values (no header row, no index).
Xdeldup_genename.to_csv(f"{cwd}/del_dup.X.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)
ydeldup_genename.to_csv(f"{cwd}/del_dup.y.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)

## X with row (individuals) and col (genes) names, y with row (individuals) and col ("PHENO") names

In [36]:
Xdel.to_csv(f"{cwd}/deletion.X.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)
Xdup.to_csv(f"{cwd}/duplication.X.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)

In [37]:
ydel.to_csv(f"{cwd}/deletion.y.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)
ydup.to_csv(f"{cwd}/duplication.y.colrownames.gz", compression = "gzip", sep = "\t", header = True, index = True)

## X with col (genes) names, y with col ("PHENO") names

In [38]:
Xdel.to_csv(f"{cwd}/deletion.X.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)
Xdup.to_csv(f"{cwd}/duplication.X.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)

In [39]:
ydel.to_csv(f"{cwd}/deletion.y.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)
ydup.to_csv(f"{cwd}/duplication.y.colnames.gz", compression = "gzip", sep = "\t", header = True, index = False)

## X and y without row and col names

In [180]:
Xdel.to_csv(f"{cwd}/deletion.X.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)
Xdup.to_csv(f"{cwd}/duplication.X.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)

In [179]:
ydel.to_csv(f"{cwd}/deletion.y.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)
ydup.to_csv(f"{cwd}/duplication.y.no_colrownames.gz", compression = "gzip", sep = "\t", header = False, index = False)

## Read X and y
With `index_col=0`, the individual IDs are used as the row index.

In [171]:
tmp_y = pd.read_csv(f"{cwd}/deletion.y.colrownames.gz", sep = "\t", header = 0, index_col = 0)

In [172]:
tmp_y.head()

Unnamed: 0,PHENO
PT-L1HP,1
PT-8TK1,0
PT-8UY5,0
PT-BQF5,1
PT-8WFK,0


In [173]:
tmp_X = pd.read_csv(f"{cwd}/deletion.X.colrownames.gz", sep = "\t", header = 0, index_col = 0)

In [174]:
tmp_X.head()

Unnamed: 0,LOC102725121@1,DDX11L1,WASH7P,"MIR6859-1@1,MIR6859-2@1,MIR6859-3@1,MIR6859-4@1","MIR1302-10@1,MIR1302-11@1,MIR1302-2@1,MIR1302-9@1","FAM138A@1,FAM138C@1,FAM138F@1",OR4F5,LOC729737,"LOC100132062@1,LOC100132287@1","OR4F16@1,OR4F29@1,OR4F3@1",LOC101928626,MIR12136,"OR4F16@2,OR4F29@2,OR4F3@2",LOC100133331,LOC100288069,FAM87B,LINC00115,LINC01128,FAM41C,LINC02593,SAMD11,NOC2L,KLHL17,PLEKHN1,PERM1,HES4,ISG15,AGRN,LOC100288175,RNF223,C1orf159,LINC01342,MIR200B,MIR200A,MIR429,TTLL10,TNFRSF18,TNFRSF4,SDF4,B3GALT6,...,DKC1,SNORA36A,MIR664B,SNORA56,MPP1,SMIM9,F8,H2AB1@1,"H2AB2@1,H2AB3@1",F8A3@1,F8A1@1,F8A2@1,"MIR1184-1@1,MIR1184-2@1,MIR1184-3@1",FUNDC2,CMC4,MTCP1,BRCC3,VBP1,RAB39B,CLIC2,"LOC101927830@1,TMLHE-AS1@1",H2AB1@2,"H2AB2@2,H2AB3@2",F8A3@2,F8A1@2,F8A2@2,"MIR1184-1@2,MIR1184-2@2,MIR1184-3@2",F8A1@3,F8A3@3,F8A2@3,"MIR1184-1@3,MIR1184-2@3,MIR1184-3@3","H2AB2@3,H2AB3@3",H2AB1@3,"LOC101927830@2,TMLHE-AS1@2",TMLHE,SPRY3,VAMP7,IL9R,WASIR1,DDX11L16
PT-L1HP,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8TK1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8UY5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-BQF5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
PT-8WFK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
