# Obtain X matrix
Columns: individuals

Rows: genes

## Import phenotype

In [1]:
import pandas as pd, numpy as np
import os
from pandasql import sqldf
from collections import Counter
from more_itertools import unique_everseen
# Base directory for every input file read below; "~" is expanded to the
# current user's home directory.
_data_dir = "~/GIT/cnv-gene-mapping/data"
cwd = os.path.expanduser(_data_dir)

In [2]:
# Load the phenotype table: whitespace-delimited, first line is the header,
# and only columns 0, 2, 3 and 4 are kept.
# The separator is a regex, so it is written as a raw string; a plain "\s+"
# is an invalid escape sequence that newer Python versions warn about.
pheno1 = pd.read_table(
    f"{cwd}/swcnv/swcnv.pheno",
    header=0,
    sep=r"\s+",
    usecols=[0, 2, 3, 4],
)

In [4]:
# Show (rows, columns) of the phenotype table as a quick load check.
pheno1.shape

(12406, 4)

In [5]:
# Tabulate how often each value occurs in the three phenotype columns,
# shown as a tuple of Counters in the order BPSCZ, SCZ, BP.
tuple(Counter(pheno1[column]) for column in ("BPSCZ", "SCZ", "BP"))

(Counter({1: 6256, 2: 6150}),
 Counter({-9: 1172, 1: 6256, 2: 4978}),
 Counter({-9: 4978, 1: 6256, 2: 1172}))

In [3]:
# Load the per-individual CNV summary: whitespace-delimited, first line is
# the header, and only columns 0 and 2 are kept.
# The separator is a regex, so it is written as a raw string; a plain "\s+"
# is an invalid escape sequence that newer Python versions warn about.
pheno2 = pd.read_table(
    f"{cwd}/swcnv/swcnv.qc6.cnv.indiv",
    header=0,
    sep=r"\s+",
    usecols=[0, 2],
)

In [6]:
# Show (rows, columns) of the per-individual table as a quick load check.
pheno2.shape

(10636, 2)

In [7]:
# Keep only the part of each FID before the first underscore
# (presumably so the IDs match the FID values used in pheno1 — confirm).
_fid_prefixes = [fid.partition("_")[0] for fid in pheno2["FID"]]
pheno2["FID"] = _fid_prefixes

In [8]:
# Outer join on FID: every individual present in either table is kept, and
# columns that are missing on one side are filled with NaN.
pheno = pd.merge(
    left=pheno1,
    right=pheno2,
    on="FID",
    how="outer",
)

In [9]:
# Build the final phenotype column: use SCZ where it is present, otherwise
# fall back to PHE (rows that exist only in pheno2 have no SCZ after the
# outer merge). A vectorised fillna replaces the row-wise apply and its
# string-based NaN test; the result is the same integer column, and a row
# with neither value still raises on the integer cast.
# NOTE(review): SCZ == -9 is carried through unchanged rather than falling
# back to PHE — confirm that is intended if -9 is a missing-value code.
pheno["PHENO"] = pheno["SCZ"].fillna(pheno["PHE"]).astype("int64")

In [10]:
# Preview the first rows of the merged table, including the new PHENO column.
pheno.head()

Unnamed: 0,FID,BPSCZ,SCZ,BP,PHE,PHENO
0,PT-1RTW,1.0,1.0,1.0,1.0,1
1,PT-1RTX,1.0,1.0,1.0,1.0,1
2,PT-1RTY,1.0,1.0,1.0,1.0,1
3,PT-1RTZ,2.0,2.0,-9.0,2.0,2
4,PT-1RU1,1.0,1.0,1.0,1.0,1


In [11]:
# Count how many individuals carry each PHENO value.
Counter(pheno["PHENO"])

Counter({-9: 1172, 1: 6355, 2: 5097})

## Import individual CNV
`TYPE` column: 1 is a deletion, 3 is a duplication.

In [12]:
# Load the CNV calls: whitespace-delimited, first line is the header, and
# only columns 0, 2, 3, 4 and 5 are kept.
# The separator is a regex, so it is written as a raw string; a plain "\s+"
# is an invalid escape sequence that newer Python versions warn about.
cnv = pd.read_table(
    f"{cwd}/swcnv/swcnv.qc6.cnv",
    header=0,
    sep=r"\s+",
    usecols=[0, 2, 3, 4, 5],
)

In [13]:
# Show (rows, columns) of the CNV table as a quick load check.
cnv.shape

(9723, 5)

In [14]:
# Keep only the part of each FID before the first underscore, the same
# normalisation applied to pheno2 before it was merged into pheno.
_cnv_fid_prefixes = [fid.partition("_")[0] for fid in cnv["FID"]]
cnv["FID"] = _cnv_fid_prefixes

In [15]:
# Attach each individual's PHENO to their CNV records. The left join keeps
# every CNV row; a record whose FID is absent from pheno gets NaN.
_pheno_by_fid = pheno[["FID", "PHENO"]]
cnv = pd.merge(
    left=cnv,
    right=_pheno_by_fid,
    on="FID",
    how="left",
)

In [16]:
# Preview the first ten CNV records with their attached PHENO.
cnv.head(10)

Unnamed: 0,FID,CHR,BP1,BP2,TYPE,PHENO
0,PT-8K8C,8,30371917,30849740,3,1
1,PT-8UHJ,7,143218408,143532998,3,1
2,PT-8UHU,2,117775371,117925547,3,1
3,PT-8UHU,2,230714025,230902238,3,1
4,PT-8UHU,15,24578499,24778891,4,1
5,PT-8K6N,4,2058475,2316874,3,1
6,PT-8K6N,7,157925600,158260724,1,1
7,PT-8TUA,2,83597659,83749480,3,1
8,PT-8UCU,7,57481104,57636170,3,1
9,PT-8UD3,7,143444216,143568059,1,1


In [17]:
# Count how many CNV records fall under each PHENO value.
Counter(cnv["PHENO"])

Counter({1: 5200, 2: 4523})

In [18]:
# Duplicate check: if the row count shown here equals cnv.shape[0], no two
# records share all of the listed key columns.
# NOTE(review): BP1 is not part of the key — confirm that is deliberate.
cnv.drop_duplicates(subset = ["FID", "CHR", "BP2", "TYPE", "PHENO"]).shape

(9723, 6)

In [19]:
# Split the CNV calls by TYPE: 1 -> deletions, 3 -> duplications.
# NOTE(review): rows with any other TYPE land in neither subset (a TYPE of 4
# appears in the recorded cnv preview) — confirm that is intended.
_is_deletion = cnv["TYPE"] == 1
_is_duplication = cnv["TYPE"] == 3
dele = cnv[_is_deletion]
dup = cnv[_is_duplication]

In [20]:
# Order both subsets by genomic position: chromosome, then BP1, then BP2.
_position_keys = ["CHR", "BP1", "BP2"]
dele = dele.sort_values(by=_position_keys)
dup = dup.sort_values(by=_position_keys)

In [21]:
# dele[dele["FID"].isin(list(set(dele[dele.duplicated(subset = ["FID", "CHR"])]["FID"].tolist())))]

In [22]:
# Load the gene reference table: gzip-compressed, tab-separated, first line
# is the header.
_ref_gene_path = f"{cwd}/refGene.clean.gz"
ref_gene = pd.read_table(
    _ref_gene_path,
    sep="\t",
    header=0,
    compression="gzip",
)

In [25]:
# Preview the first rows of the gene reference table.
ref_gene.head()

Unnamed: 0,CHR,start,end,gene
0,1,11868,29370,gene1
1,1,30365,30503,gene2
2,1,34610,36081,gene3
3,1,69090,70008,gene4
4,1,134772,140566,gene5


In [26]:
# Pair every deletion with each reference gene it overlaps on the same
# chromosome. An overlap is any of: BP1 lies inside the gene, BP2 lies inside
# the gene, or the deletion spans the whole gene.
# Only deletions that overlap at least one gene are returned, so the join is
# spelled as an explicit INNER JOIN ... ON instead of a LEFT JOIN with no ON
# clause whose WHERE filter discards every unmatched row anyway. "=" is the
# standard SQL spelling of SQLite's "==". Row order is not specified.
query = """
    SELECT dele.FID, dele.CHR, dele.BP1, dele.BP2, dele.PHENO,
           ref_gene.gene, ref_gene.start, ref_gene.end
    FROM dele
    INNER JOIN ref_gene
        ON dele.CHR = ref_gene.CHR
        AND (
            (dele.BP1 >= ref_gene.start AND dele.BP1 <= ref_gene.end)
            OR (dele.BP2 >= ref_gene.start AND dele.BP2 <= ref_gene.end)
            OR (dele.BP1 <= ref_gene.start AND dele.BP2 >= ref_gene.end)
        )
"""

In [28]:
# Run the overlap query with pandasql. The tables named in the query (dele,
# ref_gene) are not passed explicitly, so sqldf presumably resolves them from
# the DataFrames defined in this session — confirm against the pandasql docs.
res = sqldf(query)

In [29]:
# Preview the first deletion/gene overlap rows returned by the query.
res.head()

Unnamed: 0,FID,CHR,BP1,BP2,PHENO,gene,start,end
0,PT-L1HP,1,1106784,1220136,2,gene31,1109285,1133313
1,PT-L1HP,1,1106784,1220136,2,gene32,1138887,1142089
2,PT-L1HP,1,1106784,1220136,2,gene33,1146719,1149533
3,PT-L1HP,1,1106784,1220136,2,gene34,1152287,1167447
4,PT-L1HP,1,1106784,1220136,2,gene35,1167616,1170420
