In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter

In [2]:
# Load the whitespace-delimited input tables.
# The separator regex is written as a raw string: in a plain string literal
# "\s" is an invalid escape sequence, which newer Python versions warn about.
sw_pheno = pd.read_table("../swcnv/swcnv.pheno", sep=r"\s+")
sw_sum = pd.read_table("../swcnv/swcnv.qc6.cnv.summary", sep=r"\s+")  # "CHR"
# The .fam file is read without a header row; only columns 0, 1, 4 and 5 are
# kept and given explicit names.
sw_fam = pd.read_table(
    "../swcnv/swcnv.qc6.fam",
    sep=r"\s+",
    header=None,
    usecols=(0, 1, 4, 5),
    names=["FID", "IID", "sex", "Case-Ctrl"],  # sex and case-ctrl
)
sw_cnv = pd.read_table("../swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_cluster = pd.read_table("../swcnv/swcnv.clusters", sep=r"\s+")  # Sw1, Sw2-4, Sw5-6
# Rename the cluster file's ID columns so they match the merge keys used below.
sw_cluster = sw_cluster.rename(columns={"uFID": "FID", "uIID": "IID"})
sw_indiv = pd.read_table("../swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+")  # "NSEG": number of CNV in each sample
# Keep only subjects present in both the cluster table and the per-subject table.
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])

In [3]:
# Tally subjects per phenotype code (1 = control, 2 = case).
(
    sw_fam
    .groupby(by="Case-Ctrl")
    .size()
)

Case-Ctrl
1    5917
2    4719
dtype: int64

In [4]:
# Subject counts for every (phenotype code, sex code) combination.
sex_by_casectrl = (
    sw_fam
    .groupby(by=["Case-Ctrl", "sex"])
    .size()
)
print(sex_by_casectrl)

Case-Ctrl  sex
1          1      3034
           2      2883
2          1      2826
           2      1893
dtype: int64


In [5]:
# Fraction of subjects with sex code 1 among those with sex code 1 or 2,
# computed separately for cases (phenotype code 2) and controls (code 1).
# NOTE(review): the variable names assume sex code 1 means male - confirm.
case_by_sex = sex_by_casectrl[2]
prp_male_case = case_by_sex[1] / (case_by_sex[1] + case_by_sex[2])
print(prp_male_case)

ctrl_by_sex = sex_by_casectrl[1]
prp_male_ctrl = ctrl_by_sex[1] / (ctrl_by_sex[1] + ctrl_by_sex[2])
print(prp_male_ctrl)

0.598855689765
0.512759844516


In [6]:
# Subject counts for every (cluster label "tr", phenotype "PHE") combination.
cluster_phe_sizes = sw_cluster_indiv.groupby(by=["tr", "PHE"]).size()
print(cluster_phe_sizes)

tr            PHE
scz_s234_eur  1      2137
              2      1847
scz_swe1_eur  1       206
              2       207
scz_swe5_eur  1      2456
              2      1674
scz_swe6_eur  1      1118
              2       991
dtype: int64


In [7]:
# "NSEG": number of CNV in each subject
def _mean_nseg(phe, clusters=None):
    """Return the mean of "NSEG" over the selected rows of sw_cluster_indiv.

    phe      -- value matched against the "PHE" column (2 = case, 1 = control).
    clusters -- optional list of "tr" labels to restrict to; None keeps all rows.

    A single boolean mask aligned with the frame is built and applied through
    .loc, instead of chaining two boolean selections where the second mask no
    longer matches the index of the already-filtered Series.
    """
    selected = sw_cluster_indiv["PHE"] == phe
    if clusters is not None:
        selected = selected & sw_cluster_indiv["tr"].isin(clusters)
    return np.mean(sw_cluster_indiv.loc[selected, "NSEG"].tolist())


# Sw1
mean_cnv_case_sw1 = _mean_nseg(2, ["scz_swe1_eur"])
mean_cnv_ctrl_sw1 = _mean_nseg(1, ["scz_swe1_eur"])
print(mean_cnv_case_sw1, mean_cnv_ctrl_sw1)

# Sw2-4
mean_cnv_case_sw234 = _mean_nseg(2, ["scz_s234_eur"])
mean_cnv_ctrl_sw234 = _mean_nseg(1, ["scz_s234_eur"])
print(mean_cnv_case_sw234, mean_cnv_ctrl_sw234)

# Sw5 and Sw6 pooled into one group
mean_cnv_case_sw56 = _mean_nseg(2, ["scz_swe5_eur", "scz_swe6_eur"])
mean_cnv_ctrl_sw56 = _mean_nseg(1, ["scz_swe5_eur", "scz_swe6_eur"])
print(mean_cnv_case_sw56, mean_cnv_ctrl_sw56)

# All clusters together
mean_cnv_case = _mean_nseg(2)
mean_cnv_ctrl = _mean_nseg(1)
print(mean_cnv_case, mean_cnv_ctrl)

1.05797101449 0.844660194175
1.2360584732 1.20870379036
0.758348968105 0.683547845551
0.958465776648 0.878823728241


In [8]:
# Attach the cluster columns to every CNV row, then add the per-subject
# columns from sw_indiv; both joins are inner joins on (FID, IID).
subject_keys = ["FID", "IID"]
sw_cluster_cnv = sw_cluster.merge(sw_cnv, how="inner", on=subject_keys)
sw_cluster_cnv_indiv = sw_cluster_cnv.merge(sw_indiv, how="inner", on=subject_keys)
# CNV length is taken as BP2 minus BP1.
sw_cluster_cnv_indiv["CNV_LEN"] = sw_cluster_cnv_indiv["BP2"].sub(sw_cluster_cnv_indiv["BP1"])

In [9]:
# Show the CNV table as plain text.
print(sw_cnv)

               FID  IID  CHR        BP1        BP2  TYPE     SCORE  SITES
0          PT-8K8C    1    8   30371917   30849740     3    95.760     97
1          PT-8UHJ    1    7  143218408  143532998     3    20.780     74
2          PT-8UHU    1    2  117775371  117925547     3    69.060     73
3          PT-8UHU    1    2  230714025  230902238     3    53.580     73
4          PT-8UHU    1   15   24578499   24778891     4    91.330    102
5          PT-8K6N    1    4    2058475    2316874     3    29.670     51
6          PT-8K6N    1    7  157925600  158260724     1    52.935     73
7          PT-8TUA    1    2   83597659   83749480     3    46.110     82
8          PT-8UCU    1    7   57481104   57636170     3    54.280    117
9          PT-8UD3    1    7  143444216  143568059     1    13.750     51
10         PT-8UD3    1   21   10736871   11002646     3    18.440     80
11         PT-8K79    1   18    1725758    1839387     1   110.920     42
12         PT-8UI7    1    2  11083364

In [10]:
# CNV "TYPE" codes for the scz_s234_eur cluster, split by phenotype code
# (2 = case, 1 = control).
# NOTE(review): despite the "prp_del_" prefix, these are lists of TYPE values,
# not proportions.
s234_cnv = sw_cluster_cnv_indiv[sw_cluster_cnv_indiv["tr"] == "scz_s234_eur"]
prp_del_case_sw234 = s234_cnv.loc[s234_cnv["PHE"] == 2, "TYPE"].tolist()
prp_del_ctrl_sw234 = s234_cnv.loc[s234_cnv["PHE"] == 1, "TYPE"].tolist()

In [11]:
# Frequency of each TYPE code: cases first, then controls.
for type_codes in (prp_del_case_sw234, prp_del_ctrl_sw234):
    print(Counter(type_codes))

Counter({3: 1394, 1: 819, 4: 67, 0: 3})
Counter({3: 1549, 1: 946, 4: 86, 0: 2})


In [12]:
# Number of CNV rows per TYPE code across the whole table.
(
    sw_cnv
    .groupby(by=["TYPE"])
    .size()
)

TYPE
0      13
1    3714
3    5826
4     170
dtype: int64

In [13]:
# Print every CNV_LEN value as one plain Python list.
cnv_len_column = sw_cluster_cnv_indiv["CNV_LEN"]
print(cnv_len_column.tolist())

[101080, 499114, 136615, 244253, 137559, 1191878, 1236306, 237448, 263118, 168521, 1380084, 132210, 568050, 423322, 2011121, 426650, 193342, 841612, 348567, 292751, 185713, 370653, 219518, 103338, 309690, 140140, 590237, 1276586, 962967, 120001, 118293, 139702, 102318, 196657, 133021, 451163, 202573, 383979, 147108, 204747, 181231, 550256, 789758, 113805, 103343, 260421, 162265, 137559, 181795, 178937, 156048, 579694, 150039, 188213, 152067, 143052, 138146, 183041, 391594, 127614, 1912322, 219391, 167853, 216573, 192539, 117049, 106771, 143424, 136615, 100749, 135663, 379128, 278956, 487524, 183419, 254107, 624598, 438471, 184282, 192059, 1218809, 127174, 110255, 686493, 178020, 161988, 150354, 139702, 328542, 221220, 2110981, 264927, 484516, 180420, 192039, 121530, 160875, 300103, 140541, 239144, 183552, 156340, 120591, 321205, 136901, 810822, 231541, 118276, 1454650, 152067, 198677, 254107, 208690, 102709, 102166, 106711, 158194, 297796, 188388, 149297, 160073, 323920, 112520, 483122

In [14]:
# Load the whitespace-delimited .mds table. The separator regex is a raw
# string because "\s" in a plain string literal is an invalid escape sequence.
sw_mds = pd.read_table("../swcnv/swcnv.mds", sep=r"\s+")

In [15]:
# Row-wise total of the ten columns C1..C10, accumulated left to right
# (C1 + C2, then + C3, ...) so the floating-point result is unchanged.
mds_columns = ["C{}".format(k) for k in range(1, 11)]
sw_mds["C_sum"] = sum(
    (sw_mds[name] for name in mds_columns[1:]),
    sw_mds[mds_columns[0]],
)

In [16]:
# Mean of exp(C_sum) over all rows.
exp_c_sum = np.exp(sw_mds["C_sum"].tolist())
print(np.mean(exp_c_sum))

1.00015782373
