In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter

In [2]:
# Load the swcnv tables. Every file is whitespace-delimited, so the separator
# is the regex \s+; it is written as a raw string so that "\s" is not parsed
# as an (invalid) string escape sequence.
sw_pheno = pd.read_table("../swcnv/swcnv.pheno", sep=r"\s+")
sw_sum = pd.read_table("../swcnv/swcnv.qc6.cnv.summary", sep=r"\s+") # "CHR"
# The .fam file is read without a header row; only columns 0, 1, 4 and 5 are
# kept and given explicit names.
sw_fam = pd.read_table("../swcnv/swcnv.qc6.fam", sep=r"\s+", header=None,
                       usecols=(0, 1, 4, 5), names=["FID", "IID", "sex", "Case-Ctrl"]) # sex and case-ctrl
sw_cnv = pd.read_table("../swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_cluster = pd.read_table("../swcnv/swcnv.clusters", sep=r"\s+") # Sw1, Sw2-4, Sw5-6
# Rename the cluster table's id columns so it can be joined on FID/IID.
sw_cluster = sw_cluster.rename(columns={"uFID": "FID", "uIID": "IID"})
sw_indiv = pd.read_table("../swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+") # "NSEG": number of CNV in each sample

# Inner joins on (FID, IID): rows without a match in both tables are dropped.
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_cnv = pd.merge(sw_cluster, sw_cnv, how="inner", on=["FID", "IID"])
sw_cluster_cnv_indiv = pd.merge(sw_cluster_cnv, sw_indiv, how="inner", on=["FID", "IID"])
# CNV_LEN is (BP2 - BP1) / 1000 -- presumably kilobases if BP1/BP2 are
# base-pair coordinates; TODO confirm.
sw_cluster_cnv_indiv["CNV_LEN"] = (sw_cluster_cnv_indiv["BP2"] - sw_cluster_cnv_indiv["BP1"])/1000

In [3]:
# Subjects per Case-Ctrl code in the .fam table (2: cases, 1: controls).
case_ctrl_counts = sw_fam.groupby("Case-Ctrl").size()
case_ctrl_counts

Case-Ctrl
1    5917
2    4719
dtype: int64

In [4]:
# Row counts for every (Case-Ctrl, sex) combination in the .fam table.
status_sex_keys = ["Case-Ctrl", "sex"]
sex_by_casectrl = sw_fam.groupby(status_sex_keys).size()
print(sex_by_casectrl)

Case-Ctrl  sex
1          1      3034
           2      2883
2          1      2826
           2      1893
dtype: int64


In [5]:
# Share of sex code 1 within each Case-Ctrl group (2 first, then 1).
# The denominator adds only the counts for sex codes 1 and 2.
case_sex1 = sex_by_casectrl.loc[(2, 1)]
case_sex2 = sex_by_casectrl.loc[(2, 2)]
prp_male_case = case_sex1 / (case_sex1 + case_sex2)
print(prp_male_case)

ctrl_sex1 = sex_by_casectrl.loc[(1, 1)]
ctrl_sex2 = sex_by_casectrl.loc[(1, 2)]
prp_male_ctrl = ctrl_sex1 / (ctrl_sex1 + ctrl_sex2)
print(prp_male_ctrl)

0.598855689765
0.512759844516


In [6]:
# Number of rows for every (tr, PHE) combination in the cluster/indiv join.
cohort_phe_counts = sw_cluster_indiv.groupby(["tr", "PHE"]).size()
print(cohort_phe_counts)

tr            PHE
scz_s234_eur  1      2137
              2      1847
scz_swe1_eur  1       206
              2       207
scz_swe5_eur  1      2456
              2      1674
scz_swe6_eur  1      1118
              2       991
dtype: int64


In [7]:
# "NSEG": number of CNV in each subject
def _mean_nseg(indiv, phe, cohorts=None):
    """Return the mean of NSEG over the rows of `indiv` with PHE == `phe`.

    indiv   -- DataFrame with "NSEG", "PHE" and "tr" columns.
    phe     -- PHE code to keep (this notebook uses 2 for cases, 1 for controls).
    cohorts -- optional list of "tr" labels to pool together; None keeps
               every row regardless of "tr".

    Both conditions are combined into one full-length boolean mask, so no
    already-filtered Series is indexed with a mask built on the full frame.
    """
    keep = indiv["PHE"] == phe
    if cohorts is not None:
        keep = keep & indiv["tr"].isin(cohorts)
    return np.mean(indiv.loc[keep, "NSEG"].values)


mean_cnv_case_sw1 = _mean_nseg(sw_cluster_indiv, 2, ["scz_swe1_eur"])
mean_cnv_ctrl_sw1 = _mean_nseg(sw_cluster_indiv, 1, ["scz_swe1_eur"])
print(mean_cnv_case_sw1, mean_cnv_ctrl_sw1)

mean_cnv_case_sw234 = _mean_nseg(sw_cluster_indiv, 2, ["scz_s234_eur"])
mean_cnv_ctrl_sw234 = _mean_nseg(sw_cluster_indiv, 1, ["scz_s234_eur"])
print(mean_cnv_case_sw234, mean_cnv_ctrl_sw234)

# scz_swe5_eur and scz_swe6_eur are pooled into a single group.
mean_cnv_case_sw56 = _mean_nseg(sw_cluster_indiv, 2, ["scz_swe5_eur", "scz_swe6_eur"])
mean_cnv_ctrl_sw56 = _mean_nseg(sw_cluster_indiv, 1, ["scz_swe5_eur", "scz_swe6_eur"])
print(mean_cnv_case_sw56, mean_cnv_ctrl_sw56)

# All cohorts together.
mean_cnv_case = _mean_nseg(sw_cluster_indiv, 2)
mean_cnv_ctrl = _mean_nseg(sw_cluster_indiv, 1)
print(mean_cnv_case, mean_cnv_ctrl)

1.05797101449 0.844660194175
1.2360584732 1.20870379036
0.758348968105 0.683547845551
0.958465776648 0.878823728241


In [8]:
# TYPE values of every CNV call, split by PHE (2: case, 1: control).
cnv_type = sw_cluster_cnv_indiv["TYPE"]
cnv_phe = sw_cluster_cnv_indiv["PHE"]
prp_del_case = cnv_type[cnv_phe == 2].tolist()
prp_del_ctrl = cnv_type[cnv_phe == 1].tolist()
# Fraction of calls whose TYPE equals 1.
# NOTE(review): only TYPE == 1 is counted here; confirm whether other TYPE
# codes in this data should also be treated as deletions.
print(prp_del_case.count(1) / len(prp_del_case))
print(prp_del_ctrl.count(1) / len(prp_del_ctrl))

0.38182622153437984
0.38211538461538463


In [9]:
# TYPE values of the scz_swe1_eur CNV calls, split by PHE (2: case, 1: control).
# The PHE and cohort conditions are combined into one full-length mask rather
# than indexing an already-filtered Series with a mask built on the full frame.
in_sw1 = sw_cluster_cnv_indiv["tr"] == "scz_swe1_eur"
prp_del_case_sw1 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw1, "TYPE"].tolist()
prp_del_ctrl_sw1 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw1, "TYPE"].tolist()
# Fraction of calls whose TYPE equals 1.
print(prp_del_case_sw1.count(1) / len(prp_del_case_sw1))
print(prp_del_ctrl_sw1.count(1) / len(prp_del_ctrl_sw1))

0.4794520547945205
0.39080459770114945


In [10]:
# TYPE values of the scz_s234_eur CNV calls, split by PHE (2: case, 1: control).
# The PHE and cohort conditions are combined into one full-length mask rather
# than indexing an already-filtered Series with a mask built on the full frame.
in_sw234 = sw_cluster_cnv_indiv["tr"] == "scz_s234_eur"
prp_del_case_sw234 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw234, "TYPE"].tolist()
prp_del_ctrl_sw234 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw234, "TYPE"].tolist()
# Fraction of calls whose TYPE equals 1.
print(prp_del_case_sw234.count(1) / len(prp_del_case_sw234))
print(prp_del_ctrl_sw234.count(1) / len(prp_del_ctrl_sw234))

0.3587385019710907
0.36624080526519553


In [11]:
# TYPE values of the pooled scz_swe5_eur + scz_swe6_eur CNV calls, split by
# PHE (2: case, 1: control). One combined mask selects both cohorts at once;
# the lists are in frame row order, which does not affect the fractions below.
in_sw56 = sw_cluster_cnv_indiv["tr"].isin(["scz_swe5_eur", "scz_swe6_eur"])
prp_del_case_sw56 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw56, "TYPE"].tolist()
prp_del_ctrl_sw56 = sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw56, "TYPE"].tolist()
# Fraction of calls whose TYPE equals 1.
print(prp_del_case_sw56.count(1) / len(prp_del_case_sw56))
print(prp_del_ctrl_sw56.count(1) / len(prp_del_ctrl_sw56))

0.39732805541810984
0.3982808022922636


In [12]:
# Median CNV_LEN over all CNV calls, split by PHE (2: case, 1: control).
cnv_len = sw_cluster_cnv_indiv["CNV_LEN"]
len_case = cnv_len[sw_cluster_cnv_indiv["PHE"] == 2].tolist()
len_ctrl = cnv_len[sw_cluster_cnv_indiv["PHE"] == 1].tolist()
median_cnv_len_case = np.median(len_case)
median_cnv_len_ctrl = np.median(len_ctrl)
print(median_cnv_len_case)
print(median_cnv_len_ctrl)

188.797
182.711


In [13]:
# Median CNV_LEN of the scz_swe1_eur CNV calls, split by PHE (2: case, 1: control).
# The PHE and cohort conditions are combined into one full-length mask rather
# than indexing an already-filtered Series with a mask built on the full frame.
in_sw1_len = sw_cluster_cnv_indiv["tr"] == "scz_swe1_eur"
median_cnv_len_case_sw1 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw1_len, "CNV_LEN"].values)
median_cnv_len_ctrl_sw1 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw1_len, "CNV_LEN"].values)
print(median_cnv_len_case_sw1)
print(median_cnv_len_ctrl_sw1)

178.016
181.313


In [14]:
# Median CNV_LEN of the scz_s234_eur CNV calls, split by PHE (2: case, 1: control).
# The PHE and cohort conditions are combined into one full-length mask rather
# than indexing an already-filtered Series with a mask built on the full frame.
in_sw234_len = sw_cluster_cnv_indiv["tr"] == "scz_s234_eur"
median_cnv_len_case_sw234 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw234_len, "CNV_LEN"].values)
median_cnv_len_ctrl_sw234 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw234_len, "CNV_LEN"].values)
print(median_cnv_len_case_sw234)
print(median_cnv_len_ctrl_sw234)

188.213
182.211


In [15]:
# Median CNV_LEN of the pooled scz_swe5_eur + scz_swe6_eur CNV calls, split by
# PHE (2: case, 1: control). One combined mask selects both cohorts at once
# instead of concatenating two separately filtered lists.
in_sw56_len = sw_cluster_cnv_indiv["tr"].isin(["scz_swe5_eur", "scz_swe6_eur"])
median_cnv_len_case_sw56 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 2) & in_sw56_len, "CNV_LEN"].values)
median_cnv_len_ctrl_sw56 = np.median(
    sw_cluster_cnv_indiv.loc[(sw_cluster_cnv_indiv["PHE"] == 1) & in_sw56_len, "CNV_LEN"].values)
print(median_cnv_len_case_sw56)
print(median_cnv_len_ctrl_sw56)

190.739
183.127


In [16]:
# Whitespace-delimited table; the regex separator is a raw string so "\s" is
# not parsed as an (invalid) string escape sequence.
sw_mds = pd.read_table("../swcnv/swcnv.mds", sep=r"\s+")

In [17]:
# C_sum is the row-wise sum of columns C1..C10, accumulated left to right.
component_cols = ["C%d" % k for k in range(1, 11)]
c_sum = sw_mds[component_cols[0]]
for component in component_cols[1:]:
    c_sum = c_sum + sw_mds[component]
sw_mds["C_sum"] = c_sum

In [18]:
# Mean of exp(C_sum) over all rows of sw_mds.
# NOTE(review): the purpose of this statistic is not evident from the notebook.
exp_c_sum = np.exp(sw_mds["C_sum"].tolist())
print(np.mean(exp_c_sum))

1.00015782373
