### Table 1 and Figure 3 in CNV in schizophrenia in Sweden

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter
from pandasql import sqldf
from scipy import stats

#### Table 1

In [2]:
sw_pheno = pd.read_table("../data/swcnv/swcnv.pheno", sep="\s+")
sw_sum = pd.read_table("../data/swcnv/swcnv.qc6.cnv.summary", sep="\s+") # "CHR"
sw_fam = pd.read_table("../data/swcnv/swcnv.qc6.fam", sep="\s+", header = None, 
                       usecols = (0,1,4,5), names = ["FID", "IID", "sex", "Case-Ctrl"]) # sex and case-ctrl
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep="\s+")
sw_cluster = pd.read_table("../data/swcnv/swcnv.clusters", sep="\s+") # Sw1, Sw2-4, Sw5-6
sw_cluster = sw_cluster.rename(columns={"uFID": "FID", "uIID": "IID"})
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep="\s+") # "NSEG": number of CNV in each sample
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_cnv = pd.merge(sw_cluster, sw_cnv, how="inner", on=["FID", "IID"])
sw_cluster_cnv_indiv = pd.merge(sw_cluster_cnv, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_cnv_indiv["CNV_LEN"] = (sw_cluster_cnv_indiv["BP2"] - sw_cluster_cnv_indiv["BP1"])/1000

In [3]:
# Number of Cases and Controls, 2: cases, 1: controls
sw_fam.groupby("Case-Ctrl").size()

Case-Ctrl
1    5917
2    4719
dtype: int64

In [4]:
# sex by status
sex_by_casectrl = sw_fam.groupby(["Case-Ctrl", "sex"]).size()
print (sex_by_casectrl)

Case-Ctrl  sex
1          1      3034
           2      2883
2          1      2826
           2      1893
dtype: int64


In [5]:
prp_male_case = sex_by_casectrl[2][1] / (sex_by_casectrl[2][1] + sex_by_casectrl[2][2])
print (prp_male_case)
prp_male_ctrl = sex_by_casectrl[1][1] / (sex_by_casectrl[1][1] + sex_by_casectrl[1][2])
print (prp_male_ctrl)

0.598855689765
0.512759844516


In [6]:
print (sw_cluster_indiv.groupby(["tr", "PHE"]).size())

tr            PHE
scz_s234_eur  1      2137
              2      1847
scz_swe1_eur  1       206
              2       207
scz_swe5_eur  1      2456
              2      1674
scz_swe6_eur  1      1118
              2       991
dtype: int64


In [7]:
# "NSEG": number of CNV in each subject
mean_cnv_case_sw1 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe1_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw1 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe1_eur"]
                            [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw1, mean_cnv_ctrl_sw1)
mean_cnv_case_sw234 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_s234_eur"]
                              [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw234 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_s234_eur"]
                              [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw234, mean_cnv_ctrl_sw234)
mean_cnv_case_sw56 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe5_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist()
                            + sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe6_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw56 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]==("scz_swe5_eur")]
                             [sw_cluster_indiv["PHE"]==1].tolist()
                            + sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe6_eur"]
                            [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw56, mean_cnv_ctrl_sw56)
mean_cnv_case = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case, mean_cnv_ctrl)

1.05797101449 0.844660194175
1.2360584732 1.20870379036
0.758348968105 0.683547845551
0.958465776648 0.878823728241


In [8]:
prp_del_case = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2].tolist()
prp_del_ctrl = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1].tolist()
# 1 is deletion, 3 is duplication
print (prp_del_case.count(1)/len(prp_del_case))
print (prp_del_ctrl.count(1)/len(prp_del_ctrl))

0.38182622153437984
0.38211538461538463


In [9]:
prp_del_case_sw1 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist()
prp_del_ctrl_sw1 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist()
print (prp_del_case_sw1.count(1)/len(prp_del_case_sw1))
print (prp_del_ctrl_sw1.count(1)/len(prp_del_ctrl_sw1))

0.4794520547945205
0.39080459770114945


In [10]:
prp_del_case_sw234 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist()
prp_del_ctrl_sw234 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist()
print (prp_del_case_sw234.count(1)/len(prp_del_case_sw234))
print (prp_del_ctrl_sw234.count(1)/len(prp_del_ctrl_sw234))

0.3587385019710907
0.36624080526519553


In [11]:
prp_del_case_sw56 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist()
prp_del_ctrl_sw56 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist()
print (prp_del_case_sw56.count(1)/len(prp_del_case_sw56))
print (prp_del_ctrl_sw56.count(1)/len(prp_del_ctrl_sw56))

0.39732805541810984
0.3982808022922636


In [12]:
median_cnv_len_case = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2].tolist())
median_cnv_len_ctrl = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1].tolist())
print (median_cnv_len_case)
print (median_cnv_len_ctrl)

188.797
182.711


In [13]:
median_cnv_len_case_sw1 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist())
median_cnv_len_ctrl_sw1 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist())
print (median_cnv_len_case_sw1)
print (median_cnv_len_ctrl_sw1)

178.016
181.313


In [14]:
median_cnv_len_case_sw234 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist())
median_cnv_len_ctrl_sw234 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist())
print (median_cnv_len_case_sw234)
print (median_cnv_len_ctrl_sw234)

188.213
182.211


In [15]:
median_cnv_len_case_sw56 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist())
median_cnv_len_ctrl_sw56 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist())
print (median_cnv_len_case_sw56)
print (median_cnv_len_ctrl_sw56)

190.739
183.127


#### Figure 3

In [16]:
# test = sw_cluster_cnv_indiv[["FID", "PHE", "TYPE"]]
# new = test.loc[test["TYPE"].isin([1,3])]
# new["PHE_NEW"] = new["PHE"] - 1
# print (new)
# new["TYPE"][new["TYPE"]==3] = 0
# print (new)

In [17]:
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_indiv_cnv = pd.merge(sw_cluster_indiv, sw_cnv, how="outer", on=["FID", "IID"])

In [18]:
sw_cluster_indiv_cnv

Unnamed: 0,FID,IID,tr,PHE,NSEG,KB,KBAVG,CHR,BP1,BP2,TYPE,SCORE,SITES
0,PT-BSLD,1,scz_s234_eur,2,0,0.000,0.000,,,,,,
1,PT-BPTP,1,scz_s234_eur,2,1,101.080,101.080,7.0,9128802.0,9229882.0,1.0,65.010,86.0
2,PT-BPAI,1,scz_s234_eur,2,1,499.114,499.114,16.0,18228118.0,18727232.0,3.0,13.130,41.0
3,PT-BSLJ,1,scz_s234_eur,2,1,136.615,136.615,1.0,16869363.0,17005978.0,3.0,38.680,55.0
4,PT-BPIS,1,scz_s234_eur,2,0,0.000,0.000,,,,,,
5,PT-BQP5,1,scz_s234_eur,2,1,244.253,244.253,16.0,18830938.0,19075191.0,3.0,97.890,113.0
6,PT-BSXQ,1,scz_s234_eur,2,0,0.000,0.000,,,,,,
7,PT-BP9I,1,scz_s234_eur,2,1,137.559,137.559,21.0,37481955.0,37619514.0,4.0,147.000,100.0
8,PT-BR4P,1,scz_s234_eur,2,0,0.000,0.000,,,,,,
9,PT-BQGK,1,scz_s234_eur,2,1,1191.880,1191.880,15.0,97984319.0,99176197.0,3.0,811.310,934.0


In [36]:
query = '''
SELECT FID, IID, PHE-1, NSEG, TYPE, tr, CHR, BP1, BP2
FROM sw_cluster_indiv_cnv
GROUP BY FID, IID, PHE-1, NSEG, tr
ORDER BY PHE-1 DESC
'''
# FIXME: needs to combine TYPE
allcnv = sqldf(query)
allcnv = allcnv.rename(columns={"PHE-1": "PHE"})
allcnv["hasCNV"] = allcnv.apply(lambda row: 1 if row["NSEG"]>0 else 0, axis=1)
allcnv

Unnamed: 0,FID,IID,PHE,NSEG,TYPE,tr,CHR,BP1,BP2,hasCNV
0,PT-1RTZ,1,1,0,,scz_swe1_eur,,,,0
1,PT-1RU7,1,1,0,,scz_swe1_eur,,,,0
2,PT-1RUJ,1,1,1,3.0,scz_swe1_eur,7.0,72731641.0,73822588.0,1
3,PT-1RUO,1,1,1,3.0,scz_swe1_eur,9.0,138147150.0,138309006.0,1
4,PT-1RV2,1,1,3,3.0,scz_swe1_eur,6.0,161913437.0,162369838.0,1
5,PT-1RV4,1,1,1,3.0,scz_swe1_eur,17.0,71834001.0,72443710.0,1
6,PT-1RVE,1,1,0,,scz_swe1_eur,,,,0
7,PT-1RVH,1,1,0,,scz_swe1_eur,,,,0
8,PT-1RVS,1,1,0,,scz_swe1_eur,,,,0
9,PT-1RW3,1,1,1,1.0,scz_swe1_eur,2.0,97838130.0,98025643.0,1


In [20]:
import statsmodels.api as sm
new_logit = sm.Logit(allcnv["PHE"], allcnv["hasCNV"])
res = new_logit.fit()
print (res.summary())
print (np.exp(res.params))

Optimization terminated successfully.
         Current function value: 0.691142
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:                    PHE   No. Observations:                10636
Model:                          Logit   Df Residuals:                    10635
Method:                           MLE   Df Model:                            0
Date:                Fri, 14 Jul 2017   Pseudo R-squ.:               -0.006336
Time:                        19:52:30   Log-Likelihood:                -7351.0
converged:                       True   LL-Null:                       -7304.7
                                        LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
hasCNV        -0.1675      0.026     -6.520      0.000      -0.218      -0.117
hasCNV    0.845782
dtype: flo

In [21]:
# all CNV
query = '''
SELECT PHE, TYPE, count(FID)
FROM allcnv
GROUP BY PHE, TYPE
ORDER BY count(FID) DESC
'''
print (sqldf(query))
allcnv.groupby(["PHE", "TYPE"]).size()

   PHE  TYPE  count(FID)
0    0   NaN        2610
1    0   3.0        1969
2    1   NaN        1922
3    1   3.0        1659
4    0   1.0        1288
5    1   1.0        1083
6    1   4.0          50
7    0   4.0          46
8    1   0.0           5
9    0   0.0           4


PHE  TYPE
0    0.0        4
     1.0     1288
     3.0     1969
     4.0       46
1    0.0        5
     1.0     1083
     3.0     1659
     4.0       50
dtype: int64

In [22]:
stats.fisher_exact([[1659+1083+50+5, 1969+1288+46+4], [1922, 2610]])[0]

1.1485380709477924

In [52]:
query = '''
SELECT CHR, BP1, BP2, PHE-1, count(FID)
FROM sw_cluster_indiv_cnv
WHERE NSEG != 0
GROUP BY CHR, BP1, BP2, PHE-1
ORDER BY PHE-1 DESC, count(FID) DESC
'''
n_CNV_occur = sqldf(query)
n_CNV_occur = n_CNV_occur.rename(columns={"PHE-1": "PHE", "count(FID)": "n_CNV_occur"})
print (n_CNV_occur)
# single occurrence CNVs
query = '''
SELECT PHE, count(n_CNV_occur)
FROM n_CNV_occur
WHERE n_CNV_occur == 1
GROUP BY PHE
'''
single_CNV = sqldf(query)
print (single_CNV)
stats.fisher_exact([[2541, 2832], [4719-2541, 5917-2832]])

       CHR          BP1          BP2  PHE  n_CNV_occur
0      3.0   35826707.0   35938795.0    1           21
1     22.0   24283097.0   24396622.0    1           20
2      3.0  100340068.0  100442478.0    1           17
3      1.0  104155643.0  104268222.0    1           15
4     12.0   19474770.0   19576936.0    1           15
5      1.0  104109238.0  104268222.0    1           14
6      3.0   35803775.0   35941770.0    1           14
7      7.0    9128070.0    9229882.0    1           14
8      8.0    2346867.0    2582764.0    1           14
9      1.0  161513759.0  161638530.0    1           12
10     9.0   43315670.0   43800186.0    1           12
11    19.0   20829591.0   20986315.0    1           12
12     6.0   95389290.0   95588052.0    1           11
13     6.0  162724247.0  162914986.0    1           11
14     8.0   16416409.0   16526958.0    1           11
15    10.0   45210690.0   45357798.0    1           11
16     1.0   12852872.0   12961019.0    1           10
17     1.0

(1.2708921845574388, 8.7996377714629825e-10)

In [46]:
# 2-6 occurrence CNVs
query = '''
SELECT PHE, count(n_CNV_occur)
FROM n_CNV_occur
WHERE n_CNV_occur >= 2 AND n_CNV_occur <= 6
GROUP BY PHE
'''
sqldf(query)

Unnamed: 0,PHE,count(n_CNV_occur)
0,0,458
1,1,423


In [None]:
# res1 = pd.DataFrame(columns=("FID", "IID", "PHE", "n_cnv"))
# i = 0
# for tup in sw_indiv.itertuples():
#     if tup.NSEG>0:
#         for d in range(tup.NSEG):
#             res1.loc[i] = [tup.FID, tup.IID, tup.PHE, 1]
#             i += 1
#     else:
#         res1.loc[i] = [tup.FID, tup.IID, tup.PHE, 0]
#         i += 1
# print (res1)

In [None]:
sw_indiv["NSEG"][sw_indiv["NSEG"]!=0].count()

In [None]:
sw_indiv["NSEG"][sw_indiv["NSEG"]==0].count()

In [None]:
sw_mds = pd.read_table("../data/swcnv/swcnv.mds", sep="\s+")

In [None]:
sw_mds["C_sum"] = sw_mds["C1"] + sw_mds["C2"] + sw_mds["C3"] + sw_mds["C4"] + sw_mds["C5"] + sw_mds["C6"] + sw_mds["C7"] + sw_mds["C8"] + sw_mds["C9"] + sw_mds["C10"]

In [None]:
print (np.mean(np.exp(sw_mds["C_sum"].tolist())))