### Table 1 and Figure 3 in CNV in schizophrenia in Sweden

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter
from pandasql import sqldf
from scipy import stats
import statsmodels.api as sm

#### Table 1

In [2]:
sw_pheno = pd.read_table("../data/swcnv/swcnv.pheno", sep="\s+")
sw_sum = pd.read_table("../data/swcnv/swcnv.qc6.cnv.summary", sep="\s+") # "CHR"
sw_fam = pd.read_table("../data/swcnv/swcnv.qc6.fam", sep="\s+", header = None, 
                       usecols = (0,1,4,5), names = ["FID", "IID", "sex", "Case-Ctrl"]) # sex and case-ctrl
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep="\s+")
sw_cluster = pd.read_table("../data/swcnv/swcnv.clusters", sep="\s+") # Sw1, Sw2-4, Sw5-6
sw_cluster = sw_cluster.rename(columns={"uFID": "FID", "uIID": "IID"})
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep="\s+") # "NSEG": number of CNV in each sample
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_cnv = pd.merge(sw_cluster, sw_cnv, how="inner", on=["FID", "IID"])
# One line for each individual with at least one CNV
sw_cluster_cnv_indiv = pd.merge(sw_cluster_cnv, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_cnv_indiv["PHE"] = sw_cluster_cnv_indiv.apply(lambda row: 1 if row["PHE"]==2 else 0, axis=1)
sw_cluster_cnv_indiv["CNV_LEN"] = (sw_cluster_cnv_indiv["BP2"] - sw_cluster_cnv_indiv["BP1"])/1000

In [3]:
# Number of Cases and Controls, 2: cases, 1: controls
sw_fam.groupby("Case-Ctrl").size()

Case-Ctrl
1    5917
2    4719
dtype: int64

In [4]:
# sex by status
sex_by_casectrl = sw_fam.groupby(["Case-Ctrl", "sex"]).size()
print (sex_by_casectrl)

Case-Ctrl  sex
1          1      3034
           2      2883
2          1      2826
           2      1893
dtype: int64


In [5]:
prp_male_case = sex_by_casectrl[2][1] / (sex_by_casectrl[2][1] + sex_by_casectrl[2][2])
print (prp_male_case)
prp_male_ctrl = sex_by_casectrl[1][1] / (sex_by_casectrl[1][1] + sex_by_casectrl[1][2])
print (prp_male_ctrl)

0.598855689765
0.512759844516


In [6]:
print (sw_cluster_indiv.groupby(["tr", "PHE"]).size())

tr            PHE
scz_s234_eur  1      2137
              2      1847
scz_swe1_eur  1       206
              2       207
scz_swe5_eur  1      2456
              2      1674
scz_swe6_eur  1      1118
              2       991
dtype: int64


In [7]:
# "NSEG": number of CNV in each subject
mean_cnv_case_sw1 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe1_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw1 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe1_eur"]
                            [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw1, mean_cnv_ctrl_sw1)
mean_cnv_case_sw234 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_s234_eur"]
                              [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw234 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_s234_eur"]
                              [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw234, mean_cnv_ctrl_sw234)
mean_cnv_case_sw56 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe5_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist()
                            + sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe6_eur"]
                            [sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl_sw56 = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]==("scz_swe5_eur")]
                             [sw_cluster_indiv["PHE"]==1].tolist()
                            + sw_cluster_indiv["NSEG"][sw_cluster_indiv["tr"]=="scz_swe6_eur"]
                            [sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case_sw56, mean_cnv_ctrl_sw56)
mean_cnv_case = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["PHE"]==2].tolist())
mean_cnv_ctrl = np.mean(sw_cluster_indiv["NSEG"][sw_cluster_indiv["PHE"]==1].tolist())
print (mean_cnv_case, mean_cnv_ctrl)

1.05797101449 0.844660194175
1.2360584732 1.20870379036
0.758348968105 0.683547845551
0.958465776648 0.878823728241


In [8]:
prp_del_case = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1].tolist()
prp_del_ctrl = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==0].tolist()
# 1 is deletion, 3 is duplication
print (prp_del_case.count(1)/len(prp_del_case))
print (prp_del_ctrl.count(1)/len(prp_del_ctrl))

0.38182622153437984
0.38211538461538463


In [9]:
prp_del_case_sw1 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist()
prp_del_ctrl_sw1 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist()
print (prp_del_case_sw1.count(1)/len(prp_del_case_sw1))
print (prp_del_ctrl_sw1.count(1)/len(prp_del_ctrl_sw1))

0.4794520547945205
0.39080459770114945


In [10]:
prp_del_case_sw234 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist()
prp_del_ctrl_sw234 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist()
print (prp_del_case_sw234.count(1)/len(prp_del_case_sw234))
print (prp_del_ctrl_sw234.count(1)/len(prp_del_ctrl_sw234))

0.3587385019710907
0.36624080526519553


In [11]:
prp_del_case_sw56 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist()
prp_del_ctrl_sw56 = sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["TYPE"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist()
print (prp_del_case_sw56.count(1)/len(prp_del_case_sw56))
print (prp_del_ctrl_sw56.count(1)/len(prp_del_ctrl_sw56))

0.39864355689525244
0.4088235294117647


In [12]:
median_cnv_len_case = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1].tolist())
median_cnv_len_ctrl = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==0].tolist())
print (median_cnv_len_case)
print (median_cnv_len_ctrl)

188.797
182.711


In [13]:
median_cnv_len_case_sw1 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist())
median_cnv_len_ctrl_sw1 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_swe1_eur"].tolist())
print (median_cnv_len_case_sw1)
print (median_cnv_len_ctrl_sw1)

178.016
181.313


In [14]:
median_cnv_len_case_sw234 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist())
median_cnv_len_ctrl_sw234 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_s234_eur"].tolist())
print (median_cnv_len_case_sw234)
print (median_cnv_len_ctrl_sw234)

188.213
182.211


In [15]:
median_cnv_len_case_sw56 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==2][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist())
median_cnv_len_ctrl_sw56 = np.median(sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==0][sw_cluster_cnv_indiv["tr"]=="scz_swe5_eur"].tolist() + sw_cluster_cnv_indiv["CNV_LEN"][sw_cluster_cnv_indiv["PHE"]==1][sw_cluster_cnv_indiv["tr"]=="scz_swe6_eur"].tolist())
print (median_cnv_len_case_sw56)
print (median_cnv_len_ctrl_sw56)

191.854
182.507


#### Figure 3

In [16]:
sw_cluster_indiv = pd.merge(sw_cluster, sw_indiv, how="inner", on=["FID", "IID"])
sw_cluster_indiv_cnv = pd.merge(sw_cluster_indiv, sw_cnv, how="outer", on=["FID", "IID"])
sw_cluster_indiv_cnv["PHE"] = sw_cluster_indiv_cnv.apply(lambda row: 1 if row["PHE"]==2 else 0, axis=1)
# sw_cluster_indiv_cnv

In [17]:
# One line for each individual, with CNV or without CNV
query = '''
SELECT FID, IID, PHE, NSEG, TYPE, tr
FROM sw_cluster_indiv_cnv
GROUP BY FID, IID, PHE, NSEG, tr
ORDER BY PHE DESC
'''
# FIXME: needs to combine TYPE
sw_indiv_adj = sqldf(query)
# sw_indiv_adj = sw_indiv_adj.rename(columns={"PHE_NEW": "PHE"})
sw_indiv_adj["hasCNV"] = sw_indiv_adj.apply(lambda row: 1 if row["NSEG"]>0 else 0, axis=1)
sw_indiv_adj["tr_adj"] = sw_indiv_adj.apply(lambda row: 1 if row["tr"]=="scz_swe1_eur" 
                                else 2 if row["tr"]=="scz_s234_eur" else 3, axis=1)
sw_indiv_adj

Unnamed: 0,FID,IID,PHE,NSEG,TYPE,tr,hasCNV,tr_adj
0,PT-1RTZ,1,1,0,,scz_swe1_eur,0,1
1,PT-1RU7,1,1,0,,scz_swe1_eur,0,1
2,PT-1RUJ,1,1,1,3.0,scz_swe1_eur,1,1
3,PT-1RUO,1,1,1,3.0,scz_swe1_eur,1,1
4,PT-1RV2,1,1,3,3.0,scz_swe1_eur,1,1
5,PT-1RV4,1,1,1,3.0,scz_swe1_eur,1,1
6,PT-1RVE,1,1,0,,scz_swe1_eur,0,1
7,PT-1RVH,1,1,0,,scz_swe1_eur,0,1
8,PT-1RVS,1,1,0,,scz_swe1_eur,0,1
9,PT-1RW3,1,1,1,1.0,scz_swe1_eur,1,1


In [18]:
# all CNVs logistic regression
indep_cols = sw_indiv_adj[["NSEG", "tr_adj"]]
all_CNV_logit = sm.Logit(sw_indiv_adj["PHE"], indep_cols)
res = all_CNV_logit.fit()
print (res.summary())
print (np.exp(res.params))

Optimization terminated successfully.
         Current function value: 0.685450
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                    PHE   No. Observations:                10636
Model:                          Logit   Df Residuals:                    10634
Method:                           MLE   Df Model:                            1
Date:                Tue, 18 Jul 2017   Pseudo R-squ.:                0.001951
Time:                        19:49:47   Log-Likelihood:                -7290.4
converged:                       True   LL-Null:                       -7304.7
                                        LLR p-value:                 9.354e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
NSEG           0.0634      0.018      3.543      0.000       0.028       0.098
tr_adj        -0.1124      0.

In [19]:
# all CNV OR, Fisher's test
query = '''
SELECT PHE, TYPE, count(FID)
FROM sw_indiv_adj
GROUP BY PHE, TYPE
ORDER BY count(FID) DESC
'''
print (sqldf(query))
print (sw_indiv_adj.groupby(["PHE", "TYPE"]).size())
stats.fisher_exact([[1659+1083+50+5, 1969+1288+46+4], [1922, 2610]])[0]

   PHE  TYPE  count(FID)
0    0   NaN        2610
1    0   3.0        1969
2    1   NaN        1922
3    1   3.0        1659
4    0   1.0        1288
5    1   1.0        1083
6    1   4.0          50
7    0   4.0          46
8    1   0.0           5
9    0   0.0           4
PHE  TYPE
0    0.0        4
     1.0     1288
     3.0     1969
     4.0       46
1    0.0        5
     1.0     1083
     3.0     1659
     4.0       50
dtype: int64


1.1485380709477924

In [20]:
sw_cluster_cnv_indiv

Unnamed: 0,FID,IID,tr,CHR,BP1,BP2,TYPE,SCORE,SITES,PHE,NSEG,KB,KBAVG,CNV_LEN
0,PT-BPTP,1,scz_s234_eur,7,9128802,9229882,1,65.0100,86,1,1,101.080,101.080,101.080
1,PT-BPAI,1,scz_s234_eur,16,18228118,18727232,3,13.1300,41,1,1,499.114,499.114,499.114
2,PT-BSLJ,1,scz_s234_eur,1,16869363,17005978,3,38.6800,55,1,1,136.615,136.615,136.615
3,PT-BQP5,1,scz_s234_eur,16,18830938,19075191,3,97.8900,113,1,1,244.253,244.253,244.253
4,PT-BP9I,1,scz_s234_eur,21,37481955,37619514,4,147.0000,100,1,1,137.559,137.559,137.559
5,PT-BQGK,1,scz_s234_eur,15,97984319,99176197,3,811.3100,934,1,1,1191.880,1191.880,1191.878
6,PT-BSRL,1,scz_s234_eur,11,54773893,56010199,3,436.6850,623,1,1,1236.310,1236.310,1236.306
7,PT-BPTX,1,scz_s234_eur,6,119714337,119951785,1,171.4600,133,1,1,237.448,237.448,237.448
8,PT-BPYW,1,scz_s234_eur,9,11919831,12182949,1,313.6600,223,1,1,263.118,263.118,263.118
9,PT-BRL3,1,scz_s234_eur,16,15072787,15241308,3,41.4400,62,1,2,1548.610,774.303,168.521


In [21]:
sw_cnv_indiv = pd.merge(sw_cnv, sw_indiv, how="inner", on=["FID", "IID"])
sw_cnv_indiv["PHE"] = sw_cnv_indiv.apply(lambda row: 1 if row["PHE"]==2 else 0, axis=1)
# the occurrence of each CNV, no matter in cases or controls
query = '''
SELECT FID, CHR, BP1, BP2, PHE, count(FID)
FROM sw_cnv_indiv
WHERE NSEG != 0
GROUP BY CHR, BP1, BP2
ORDER BY CHR
'''
n_CNV_occur = sqldf(query)
n_CNV_occur = n_CNV_occur.rename(columns={"count(FID)": "n_CNV_occur"})
# print (n_CNV_occur)
### single occurrence CNVs
single_CNV = n_CNV_occur[n_CNV_occur["n_CNV_occur"]==1]
print (single_CNV)

          FID  CHR        BP1        BP2  PHE  n_CNV_occur
0     PT-ERQ6    1     768448     894573    1            1
1     PT-8UXN    1     824136    1017216    1            1
2     PT-L1HP    1    1106784    1220136    1            1
3     PT-L1HP    1    1307872    1450947    1            1
5     PT-9ZDV    1    2251160    2982621    0            1
6     PT-9YXD    1    2257863    2554287    1            1
7     PT-BR57    1    2325371    3054268    1            1
8     PT-286U    1    2363327    2468400    1            1
10    PT-L2FZ    1    2700372    2817421    0            1
11    PT-FFPM    1    2700372    2891713    0            1
12    PT-8K6T    1    2700372    3208375    0            1
13    PT-285Y    1    2701816    3126644    1            1
14    PT-FFQE    1    2727804    2891713    1            1
15    PT-FFQE    1    3000465    3135175    1            1
16    PT-28GB    1    3222113    3504073    1            1
17    PT-8UZK    1    3261682    3512516    1           

In [22]:
i = 0
single_CNV_test = pd.DataFrame(columns=("FID", "CHR", "BP1", "BP2", "PHE", "n_CNV_occur"))
for index, row in enumerate(single_CNV.values):
    if index < single_CNV.shape[0]-1 and index >= 1 and row[1] == single_CNV.values[index+1][1] == single_CNV.values[index-1][1] and row[2] >= single_CNV.values[index-1][3] and row[2] <= single_CNV.values[index+1][2] and row[3] <= single_CNV.values[index+1][2] and row[3] >= single_CNV.values[index-1][3] and row[3] <= single_CNV.values[index+1][3]:
        single_CNV_test.loc[i] = row
        i += 1
    else: continue
pd.options.display.float_format = '{:,.0f}'.format
single_CNV_test

Unnamed: 0,FID,CHR,BP1,BP2,PHE,n_CNV_occur
0,PT-L1HP,1,1106784,1220136,1,1
1,PT-L1HP,1,1307872,1450947,1,1
2,PT-FFQE,1,3000465,3135175,1,1
3,PT-1RU6,1,4116604,4614822,0,1
4,PT-8TK1,1,7656503,7952404,0,1
5,PT-ESXF,1,9070930,9182914,1,1
6,PT-BRMI,1,10846558,11100454,1,1
7,PT-BQZR,1,16096756,16731510,1,1
8,PT-OOJX,1,19586294,19798007,0,1
9,PT-3MNA,1,35453372,35575946,1,1


In [46]:
single_CNV_test1 = pd.merge(sw_indiv_adj, single_CNV_test, how="inner", on=["FID", "PHE"])
# print (single_CNV_test1)
query = '''
SELECT FID, PHE, tr_adj, count(n_CNV_occur)
FROM single_CNV_test1
GROUP BY FID
ORDER BY count(n_CNV_occur) DESC
'''
single_CNV_test2 = sqldf(query)
single_CNV_test2 = single_CNV_test2.rename(columns={"count(n_CNV_occur)": "single_CNV_count"})
print (single_CNV_test2)

              FID  PHE  tr_adj  single_CNV_count
0         PT-1S18    1       1                 5
1         PT-OQ3N    1       3                 4
2         PT-1S2B    0       1                 3
3         PT-28GE    1       2                 3
4         PT-2M7L    1       2                 3
5         PT-9ZSS    1       2                 3
6         PT-L1X3    0       3                 3
7         PT-L2JZ    1       3                 3
8         PT-OOZT    0       3                 3
9         PT-1SCR    1       1                 2
10        PT-286U    1       2                 2
11        PT-3MHC    0       2                 2
12        PT-3MNV    1       2                 2
13        PT-8K6N    0       3                 2
14        PT-8K7B    0       3                 2
15        PT-8TFH    1       2                 2
16        PT-8U7T    1       2                 2
17        PT-8UZK    1       2                 2
18   PT-8VHE_dup1    1       3                 2
19        PT-8VSE   

In [50]:
indep_cols2 = single_CNV_test2[["single_CNV_count", "tr_adj"]]
single_CNV_logit_test = sm.Logit(single_CNV_test2["PHE"], indep_cols2)
res2 = single_CNV_logit_test.fit()
print (res2.summary())
print (float(np.exp(res2.params[0])))

Optimization terminated successfully.
         Current function value: 0.688422
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                    PHE   No. Observations:                  868
Model:                          Logit   Df Residuals:                      866
Method:                           MLE   Df Model:                            1
Date:                Tue, 18 Jul 2017   Pseudo R-squ.:                0.006508
Time:                        19:54:17   Log-Likelihood:                -597.55
converged:                       True   LL-Null:                       -601.47
                                        LLR p-value:                  0.005141
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
single_CNV_count     0.4538      0.173      2.618      0.009       0.114       0.793
tr_adj     

In [25]:
single_CNV_allindiv = pd.merge(sw_indiv_adj, single_CNV, how="outer", on=["FID", "PHE"])
print (single_CNV_allindiv)
query = '''
SELECT FID, PHE, NSEG, tr_adj, count(n_CNV_occur)
FROM single_CNV_allindiv
GROUP BY FID
ORDER BY count(n_CNV_occur) DESC
'''
single_CNV_indiv = sqldf(query)
single_CNV_indiv = single_CNV_indiv.rename(columns={"count(n_CNV_occur)": "single_CNV_count"})
single_CNV_indiv["tmp"] = single_CNV_indiv.apply(lambda row: 1 if row["single_CNV_count"]>0 else 0, axis=1)
print (single_CNV_indiv)

           FID  IID  PHE  NSEG  TYPE            tr  hasCNV  tr_adj  CHR  \
0      PT-1RTZ    1    1     0   nan  scz_swe1_eur       0       1  nan   
1      PT-1RU7    1    1     0   nan  scz_swe1_eur       0       1  nan   
2      PT-1RUJ    1    1     1     3  scz_swe1_eur       1       1    7   
3      PT-1RUO    1    1     1     3  scz_swe1_eur       1       1  nan   
4      PT-1RV2    1    1     3     3  scz_swe1_eur       1       1    2   
5      PT-1RV2    1    1     3     3  scz_swe1_eur       1       1    6   
6      PT-1RV4    1    1     1     3  scz_swe1_eur       1       1   17   
7      PT-1RVE    1    1     0   nan  scz_swe1_eur       0       1  nan   
8      PT-1RVH    1    1     0   nan  scz_swe1_eur       0       1  nan   
9      PT-1RVS    1    1     0   nan  scz_swe1_eur       0       1  nan   
10     PT-1RW3    1    1     1     1  scz_swe1_eur       1       1    2   
11     PT-1RW7    1    1     2     1  scz_swe1_eur       1       1    2   
12     PT-1RW7    1    1 

In [43]:
indep_cols1 = single_CNV_indiv[["single_CNV_count", "tr_adj"]]
single_CNV_logit = sm.Logit(single_CNV_indiv["PHE"], indep_cols1)
res1 = single_CNV_logit.fit()
print (res1.summary())
print (np.exp(res1.params[0]))

Optimization terminated successfully.
         Current function value: 0.685408
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                    PHE   No. Observations:                10636
Model:                          Logit   Df Residuals:                    10634
Method:                           MLE   Df Model:                            1
Date:                Tue, 18 Jul 2017   Pseudo R-squ.:                0.002013
Time:                        19:51:51   Log-Likelihood:                -7290.0
converged:                       True   LL-Null:                       -7304.7
                                        LLR p-value:                 5.873e-08
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
single_CNV_count     0.0901      0.025      3.667      0.000       0.042       0.138
tr_adj     

In [27]:
n_CNV_occur.groupby(["n_CNV_occur"]).size()

n_CNV_occur
1     5373
2      529
3      167
4       92
5       44
6       43
7       25
8       24
9       14
10      13
11       5
12      11
13       3
14       9
15       5
16       1
17       3
18       5
19       5
20       5
22       3
24       3
27       1
28       1
29       1
30       1
31       1
32       1
33       1
34       1
35       2
40       1
51       1
dtype: int64

In [28]:
n_CNV_occur["n_CNV_occur"].sum()

9723

In [29]:
# the occurrence of each CNV, in cases or controls separately
query = '''
SELECT FID, CHR, BP1, BP2, PHE, count(FID)
FROM sw_cluster_indiv_cnv
WHERE NSEG != 0
GROUP BY CHR, BP1, BP2, PHE
ORDER BY count(FID) DESC
'''
n_CNV_occur_sep = sqldf(query)
n_CNV_occur_sep = n_CNV_occur_sep.rename(columns={"count(FID)": "n_CNV_occur"})
print (n_CNV_occur_sep)
# 2-6 occurrence CNVs
query = '''
SELECT PHE, CHR, BP1, BP2, count(n_CNV_occur)
FROM n_CNV_occur_sep
WHERE n_CNV_occur >= 2 AND n_CNV_occur <= 6
GROUP BY PHE, CHR, BP1, BP2
ORDER BY count(n_CNV_occur) DESC
'''
sqldf(query)

          FID  CHR         BP1         BP2  PHE  n_CNV_occur
0     PT-P29N    3  35,826,707  35,938,795    0           30
1     PT-OPVY    7   9,128,070   9,229,882    0           26
2     PT-OQ36    7  64,679,561  65,088,807    0           24
3     PT-CDFS    1 104,109,238 104,268,222    0           21
4     PT-3MID    1 161,496,900 161,638,530    0           21
5     PT-P29T    3  35,826,707  35,938,795    1           21
6     PT-1TC3   22  24,283,097  24,396,622    1           20
7     PT-CDH4   12  19,474,770  19,576,936    0           19
8     PT-P298    8   2,346,867   2,582,764    0           18
9     PT-29CN    3 100,340,068 100,442,478    1           17
10    PT-OPJ6    7   9,127,173   9,229,882    0           17
11    PT-OQ4D    8  16,416,409  16,526,958    0           17
12    PT-BQG2    9  43,315,670  43,800,186    0           17
13    PT-BQOS    1 104,155,643 104,268,222    1           15
14    PT-9ZDN   12  19,474,770  19,576,936    1           15
15    PT-1TB8   22  24,2

Unnamed: 0,PHE,CHR,BP1,BP2,count(n_CNV_occur)
0,0,1,12852748,13015495,1
1,0,1,12858053,12961019,1
2,0,1,12867090,13015495,1
3,0,1,16844932,17262247,1
4,0,1,16869363,16970456,1
5,0,1,16869363,16986851,1
6,0,1,16869363,17005978,1
7,0,1,16886135,16986851,1
8,0,1,16886135,17082591,1
9,0,1,16886135,17114712,1


In [30]:
# res1 = pd.DataFrame(columns=("FID", "IID", "PHE", "n_cnv"))
# i = 0
# for tup in sw_indiv.itertuples():
#     if tup.NSEG>0:
#         for d in range(tup.NSEG):
#             res1.loc[i] = [tup.FID, tup.IID, tup.PHE, 1]
#             i += 1
#     else:
#         res1.loc[i] = [tup.FID, tup.IID, tup.PHE, 0]
#         i += 1
# print (res1)

In [31]:
sw_indiv["NSEG"][sw_indiv["NSEG"]!=0].count()

6104

In [32]:
sw_indiv["NSEG"][sw_indiv["NSEG"]==0].count()

4532

In [33]:
single_CNV.shape[0]

5373

In [34]:
sw_mds = pd.read_table("../data/swcnv/swcnv.mds", sep="\s+")

In [35]:
sw_mds["C_sum"] = sw_mds["C1"] + sw_mds["C2"] + sw_mds["C3"] + sw_mds["C4"] + sw_mds["C5"] + sw_mds["C6"] + sw_mds["C7"] + sw_mds["C8"] + sw_mds["C9"] + sw_mds["C10"]

In [36]:
print (np.mean(np.exp(sw_mds["C_sum"].tolist())))

1.00015782373
