In [1]:
### Script to annotate gCNV calls from the PGC SCZ CNV call set.

import pysam
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import numpy as np
import scipy.stats
import pybedtools as pybed
import sys
import subprocess
import re


from utils.SegTools import *
from utils.vcf_tools_eam0 import *


sns.set_style('ticks')
pd.set_option('display.max_columns', None)


# Load the lifted-over PGC SCZ CNV call set: a headerless, tab-separated file
# whose six columns are named here.
df = pd.read_csv('../data/metadata/PGC_SCZ_callset_cnv_liftOver.bed',
                 names=["CHROM","BEG","END","sampleInfo","diagnosis","TYPE"], sep="\t")

# RETYPE is "DUP" where TYPE == "G" and "DEL" for every other TYPE value
# (presumably G = gain, L = loss -- confirm against the call-set documentation).
# A single .loc call writes into df itself; the previous chained form
# (df.RETYPE.loc[...] = ...) raised SettingWithCopyWarning and is not
# guaranteed to update df.
df["RETYPE"] = "DEL"
df.loc[df.TYPE == "G", "RETYPE"] = "DUP"

# sampleInfo is "*"-delimited; its first field, with "cas_" removed, is the batch.
sample_info = df["sampleInfo"].str.split("*",n=2,expand=True)
sample_info[0] = sample_info[0].str.replace("cas_","")

df["batch"] = sample_info[0]
# Drop the "chr" prefix so chromosome names match the prefix-free annotation tables.
df.CHROM = df.CHROM.str.replace('chr','')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [2]:

#################################################
#### Cytoband and gene name annotation
#################################################
# Coordinates only, with the column names passed to SegTool. rename() without
# inplace returns a new frame, so the column slice of df is never modified in
# place (the inplace form raised SettingWithCopyWarning).
df_reduce = df[["CHROM","BEG","END"]].rename(columns={'CHROM':'chrom', 'BEG':'start', 'END':'end'})


st = SegTool(df_reduce, sample='SCZ')

## Load cytoband annotations from the local gzipped table
df_cyto = pd.read_table('utils/cytoBand.txt.gz', names=['chrom','start','end', 'cyto','stain'], compression='gzip')
df_cyto.chrom = df_cyto.chrom.str.replace('chr','')
# The stain column is not needed for the annotation step
bed_cyto = pybed.BedTool.from_dataframe(df_cyto[['chrom','start','end','cyto',]])

## Annotate data with cytobands. 
st_cyto = st.annotate(bed_cyto)

def cyto_range(cyto_annot, chrom):
    """Collapse a comma-separated list of cytoband names into one range label.

    Parameters
    ----------
    cyto_annot : str
        Comma-separated band names, e.g. "p11.2,p11.1,q11.1". The first
        character of each name selects the arm: "p" goes to the p list,
        anything else is treated as q. The rest of the name is parsed
        with float().
    chrom : str or int
        Chromosome label, prefixed to the result as-is.

    Returns
    -------
    str
        "<chrom><start>-<end>", where
        * only q bands:  start = first q band, end = last q band
        * p and q bands: start = last p band,  end = last q band
        * only p bands:  start = last p band,  end = first p band

    Raises
    ------
    IndexError
        If any band name is an empty string (e.g. `cyto_annot == ""`).
    ValueError
        If the part after the arm letter is not a valid float.

    Notes
    -----
    Because band numbers go through float(), a band without a decimal part
    is printed with ".0" appended ("p21" -> "p21.0").
    """
    # Single pass, so bands are parsed (and fail) in input order.
    by_arm = {'p': [], 'q': []}
    for band in cyto_annot.split(','):
        arm = 'p' if band[0] == 'p' else 'q'
        by_arm[arm].append(float(band[1:]))
    p_bands, q_bands = by_arm['p'], by_arm['q']

    if not p_bands:
        start = "q{}".format(q_bands[0])
        end = "q{}".format(q_bands[-1])
    elif q_bands:
        start = "p{}".format(p_bands[-1])
        end = "q{}".format(q_bands[-1])
    else:
        start = "p{}".format(p_bands[-1])
        end = "p{}".format(p_bands[0])
    return "{}{}-{}".format(chrom,start,end)
    

# One range label per annotated segment, e.g. "14q11.2-q11.2".
cyto_lst = [cyto_range(annot, chrom)
            for chrom, annot in zip(st_cyto.seg['chrom'], st_cyto.seg['annotation'])]

# Every overlapping band prefixed with its chromosome, comma-joined.
cyto_lst2 = [",".join("{}{}".format(chrom, band) for band in annot.split(','))
             for chrom, annot in zip(st_cyto.seg['chrom'], st_cyto.seg['annotation'])]

# Assigned as a plain list, i.e. by row position.
# NOTE(review): this assumes st_cyto.seg has the same row order as df -- confirm.
df['CYTOBAND'] = cyto_lst

df_annot = df_reduce.copy()
df_annot['cyto'] = cyto_lst2

## Annotate with gene names
st = SegTool(df_reduce, sample='SCZ')
f_genes= "utils/hg19_genes.bed"
df_genes = pd.read_table(f_genes, names = ['chrom','start','end','gene'])
df_genes.chrom = df_genes.chrom.str.replace("chr","")
df_genes.start = df_genes.start.astype(int)
df_genes.end = df_genes.end.astype(int)

bed_genes = pybed.BedTool.from_dataframe(df_genes)

st_an = st.annotate(bed_genes)
df_annot['genes'] = st_an.seg['annotation']

# Per segment: keep the part of each annotation entry before the first "-",
# de-duplicate, sort, and join back with commas.
# NOTE(review): split('-')[0] also truncates hyphenated gene symbols
# (e.g. "HLA-A" -> "HLA") -- confirm this is intended.
gene_lst = [",".join(sorted({entry.split('-')[0] for entry in genes.split(',')}))
            for genes in df_annot['genes']]
df['GENES'] = gene_lst

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.seg['length'] = self.seg.end - self.seg.start
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = annotations


In [3]:
# Preview the first rows of df, which now carries the RETYPE, batch, CYTOBAND and GENES columns.
df.head()

Unnamed: 0,CHROM,BEG,END,sampleInfo,diagnosis,TYPE,RETYPE,batch,CYTOBAND,GENES
0,15,62049836,62116300,cas_scz_aber_eur_A6.0*Ab_PT-22LY_cas_scz_aber_...,SCZ,L,DEL,scz_aber_eur_A6.0,15q22.2-q22.2,
1,14,20823095,21391494,cas_scz_aber_eur_A6.0*Ab_PT-22PM_cas_scz_aber_...,SCZ,L,DEL,scz_aber_eur_A6.0,14q11.2-q11.2,"ANG,APEX1,EDDM3A,EDDM3B,KLHL33,OR6S1,OSGEP,PAR..."
2,1,192874024,192894788,cas_scz_aber_eur_A6.0*Ab_PT-22PM_cas_scz_aber_...,SCZ,L,DEL,scz_aber_eur_A6.0,1q31.2-q31.2,
3,2,43494221,43554973,cas_scz_aber_eur_A6.0*Ab_PT-22PM_cas_scz_aber_...,SCZ,L,DEL,scz_aber_eur_A6.0,2p21.0-p21.0,THADA
4,12,30422705,30590427,cas_scz_aber_eur_A6.0*Ab_PT-236W_cas_scz_aber_...,SCZ,L,DEL,scz_aber_eur_A6.0,12p11.22-p11.22,


In [4]:

######################################
###### LENGTH ANNOTATION
######################################

# Segment length as END - BEG, computed on the raw arrays.
df["LENGTH"] = df.END.values - df.BEG.values


############################################
##### Annotate number of genes affected
############################################

def _count_genes(genes):
    """Return the number of comma-separated names in `genes` as a float.

    Returns 0.0 if any of the names is the empty string, which covers the
    empty GENES value of a segment with no gene annotation.
    """
    names = genes.split(",")
    return 0.0 if '' in names else float(len(names))


# Built by row position, one value per row in row order. The previous loop
# used the index label from iterrows() as an array position, which is only
# valid while df has a default 0..n-1 index.
ngenes = np.array([_count_genes(genes) for genes in df.GENES], dtype=float)

df['number_genes'] = ngenes
# +1 keeps segments with zero genes finite (log(1) == 0).
df['log_number_genes'] = np.log(df.number_genes + 1)


In [5]:
## Keep only the batches whose name contains "omni" or "I650"; every other
## batch is dropped (those batches were not processed).
df.batch = df.batch.str.replace("con_","")
# .copy() makes df_filtered an independent frame, so the edits below neither
# raise SettingWithCopyWarning nor depend on view-vs-copy behaviour.
df_filtered = df[df.batch.str.contains("omni|I650")].copy()
# The array label is derived from the batch suffix, so it must be set before
# the suffix is stripped from the batch name.
# NOTE(review): batches tagged "I650" are labelled "Illum610K" -- confirm the label.
df_filtered["Array"] = "Illumina_Combined"
df_filtered.loc[df_filtered.batch.str.contains("I650"), "Array"] = "Illum610K"
df_filtered["batch"] = df_filtered.batch.str.replace("_I650","")
df_filtered["batch"] = df_filtered.batch.str.replace("_omni","")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [6]:
# Write the filtered, annotated call set as comma-separated text without the
# index column, then preview the first rows.
df_filtered.to_csv(
    "../data/metadata/PGC_SCZ_anno_callset_cnv.csv",
    index=False,
    sep=",",
)
df_filtered.head()

Unnamed: 0,CHROM,BEG,END,sampleInfo,diagnosis,TYPE,RETYPE,batch,CYTOBAND,GENES,LENGTH,number_genes,log_number_genes,Array
611,16,1807780,1841034,cas_scz_asrb_eur_I650*697_100864,SCZ,L,DEL,scz_asrb_eur,16p13.3-p13.3,"EME2,IGFALS,MAPK8IP3,MRPS34,NME3,NUBP2,SPSB3",33254,7.0,2.079442,Illum610K
612,10,68317083,68384105,cas_scz_asrb_eur_I650*576_100779,SCZ,L,DEL,scz_asrb_eur,10q21.3-q21.3,CTNNA3,67022,1.0,0.693147,Illum610K
613,11,134158745,134198173,cas_scz_asrb_eur_I650*580_100771,SCZ,L,DEL,scz_asrb_eur,11q25.0-q25.0,"GLB1L2,GLB1L3",39428,2.0,1.098612,Illum610K
614,2,52829458,52893166,cas_scz_asrb_eur_I650*715_100679,SCZ,L,DEL,scz_asrb_eur,2p16.3-p16.3,,63708,0.0,0.0,Illum610K
615,1,1210963,1288584,cas_scz_asrb_eur_I650*604_100385,SCZ,L,DEL,scz_asrb_eur,1p36.33-p36.33,"ACAP3,CPSF3L,DVL1,GLTPD1,MXRA8,PUSL1,SCNN1D,TA...",77621,9.0,2.302585,Illum610K


In [7]:

#####################################
## Finding the overlaps
####################################

## Change the cut-off value to generate the overlap percentage values needed for 
## Figure4C code in 6_Combined_Analysis notebook.
cutoff = 10.0

## read in filtered somatic variants

df_mcnv3 = pd.read_csv("../data/COMBINED_filtered_sCNV.csv")

# SCZ calls only, reduced to the columns written to the BED files, split by CNV type.
df_pgc_raw = df_filtered.copy()
df_pgc_raw = df_pgc_raw[df_pgc_raw.diagnosis=="SCZ"]
df_pgc_raw = df_pgc_raw[["CHROM","BEG","END","sampleInfo","RETYPE"]]
df_pgc_dup = df_pgc_raw[df_pgc_raw.RETYPE=="DUP"]
df_pgc_del = df_pgc_raw[df_pgc_raw.RETYPE=="DEL"]


def _frequent_calls(df_calls, min_overlaps, min_fraction=0.50):
    """Return the calls in `df_calls` that recur in more than `min_overlaps` other calls.

    For each distinct sampleInfo value, that sample's calls are compared
    against the calls of every other sample with `bedtools coverage`
    (`counts=True`, `f=min_fraction`), and rows whose overlap count is
    strictly greater than `min_overlaps` are kept.

    Parameters
    ----------
    df_calls : pandas.DataFrame
        Columns CHROM, BEG, END, sampleInfo, RETYPE, in that order.
    min_overlaps : float
        Threshold on the overlap count (exclusive).
    min_fraction : float
        Passed to bedtools as `-f`.

    Returns
    -------
    pandas.DataFrame
        The input columns plus "overlaps"; empty with those columns if
        `df_calls` has no samples.

    Prints the number of samples first, then a progress counter every 500 samples.
    """
    columns = ["CHROM","BEG","END","sampleInfo","RETYPE","overlaps"]
    samples = df_calls.sampleInfo.unique()
    print(len(samples))

    # Collect the per-sample results and concatenate once at the end. Growing
    # a DataFrame with pd.concat inside the loop re-copies it on every
    # iteration, and seeding it with an empty object-dtype frame makes the
    # result dtypes depend on the pandas version.
    kept = []
    for i, samp in enumerate(samples):
        if i % 500 == 0:
            print(i)
        is_samp = df_calls.sampleInfo == samp
        bed1 = pybed.BedTool.from_dataframe(df_calls[is_samp])
        bed2 = pybed.BedTool.from_dataframe(df_calls[~is_samp])
        bed_coverage = bed1.coverage(bed2, counts=True, f=min_fraction)
        df_coverage = bed_coverage.to_dataframe(names=columns, index_col=False)
        kept.append(df_coverage.loc[df_coverage.overlaps > min_overlaps])

    if not kept:
        return pd.DataFrame(columns=columns)
    return pd.concat(kept, ignore_index=True)


# DUP rows first, then DEL rows, as before.
df_pgc_frequent = pd.concat(
    [_frequent_calls(df_pgc_dup, cutoff), _frequent_calls(df_pgc_del, cutoff)],
    ignore_index=True,
)

print("==============================================================\n")
# Report the threshold actually used rather than a hard-coded ">5".
print("Fraction of gCNVs present in >{:g} in SCZ out of all the gCNVs \n".format(cutoff))
num_frequent = len(df_pgc_frequent)
# Guard against an empty SCZ call set.
fraction_frequent = num_frequent/len(df_pgc_raw) if len(df_pgc_raw) else float("nan")
print(num_frequent,"\n")
print(fraction_frequent,"\n")
print("==============================================================\n")





3946
0
500
1000
1500
2000
2500
3000
3500
3772
0
500
1000
1500
2000
2500
3000
3500

Fraction of gCNVs present in >5 in SCZ out of all the gCNVs 

4016 

0.34225328106357594 




In [8]:
## Overlaps with mCNVs from SCZ
# rename() without inplace returns a new frame, so the dtype fixes below write
# into df_counts itself rather than into a slice of df_mcnv3 (the inplace form
# raised SettingWithCopyWarning).
df_counts = df_mcnv3[(df_mcnv3.label == "SCZ")].rename(
    columns={'BEG_GRCh37':"BEG","END_GRCh37":"END"})
df_counts["BEG"] = df_counts.BEG.astype("int")
df_counts["END"] = df_counts.END.astype("int")


def _overlaps_with_frequent(df_somatic, df_frequent, min_fraction=0.50):
    """Return the rows of `df_somatic` overlapped by at least one row of `df_frequent`.

    Each sample's calls are compared against `df_frequent` with
    `bedtools coverage` (`counts=True`, `f=min_fraction`), and rows with an
    overlap count >= 1 are kept. Results are grouped by sample, in the order
    of `df_somatic.SAMPLE.unique()`.

    Parameters
    ----------
    df_somatic : pandas.DataFrame
        Columns CHROM, BEG, END, LENGTH, CYTOBAND, SAMPLE, in that order.
    df_frequent : pandas.DataFrame
        Calls to compare against; its first three columns are used as the
        BED coordinates.
    min_fraction : float
        Passed to bedtools as `-f`.

    Returns
    -------
    pandas.DataFrame
        The `df_somatic` columns plus "overlaps"; empty with those columns
        if `df_somatic` has no samples.

    Prints the number of samples first, then a progress counter every 500 samples.
    """
    columns = ["CHROM","BEG","END","LENGTH","CYTOBAND","SAMPLE","overlaps"]
    samples = df_somatic.SAMPLE.unique()
    print(len(samples))
    if len(samples) == 0:
        return pd.DataFrame(columns=columns)

    # The comparison set is the same for every sample, so build it once.
    bed2 = pybed.BedTool.from_dataframe(df_frequent)
    hits = []
    for i, samp in enumerate(samples):
        if i % 500 == 0:
            print(i)
        bed1 = pybed.BedTool.from_dataframe(df_somatic[df_somatic.SAMPLE == samp])
        bed_coverage = bed1.coverage(bed2, counts=True, f=min_fraction)
        df_coverage = bed_coverage.to_dataframe(names=columns, index_col=False)
        hits.append(df_coverage.loc[df_coverage.overlaps >= 1])
    return pd.concat(hits, ignore_index=True)


df_counts_dup = df_counts[df_counts.RETYPE == "DUP"]
df_counts_dup = df_counts_dup.loc[:,["CHROM","BEG","END","LENGTH","CYTOBAND","SAMPLE"]]

df_counts_del = df_counts[df_counts.RETYPE == "DEL"]
df_counts_del = df_counts_del.loc[:,["CHROM","BEG","END","LENGTH","CYTOBAND","SAMPLE"]]

# DUP results first, then DEL results, as before; each type is only compared
# against frequent gCNVs of the same type.
df_pgc_overlaps = pd.concat(
    [
        _overlaps_with_frequent(df_counts_dup, df_pgc_frequent[(df_pgc_frequent.RETYPE == "DUP")]),
        _overlaps_with_frequent(df_counts_del, df_pgc_frequent[(df_pgc_frequent.RETYPE == "DEL")]),
    ],
    ignore_index=True,
)

print("==============================================================\n")
# Report the threshold actually used (cutoff, set in the previous cell)
# rather than a hard-coded ">5".
print("Fraction of the gCNVs present in >{:g} SCZ patients, out of sCNVs in SCZ".format(cutoff))
num_overlaps = len(df_pgc_overlaps)
# Guard against an empty SCZ sCNV set.
fraction_overlaps = num_overlaps/len(df_counts) if len(df_counts) else float("nan")
print(num_overlaps, "\n")
print(fraction_overlaps, "\n")
print("==============================================================\n")

42
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


78
0

Fraction of the gCNVs present in >5 SCZ patients, out of sCNVs in SCZ
5 

0.03816793893129771 


