In [1]:
# Combines PennCNV calls from "PennCNV 2017 calls final.xlsx" (calls from Lucilla's paper)
# and "PennCNV calls final.xlsx" (new samples for the WGS paper),

# filters for size > 50 kbp,

# and filters out poor-quality samples.

In [2]:
import os

import pandas as pd

In [3]:
# Read the two PennCNV call workbooks into DataFrames.
# svs1: the 2017 calls; svs2: the newer calls.
svs1 = pd.read_excel(io="PennCNV 2017 calls final.xlsx")
svs2 = pd.read_excel(io="PennCNV calls final.xlsx")

In [4]:
# Relabel svs2's columns so they use the same names as svs1.
# The assignment is positional: the i-th existing column of svs2 receives the
# i-th name below, and pandas raises ValueError if the counts differ.
# NOTE(review): assumes the svs2 workbook has exactly these 16 columns in this
# order -- confirm against the spreadsheet.
cols = [
    'Chromosome',
    'Start',
    'End',
    'PatientID',
    'NumSNPs',
    'Length',
    'Type',
    'StartSNP',
    'EndSNP',
    'Control_freq',
    'Freq_label',
    '%CenTel',
    '%SD',
    'NEJM CNV',
    'Genes',
    'GeneList',
]
svs2.columns = cols

In [6]:
# Restrict both tables to the columns they have in common.
# The shared columns are listed in svs1's column order.
cols_keep = [name for name in svs1.columns if name in svs2.columns]

print(cols_keep)

# .copy() gives each table its own data instead of a view on the original.
svs1 = svs1.loc[:, cols_keep].copy()
svs2 = svs2.loc[:, cols_keep].copy()

['Chromosome', 'Start', 'End', 'NumSNPs', 'Length', 'Type', 'StartSNP', 'EndSNP', 'PatientID', '%SD', '%CenTel', 'Genes', 'GeneList']


In [7]:
# Stack the two call sets into one table (svs1 rows first, then svs2 rows).
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead; ignore_index=True yields a fresh 0..n-1 index,
# matching the previous append(...).reset_index(drop=True).
svs = pd.concat([svs1, svs2], ignore_index=True)

In [8]:
# Keep only calls whose Length is strictly greater than 50e3
# (rows with a missing Length are dropped, since the comparison is False).
svs = svs.loc[svs['Length'].gt(50e3)].copy()

In [9]:
# Drop every call that belongs to a sample flagged as poor quality.
poor_quality_samples = [
    # flagged in Lucilla's paper
    'SG002', 'SG006', 'SG014', 'SG015', 'SG023', 'SG025',
    'SG044', 'SG046', 'SG090', 'SG091', 'SG134', 'SG159',
    # flagged in the current batch
    'SG318',
]

svs = svs.loc[~svs['PatientID'].isin(poor_quality_samples)].copy()

In [10]:
# Order the calls by chromosome, then by start position within a chromosome.
# NOTE(review): if Chromosome is stored as text, the order is lexicographic
# rather than numeric -- confirm the column dtype.
svs = svs.sort_values(by=['Chromosome', 'Start'], ascending=True)

In [11]:
# save merged dataframe as a tab-separated file (row index not written)
# to_csv does not create missing parent directories, so make sure 'output/'
# exists first; exist_ok=True makes this a no-op when it is already there.
os.makedirs('output', exist_ok=True)
svs.to_csv('output/merged.PennCNV.tsv', sep='\t', index=False)