# Check Copy Number Variant Calls 

This notebook uses CNVnator output, generated with the `CNVcall.sh` script, to screen for copy number variants across the time series of the experimental samples.

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
%matplotlib inline

In [91]:
# Directory holding the CNVnator call files (one '<sample>_CNVcalls.tsv' per sample).
# Set the CNV_DIR environment variable to point the notebook at another location;
# otherwise the original location is used.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file names with plain string concatenation (path + file name).
path = os.path.join(
    os.environ.get('CNV_DIR', '/Users/chrisgraves/Documents/Yeast_data/Sequencing/alignments/CNV/'),
    '',
)

# List all CNV call files; sorted so the listing is the same on every run
# (os.listdir returns entries in arbitrary order).
sorted(os.listdir(path))

['C1-12_CNVcalls.tsv',
 'C1-1_CNVcalls.tsv',
 'C1-3_CNVcalls.tsv',
 'C1-5_CNVcalls.tsv',
 'C1-7_CNVcalls.tsv',
 'C1-9_CNVcalls.tsv',
 'C10-12_CNVcalls.tsv',
 'C10-1_CNVcalls.tsv',
 'C10-3_CNVcalls.tsv',
 'C10-5_CNVcalls.tsv',
 'C10-7_CNVcalls.tsv',
 'C10-9_CNVcalls.tsv',
 'C2-12_CNVcalls.tsv',
 'C2-1_CNVcalls.tsv',
 'C2-3_CNVcalls.tsv',
 'C2-5_CNVcalls.tsv',
 'C2-7_CNVcalls.tsv',
 'C2-9_CNVcalls.tsv',
 'C3-12_CNVcalls.tsv',
 'C3-1_CNVcalls.tsv',
 'C3-3_CNVcalls.tsv',
 'C3-5_CNVcalls.tsv',
 'C3-7_CNVcalls.tsv',
 'C3-9_CNVcalls.tsv',
 'C4-12_CNVcalls.tsv',
 'C4-1_CNVcalls.tsv',
 'C4-3_CNVcalls.tsv',
 'C4-5_CNVcalls.tsv',
 'C4-7_CNVcalls.tsv',
 'C4-9_CNVcalls.tsv',
 'C5-12_CNVcalls.tsv',
 'C5-1_CNVcalls.tsv',
 'C5-3_CNVcalls.tsv',
 'C5-5_CNVcalls.tsv',
 'C5-7_CNVcalls.tsv',
 'C5-9_CNVcalls.tsv',
 'C6-12_CNVcalls.tsv',
 'C6-1_CNVcalls.tsv',
 'C6-3_CNVcalls.tsv',
 'C6-5_CNVcalls.tsv',
 'C6-7_CNVcalls.tsv',
 'C6-9_CNVcalls.tsv',
 'C7-12_CNVcalls.tsv',
 'C7-1_CNVcalls.tsv',
 'C7-3_CNVcalls.ts

## Load data from ancestor

In [93]:
# Load raw data from ancestor.
# The file has no header row, so the columns are numbered 0..8 at this point.
ancestor = pd.read_csv(os.path.join(path, 'P2-0_CNVcalls.tsv'), sep='\t', header=None)

# Preview the first rows. (`shape` is a tuple attribute, not a method, so
# calling `ancestor.shape()` raises TypeError.)
ancestor.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,duplication,ref|NC_001133|:1-31400,31400,1.55585,0,2435550000.0,0,2461200000.0,1
1,duplication,ref|NC_001133|:166001-204600,38600,1.28734,0,2787520.0,0,3993220.0,1
2,duplication,ref|NC_001133|:205601-230000,24400,1.44584,0,2526510000.0,0,2553120000.0,1
3,duplication,ref|NC_001134|:1-34200,34200,1.53124,0,2044050000.0,0,2085070000.0,1
4,duplication,ref|NC_001134|:762801-813100,50300,1.4902,0,105.261,0,207.932,1


In [94]:
# Give the nine unnamed columns of the ancestor table descriptive labels
cnv_columns = [
    'CNV_type', 'Index', 'Length', 'Norm_RD',
    'e1', 'e2', 'e3', 'e4',
    'q0',
]
ancestor.columns = cnv_columns
ancestor.head()

Unnamed: 0,CNV_type,Index,Length,Norm_RD,e1,e2,e3,e4,q0
0,duplication,ref|NC_001133|:1-31400,31400,1.55585,0,2435550000.0,0,2461200000.0,1
1,duplication,ref|NC_001133|:166001-204600,38600,1.28734,0,2787520.0,0,3993220.0,1
2,duplication,ref|NC_001133|:205601-230000,24400,1.44584,0,2526510000.0,0,2553120000.0,1
3,duplication,ref|NC_001134|:1-34200,34200,1.53124,0,2044050000.0,0,2085070000.0,1
4,duplication,ref|NC_001134|:762801-813100,50300,1.4902,0,105.261,0,207.932,1


In [95]:
# Lookup table translating the reference sequence identifiers found in the
# 'Index' field into chromosome labels
chrom_key = [
    'ref|NC_001133|', 'ref|NC_001134|', 'ref|NC_001135|', 'ref|NC_001136|',
    'ref|NC_001137|', 'ref|NC_001138|', 'ref|NC_001139|', 'ref|NC_001140|',
    'ref|NC_001141|', 'ref|NC_001142|', 'ref|NC_001143|', 'ref|NC_001144|',
    'ref|NC_001145|', 'ref|NC_001146|', 'ref|NC_001147|', 'ref|NC_001148|',
    'ref|NC_001224|',
]
chrom_num = [
    'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI',
    'MITO',
]
chrom_dict = {key: label for key, label in zip(chrom_key, chrom_num)}

def parse_chrom(index):
    """Return the chromosome label for an index string.

    The text before the first ':' of `index` is looked up in `chrom_dict`;
    an identifier that is not in the table raises KeyError.
    """
    identifier, _, _ = index.partition(':')
    return chrom_dict[identifier]

In [96]:
# Sanity check of the two-step split used to pull the end position out of an
# index string: first take the part after ':', then the part after '-'
test = 'ref|NC_001133|:1-23000'
region = test.split(':')[1]
region.split('-')[1]

'23000'

In [97]:
# Break the 'Index' field ('<identifier>:<start>-<end>') into chromosome,
# start position and end position columns
def _region_bounds(index):
    """Return the pieces of the text after ':' in `index`, split on '-'."""
    return index.split(':')[1].split('-')

ancestor['Chrom'] = ancestor['Index'].apply(parse_chrom)
# Start and End are stored as the split strings (no numeric conversion)
ancestor['Start'] = ancestor['Index'].apply(lambda idx: _region_bounds(idx)[0])
ancestor['End'] = ancestor['Index'].apply(lambda idx: _region_bounds(idx)[1])
ancestor.head()

Unnamed: 0,CNV_type,Index,Length,Norm_RD,e1,e2,e3,e4,q0,Chrom,Start,End
0,duplication,ref|NC_001133|:1-31400,31400,1.55585,0,2435550000.0,0,2461200000.0,1,I,1,31400
1,duplication,ref|NC_001133|:166001-204600,38600,1.28734,0,2787520.0,0,3993220.0,1,I,166001,204600
2,duplication,ref|NC_001133|:205601-230000,24400,1.44584,0,2526510000.0,0,2553120000.0,1,I,205601,230000
3,duplication,ref|NC_001134|:1-34200,34200,1.53124,0,2044050000.0,0,2085070000.0,1,II,1,34200
4,duplication,ref|NC_001134|:762801-813100,50300,1.4902,0,105.261,0,207.932,1,II,762801,813100


## Load and combine all data from experimental samples

In [106]:
# Build one long table holding the CNV calls of every experimental sample.
# Sample files are named '<Treatment><Strain>-<Time>_CNVcalls.tsv',
# e.g. 'C1-12_CNVcalls.tsv' -> Treatment 'C', Strain 1, Time 12.
files = os.listdir(path)

cols = list(ancestor.columns)
cols.extend(['Treatment','Strain','Time'])

frames = []    # one parsed table per sample file
skipped = []   # (file name, reason) for every file that could not be loaded

for f in files:
    # experimental samples are the files whose name starts with 'C' or 'H'
    if not f.startswith(('C', 'H')):
        continue

    try:
        temp = pd.read_csv(os.path.join(path, f), sep='\t', header=None)
        temp.columns = ['CNV_type','Index','Length','Norm_RD','e1','e2','e3','e4','q0']
        temp['Chrom'] = temp['Index'].apply(parse_chrom)
        temp['Start'] = temp['Index'].apply(lambda x: x.split(':')[1].split('-')[0])
        temp['End'] = temp['Index'].apply(lambda x: x.split(':')[1].split('-')[1])
        split_ID = f.split('-')
        sample = split_ID[0]
        time = split_ID[1].split('_')[0]
        temp['Treatment'] = sample[0]
        temp['Strain'] = int(sample[1:])
        temp['Time'] = int(time)
    except ValueError as err:
        # Best effort: a file that cannot be turned into a table (for example an
        # empty file, an unexpected number of columns, or a non-numeric
        # strain/time in its name) is left out, but remembered so that the
        # omission is visible below instead of silent.
        skipped.append((f, err))
        continue

    frames.append(temp)

# Concatenate once instead of growing the table inside the loop.
# The empty frame is only used when no sample file could be loaded.
if frames:
    df = pd.concat(frames)
else:
    df = pd.DataFrame(columns = cols)

for name, reason in skipped:
    print('skipped {}: {}'.format(name, reason))

print(df.shape)
df.head()

(9560, 15)


Unnamed: 0,CNV_type,Index,Length,Norm_RD,e1,e2,e3,e4,q0,Chrom,Start,End,Treatment,Strain,Time
0,duplication,ref|NC_001133|:1-6700,6700,1.84396,8.46815e-09,274087000.0,3.3909e-08,207892.0,1,I,1,6700,C,1,12
1,duplication,ref|NC_001133|:6901-51700,44800,1.56938,0.0,1070780000.0,0.0,1118980000.0,1,I,6901,51700,C,1,12
2,duplication,ref|NC_001133|:163901-204400,40500,1.41267,0.0,7.40054e-06,0.0,3.88773e-05,1,I,163901,204400,C,1,12
3,duplication,ref|NC_001133|:205701-230300,24600,1.61279,0.0,1670450000.0,0.0,1745640000.0,1,I,205701,230300,C,1,12
4,duplication,ref|NC_001134|:1-96300,96300,1.46712,0.0,11960000.0,0.0,13401900.0,1,II,1,96300,C,1,12


In [108]:
# Keep only the calls recorded at time point 12
is_endpoint = df['Time'].eq(12)
endpoints = df.loc[is_endpoint]
print(endpoints.shape)
endpoints.head()

(1518, 15)


Unnamed: 0,CNV_type,Index,Length,Norm_RD,e1,e2,e3,e4,q0,Chrom,Start,End,Treatment,Strain,Time
0,duplication,ref|NC_001133|:1-6700,6700,1.84396,8.46815e-09,274087000.0,3.3909e-08,207892.0,1,I,1,6700,C,1,12
1,duplication,ref|NC_001133|:6901-51700,44800,1.56938,0.0,1070780000.0,0.0,1118980000.0,1,I,6901,51700,C,1,12
2,duplication,ref|NC_001133|:163901-204400,40500,1.41267,0.0,7.40054e-06,0.0,3.88773e-05,1,I,163901,204400,C,1,12
3,duplication,ref|NC_001133|:205701-230300,24600,1.61279,0.0,1670450000.0,0.0,1745640000.0,1,I,205701,230300,C,1,12
4,duplication,ref|NC_001134|:1-96300,96300,1.46712,0.0,11960000.0,0.0,13401900.0,1,II,1,96300,C,1,12


## Remove rows that were found in ancestor

In [110]:
# (rows, columns) of the ancestor table: one row per CNV call
ancestor.shape

(82, 12)

In [69]:
# Scratch check: an empty frame with the ancestor columns plus the
# per-sample 'Treatment' and 'Strain' fields
cols = list(ancestor.columns) + ['Treatment','Strain']

test = pd.DataFrame(columns = cols)
test

Unnamed: 0,CNV_type,Index,Length,Norm_RD,e1,e2,e3,e4,q0,Chrom,Start,End,Treatment,Strain
