In [5]:
import pandas as pd
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
# Load Table S3B; later cells treat each row as one genomic window
# (columns used below: 'Chromosome', 'Start', 'VST_AZ_TXMXSA_CB').
df = pd.read_csv('../tables/TableS3B_VST.csv')

### Dropping windows that overlap transposons:

In [7]:
# Read the RepeatMasker GFF and keep six of its columns under readable names.
repeatmasker_path = '../ref/RepeatMasker/CpSilv_core_only.fasta.out.gff'
repeatmasker = pd.read_csv(repeatmasker_path, sep='\t', header=None, comment='#')
repeatmasker = repeatmasker[[0, 2, 3, 4, 6, 8]]
repeatmasker.columns = ['chrom', 'type', 'start', 'stop', 'direction', 'annotation']

# Records whose annotation contains a parenthesised run of capital letters or
# the text '-rich' are set aside as `nuc_repeats` (presumably simple-sequence /
# low-complexity repeats -- confirm); everything else is kept as `transposons`.
annotation = repeatmasker['annotation']
is_nuc_repeat = annotation.str.contains(r'\([A-Z]+\)') | annotation.str.contains('-rich')

nuc_repeats = repeatmasker[is_nuc_repeat]
transposons = repeatmasker[~is_nuc_repeat]

transposons

Unnamed: 0,chrom,type,start,stop,direction,annotation
1,CP075068.1,similarity,116,292,-,"Target ""Motif:DTX-incomp-chim_Blc56_Cpos_C735-..."
2,CP075068.1,similarity,157,1136,+,"Target ""Motif:RXX-TRIM_Blc207_Cpos_C735-L-B65-..."
3,CP075068.1,similarity,993,1672,-,"Target ""Motif:DTX-incomp-chim_Blc56_Cpos_C735-..."
5,CP075068.1,similarity,1708,9391,-,"Target ""Motif:DTX-incomp-chim_Blc56_Cpos_C735-..."
6,CP075068.1,similarity,8471,16405,-,"Target ""Motif:DTX-incomp-chim_Blc41_Cpos_C735-..."
...,...,...,...,...,...,...
11265,CP075072.1,similarity,1463409,1464429,-,"Target ""Motif:DTX-incomp-chim_Blc73_Superconti..."
11266,CP075072.1,similarity,1463830,1471523,-,"Target ""Motif:DTX-incomp_Blc54_Supercontig_3.5..."
11267,CP075072.1,similarity,1470023,1472092,+,"Target ""Motif:RXX-LARD_Blc184_Cpos_C735-L-B59-..."
11268,CP075072.1,similarity,1472093,1481680,+,"Target ""Motif:DTX-incomp-chim_Blc41_Cpos_C735-..."


In [8]:
def no_tx_overlap(window_start,chrom_transposon_intervals, window_size=250):
    """Return True when no interval overlaps the window, False otherwise.

    Parameters
    ----------
    window_start : number
        Left endpoint of the window.
    chrom_transposon_intervals : pd.IntervalIndex
        Intervals to test the window against.
    window_size : number, default 250
        Window width; the right endpoint is ``window_start + window_size``.

    Returns
    -------
    bool
        True if none of the intervals overlaps the window.

    Notes
    -----
    The window is built with ``pd.Interval``'s default closure (right-closed),
    so it covers ``(window_start, window_start + window_size]``.
    """
    window = pd.Interval(window_start, window_start + window_size)

    # .any() short-circuits on the boolean array instead of summing it
    # element by element with the Python built-in sum().
    return not chrom_transposon_intervals.overlaps(window).any()

In [10]:
# Keep only the windows of `df` that overlap no RepeatMasker record on their
# own chromosome.
# NOTE(review): the intervals below are built from `repeatmasker` (every
# record), not from the `transposons` subset computed earlier, which is not
# used in the cells visible here -- confirm which one is intended.
kept_windows = []

for chrom in sorted(df['Chromosome'].unique()):

    temp_df = df[df['Chromosome']==chrom]

    chrom_transposons = repeatmasker[repeatmasker['chrom'] == chrom]
    chrom_transposons_intervals = pd.IntervalIndex.from_arrays(chrom_transposons['start'],chrom_transposons['stop'])

    keep = temp_df['Start'].map(lambda x: no_tx_overlap(x, chrom_transposons_intervals))
    kept_windows.append(temp_df.loc[keep])

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
df_no_transposons = pd.concat(kept_windows) if kept_windows else pd.DataFrame()

In [11]:
# Summary statistics of VST_AZ_TXMXSA_CB over the retained windows
# (describe() ignores NaN, so `count` is the number of windows with a value).
df_no_transposons['VST_AZ_TXMXSA_CB'].describe()

count    6631.000000
mean        0.116935
std         0.145311
min         0.000000
25%         0.010345
50%         0.032549
75%         0.231250
max         0.760512
Name: VST_AZ_TXMXSA_CB, dtype: float64

In [12]:
# Number of retained windows, counting those with a NaN VST_AZ_TXMXSA_CB too.
df_no_transposons['VST_AZ_TXMXSA_CB'].shape

(89303,)

In [13]:
# Shape of the subset of retained windows whose VST_AZ_TXMXSA_CB is NaN.
df_no_transposons[df_no_transposons['VST_AZ_TXMXSA_CB'].isna()].shape

(82672, 45)

In [14]:
# Shape of the subset of retained windows that do have a VST_AZ_TXMXSA_CB value.
df_no_transposons[df_no_transposons['VST_AZ_TXMXSA_CB'].notna()].shape

(6631, 45)

In [15]:
# 95th and 99th percentiles of VST_AZ_TXMXSA_CB across the retained windows
# (Series.quantile skips NaN values).
vst_values = df_no_transposons['VST_AZ_TXMXSA_CB']
pct95 = vst_values.quantile(.95)
pct99 = vst_values.quantile(.99)
pct95, pct99

(0.4454228243021346, 0.5910098522167488)

### Of the 68 windows at or above the 99th percentile of VST, 35 lie in the chromosome 3 CNV region.

In [16]:
# Bounds of the region examined on CP075070.1; `stop` is exclusive, matching
# the range(start, stop) membership test this replaces.
start, stop = 1237751, 1428001

# Plain bound comparisons select the same rows as isin(range(start, stop)) for
# integer 'Start' values, without materialising ~190k candidate positions.
in_chr3_region = (
    (df_no_transposons['Chromosome'] == 'CP075070.1')
    & (df_no_transposons['Start'] >= start)
    & (df_no_transposons['Start'] < stop)
)

chr3_high_VST_windows = df_no_transposons[
    (df_no_transposons['VST_AZ_TXMXSA_CB'] >= pct99) & in_chr3_region
]

chr3_high_VST_windows.shape

(35, 45)

In [18]:
# All retained windows at or above the 99th-percentile value, genome-wide.
is_high_vst = df_no_transposons['VST_AZ_TXMXSA_CB'].ge(pct99)
total_high_VST_windows = df_no_transposons[is_high_vst]
total_high_VST_windows.shape

(68, 45)

In [19]:
# Every retained window on CP075070.1 with start <= Start < stop, regardless of
# its VST value. Bound comparisons pick the same rows as
# isin(range(start, stop)) for integer 'Start' values.
chr3_region_all_windows = df_no_transposons[
    (df_no_transposons['Chromosome'] == 'CP075070.1')
    & (df_no_transposons['Start'] >= start)
    & (df_no_transposons['Start'] < stop)
]

chr3_region_total_windows = len(chr3_region_all_windows)

In [20]:
# Display the region's windows. This is the same selection already stored in
# `chr3_region_all_windows` by the preceding cell, so reuse it rather than
# repeating the filter expression.
chr3_region_all_windows

Unnamed: 0,Chromosome,Start,VST_AZ_TXMXSA_CB,gene,Tucson_9,Phoenix_7,Tucson_21,Phoenix_2,Tucson_19,Phoenix_9,...,4545-MICE_Venezuela,3796_Venezuela,Nuevo_Leon_1,730334_Guatemala,Nuevo_Leon_2,San_Antonio_1,GT002_Texas,Coahuila_2,Sonora_1,B5773_Brazil
73248,CP075070.1,1237751,0.279243,,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
73249,CP075070.1,1238001,0.279243,D8B26_005343,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
73250,CP075070.1,1238251,0.279243,D8B26_005343,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
73251,CP075070.1,1238501,0.279243,D8B26_005343,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
73252,CP075070.1,1238751,0.282228,D8B26_005343,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74010,CP075070.1,1426751,0.579841,D8B26_005419,0,0,0,0,0,0,...,3,2,0,0,1,0,0,0,1,0
74011,CP075070.1,1427001,0.579841,D8B26_005420,0,0,0,0,0,0,...,3,2,0,0,1,0,0,0,1,0
74012,CP075070.1,1427251,0.412503,D8B26_005420,0,0,0,0,0,0,...,3,2,0,0,1,1,0,1,1,0
74013,CP075070.1,1427501,0.412503,D8B26_005420,0,0,0,0,0,0,...,3,2,0,0,1,1,0,1,1,0


In [21]:
# Difference between the largest and smallest 'Start' among the retained
# windows of the chr3 region, i.e. how far apart its outermost windows begin.
max_distance = chr3_region_all_windows['Start'].max() - chr3_region_all_windows['Start'].min()
max_distance

190000

### Calculating p value for Chromosome 3 CNV region

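Randomly assign 68 windows to be "high VST" (the number observed at or above the 99th percentile). In how many of 10,000 random draws do 35 or more land on the same chromosome? Answer: 1 in 10,000. Even in that draw the windows are spread throughout the chromosome, not concentrated in a 190 kb region as they are at the chr3 CNV locus.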

In [22]:
# Permutation test: draw as many windows at random as there are high-VST
# windows and count how often at least `threshold` of them fall on a single
# chromosome.
# NOTE(review): the draw is taken from every row of `df_no_transposons`,
# including rows whose VST_AZ_TXMXSA_CB is NaN -- confirm that this is the
# intended sampling universe.
np.random.seed(11)

n = 10000
num_high_VST_windows = len(total_high_VST_windows)
threshold = len(chr3_high_VST_windows)
window_size=250
max_distance = chr3_region_all_windows['Start'].max() - chr3_region_all_windows['Start'].min()
count = 0

# Loop-invariant: the chromosome list does not change between iterations.
chromosomes = sorted(df_no_transposons['Chromosome'].unique())

# A named loop variable is used because the index is actually read, and
# assigning to `_` would shadow IPython's last-output variable.
for iteration in range(n):

    if iteration % 1000 == 0:
        print(iteration)

    randomized_high_VST_windows = df_no_transposons.sample(num_high_VST_windows, replace=False)

    # One tally per draw instead of filtering the frame once per chromosome.
    windows_per_chrom = randomized_high_VST_windows['Chromosome'].value_counts()

    for chrom in chromosomes:

        n_on_chrom = windows_per_chrom.get(chrom, 0)

        if n_on_chrom < threshold:
            continue

        print(n_on_chrom, chrom)
        # Keep the most recent qualifying draw around for inspection.
        s = randomized_high_VST_windows[randomized_high_VST_windows['Chromosome']==chrom]
        t = randomized_high_VST_windows
        count+=1


count

0
35 CP075068.1
1000
2000
3000
4000
5000
6000
7000
8000
9000


1