In [None]:
#Use open2c_env.yml to create the conda environment for this script
#This is an example script for calculating the intermingling metric (IM) for real and permuted walks
#It is run from within the 'scripts' subdirectory, using the following directory structure:
#Analysis_Dir
#├── data
#    ├── permutations
#├── alignments
#├── figures
#├── scripts
#├── lsf_jobs

In [None]:
import bioframe
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib.gridspec import GridSpecFromSubplotSpec
import matplotlib.colors as colors
from matplotlib.colors import ListedColormap
import random
import seaborn as sns
import scipy
import pickle
from numpy import diff

from pandas import read_csv
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

%matplotlib inline

In [None]:
#Short sample identifiers: every time point/treatment for replicate R1, then R2, then R3
conditions = [
    f'{timepoint}_{replicate}'
    for replicate in ('R1', 'R2', 'R3')
    for timepoint in ('t0Mit', 't2', 't4DMSO', 't4ICRF', 't8DMSO', 't8ICRF')
]

#Short sample identifier -> long library name; each replicate carries its own library tag
long_names = {
    f'{timepoint}_{replicate}': f'TI-MC3C-Dpn-{timepoint}-{library_tag}'
    for replicate, library_tag in (('R1', '4-30'), ('R2', '4-39'), ('R3', 'R3-5-14'))
    for timepoint in ('t0Mit', 't2', 't4DMSO', 't4ICRF', 't8DMSO', 't8ICRF')
}


In [None]:
#Plot colour per sample; the colour depends only on the time point/treatment, not the replicate
sampleColors = {
    f'{timepoint}_{replicate}': colour
    for replicate in ('R1', 'R2', 'R3')
    for timepoint, colour in (
        ('t0Mit', '#878787'),
        ('t2', '#E1B7A3'),
        ('t4DMSO', '#17BECF'),
        ('t4ICRF', '#D62728'),
        ('t8DMSO', '#0D6871'),
        ('t8ICRF', '#751616'),
    )
}

#Line style per sample; the style depends only on the replicate
sampleLineStyles = {
    f'{timepoint}_{replicate}': linestyle
    for replicate, linestyle in (('R1', '-'), ('R2', '--'), ('R3', ':'))
    for timepoint in ('t0Mit', 't2', 't4DMSO', 't4ICRF', 't8DMSO', 't8ICRF')
}

#Display name per sample; R1 names carry no replicate suffix, R2 and R3 names do
samplePlotNames = {
    f'{timepoint}_{replicate}': f'{plot_name}{replicate_suffix}'
    for replicate, replicate_suffix in (('R1', ''), ('R2', ', R2'), ('R3', ', R3'))
    for timepoint, plot_name in (
        ('t0Mit', 't0 Mit'),
        ('t2', 't2'),
        ('t4DMSO', 't4 DMSO'),
        ('t4ICRF', 't4 ICRF-193'),
        ('t8DMSO', 't8 DMSO'),
        ('t8ICRF', 't8 ICRF-193'),
    )
}

In [None]:
#Paired control/treatment comparisons: ctrlconds[k] is the control for treatconds[k].
#Both lists are generated from one (control, treatment) template applied to each replicate,
#so a comparison can never pair samples from different replicates. The hand-written
#lists paired 't8ICRF_R3' with 't8DMSO_R2'; every other pair stayed within one replicate,
#so that entry is treated as a copy/paste slip and is now 't8DMSO_R3'.
_paired_conditions = [
    (f'{control}_{replicate}', f'{treatment}_{replicate}')
    for replicate in ('R1', 'R2', 'R3')
    for control, treatment in (
        ('t8DMSO', 't0Mit'),
        ('t8DMSO', 't2'),
        ('t8DMSO', 't4DMSO'),
        ('t8DMSO', 't4ICRF'),
        ('t8DMSO', 't8ICRF'),
        ('t4DMSO', 't4ICRF'),
    )
]

ctrlconds = [control for control, _treatment in _paired_conditions]

treatconds = [treatment for _control, treatment in _paired_conditions]

In [None]:
#Sample identifier -> replicate name
repdict = {
    f'{timepoint}_{replicate}': replicate
    for replicate in ('R1', 'R2', 'R3')
    for timepoint in ('t0Mit', 't2', 't4DMSO', 't4ICRF', 't8DMSO', 't8ICRF')
}

#Sample identifier -> replicate-independent label
labeldict = {
    f'{timepoint}_{replicate}': label
    for replicate in ('R1', 'R2', 'R3')
    for timepoint, label in (
        ('t0Mit', 't0 Prometa'),
        ('t2', 't2 Ana/Telo'),
        ('t4DMSO', 't4 G1 DMSO'),
        ('t4ICRF', 't4 G1 ICRF-193'),
        ('t8DMSO', 't8 G1 DMSO'),
        ('t8ICRF', 't8 G1 ICRF-193'),
    )
}

#Label -> plot colour
labelPlotColors = dict([
    ('t0 Prometa', '#878787'),
    ('t2 Ana/Telo', '#E1B7A3'),
    ('t4 G1 DMSO', '#17BECF'),
    ('t4 G1 ICRF-193', '#D62728'),
    ('t8 G1 DMSO', '#0D6871'),
    ('t8 G1 ICRF-193', '#751616'),
])


In [None]:
#Walks can be separated by compartment type (A vs B), or by number of chromosomes visited
comp_types = 'A B AB'.split()
#Chromosomes kept when filtering walks on their 'chrom' column
good_chroms = [f'chr{number}' for number in (4, 14, 17, 18, 20, 21)]

In [None]:
#Analysis root directory, relative to the 'scripts' subdirectory; inputs and outputs live under {outDataDir}/data
outDataDir = ".."

In [None]:
#Read in pickled files
#NOTE: pickle.load can execute arbitrary code, so only load pickles produced by this analysis itself.
#The context managers close each file handle even if unpickling raises.

with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_with_summary_firstx_length_fractions_dict_100kbEigs.pkl', 'rb') as f:
    real_walks_with_summary_firstx = pickle.load(f)

with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_filtered_summarized_firstx_length_dict_100kbEigs.pkl', 'rb') as f:
    real_walks_summarized_firstx = pickle.load(f)

In [None]:
#combining technical replicates T1 and T2 reads for t0 Mit R1
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0;
#with ignore_index=True it yields the same stacked frame with a fresh 0..n-1 index.
real_walks_with_summary_firstx['t0Mit_R1'] = {}
for fragnum in range(2, 11):
    real_walks_with_summary_firstx['t0Mit_R1'][f'length_{fragnum}'] = pd.concat(
        [
            real_walks_with_summary_firstx['t0Mit_R1_T1'][f'length_{fragnum}'],
            real_walks_with_summary_firstx['t0Mit_R1_T2'][f'length_{fragnum}'],
        ],
        ignore_index = True
    )

In [None]:
#combining T1 and T2 reads for t0 Mit R1 (summarized walks)
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0;
#with ignore_index=True it yields the same stacked frame with a fresh 0..n-1 index.
real_walks_summarized_firstx['t0Mit_R1'] = {}
for fragnum in range(2, 11):
    real_walks_summarized_firstx['t0Mit_R1'][f'length_{fragnum}'] = pd.concat(
        [
            real_walks_summarized_firstx['t0Mit_R1_T1'][f'length_{fragnum}'],
            real_walks_summarized_firstx['t0Mit_R1_T2'][f'length_{fragnum}'],
        ],
        ignore_index = True
    )

In [None]:
#Calculate intermingling metric for real walks, first 6 steps, with QC filters

In [None]:
#Parameter sweep of window size, keep 1Mb window size step size for all. 

In [None]:
#Plotting entanglement for largest step - sliding window analysis

In [None]:
#Sliding-window sweep of the intermingling metric for real walks ('length_6' tables).
#For every window size, windows slide over 'Largest_Step' from start_dist towards end_dist in
#window_step increments. Each window collects 'Inter_Largest_Step_Side_Step_Num' for the walks that
#pass the filters below, once for all compartment types together ('All') and once per entry of
#comp_types. One dataframe per window size is stored in Intermingling_Sliding_Window_Sweep under
#the key f'{window_size}' (the float's string form, e.g. '2000000.0').
Intermingling_Sliding_Window_Sweep = {}

start_dist = 0 #start of first window
end_dist = 3e7 #end of last window
window_step = 1e6 #step between consecutive window starts, identical for every window size

#The per-condition filters do not depend on the window, so they are computed once per condition
#here rather than once per (window size, window, condition).
qc_walks_by_cond = {}
for cond in conditions:
    df = real_walks_with_summary_firstx[cond]['length_6']
    #NOTE: this adds a column to the stored dataframe in place, as the previous version of this cell did
    df['Query_Fragment_Length'] = df['Query_End'] - df['Query_Start']
    grouped_walks = df.groupby('Query_Name')

    #mapq filter: the minimum Mapping_Quality over a walk's rows must exceed 59
    walks_min_mapq = grouped_walks['Mapping_Quality'].min()
    good_walks_mapq = walks_min_mapq[walks_min_mapq > 59]

    #fraction-mapped filter: summed Match_Length over summed Query_Fragment_Length must exceed 0.8
    walks_frac_map = grouped_walks[['Match_Length', 'Query_Fragment_Length']].sum()
    walks_high_frac_map = walks_frac_map[
        (walks_frac_map['Match_Length']/walks_frac_map['Query_Fragment_Length']) > 0.8]

    df2 = real_walks_summarized_firstx[cond]['length_6']
    #.loc returns a new frame, so the stored summary table is never modified;
    #'Largest_Step' and 'Walk_Comp_Type' are kept only for the window/compartment selection below
    qc_walks_by_cond[cond] = df2.loc[
        (df2['Chrom_Number'] == 1) &
        (df2['Query_Name'].isin(good_walks_mapq.index)) &
        (df2['Query_Name'].isin(walks_high_frac_map.index)) &
        (df2['chrom'].isin(good_chroms)) &
        (df2['Near_Largest_Step_Either_Side_Frag_Num'] == 6) &
        (df2['Max_NearOneLargestStepEnd_FracOfFragments'] == 5/6),
        ['Inter_Largest_Step_Side_Step_Num', 'Largest_Step', 'Walk_Comp_Type']
    ]

for window_size in [2e6, 4e6, 6e6, 8e6, 1e7, 1.2e7, 1.4e7, 1.6e7]:
    #collect the per-window pieces and concatenate once; DataFrame.append was removed in
    #pandas 2.0 and growing a frame inside the loop copies it on every iteration
    window_frames = []

    #set up windows
    for start_bp in range(int(start_dist), int(end_dist-window_size), int(window_step)):
        end_bp = start_bp + int(window_size)
        window_midpoint = (start_bp + end_bp)//2

        #iterate through conditions for each window
        for cond in conditions:
            qc_walks = qc_walks_by_cond[cond]
            in_window = qc_walks[
                (qc_walks['Largest_Step'] >= start_bp) &
                (qc_walks['Largest_Step'] < end_bp)
            ]

            #'All' pools the A, B and AB walks; it is followed by one selection per compartment type
            comp_selections = [('All', in_window['Walk_Comp_Type'].isin(['A', 'B', 'AB']))]
            comp_selections += [(comp, in_window['Walk_Comp_Type'] == comp) for comp in comp_types]

            for comp_label, comp_mask in comp_selections:
                window_frames.append(
                    in_window.loc[comp_mask, ['Inter_Largest_Step_Side_Step_Num']].assign(
                        Condition = cond,
                        Label = labeldict[cond],
                        Window_Midpoint = window_midpoint,
                        Walk_Comp_Type = comp_label,
                        Replicate = repdict[cond],
                    )
                )

    #pd.concat raises on an empty list, so fall back to an empty frame when no window was generated
    if window_frames:
        Intermingling_Sliding_Window = pd.concat(window_frames, ignore_index = True)
    else:
        Intermingling_Sliding_Window = pd.DataFrame()

    Intermingling_Sliding_Window_Sweep[f'{window_size}'] = Intermingling_Sliding_Window

In [None]:
#save intermingling results as pickle file for later use since takes a long time to run
#the context manager flushes and closes the file even if pickling raises part-way through
with open(f'{outDataDir}/data/220518_MRICRF_R1R2R3_InterminglingSlidingWindowSweep.pkl', 'wb') as f:
    pickle.dump(Intermingling_Sliding_Window_Sweep, f)

In [None]:
#permutations - calculate intermingling metric

In [None]:
#submit permutations for sliding window sweep intermingling analysis

In [None]:
#Permutation iteration indices, one submitted job per index and window size
iterations = range(0, 100)
#Window sizes (bp) for the permutation sweep, 4 Mb to 16 Mb in 2 Mb increments
#NOTE(review): the real-walk sweep also uses a 2e6 window that is not listed here - confirm the omission is intended
window_sizes = [megabases * 1e6 for megabases in (4, 6, 8, 10, 12, 14, 16)]

In [None]:
#Submit one LSF job (bsub) for every (iteration, window size) pair.
#Each job runs 220517_MRICRF_Intermingling_Permutations.py with three positional arguments:
#the iteration index, outDataDir, and the window size. IPython substitutes $i, $outDataDir
#and $window with the Python values before the shell sees the command; window is a float,
#so the script presumably receives it as e.g. '4000000.0' - verify against the script's argument parsing.
#Job stderr/stdout go to ../lsf_jobs/LSB_<jobid>.err/.log (%J is LSF's job-ID placeholder).
#The script name is given without a path, so this cell relies on running from the 'scripts' subdirectory.
for i in iterations:
    for window in window_sizes:
        !bsub -q short -W 00:20 -e ../lsf_jobs/LSB_%J.err -o ../lsf_jobs/LSB_%J.log \
            -n 1 -R span[hosts=1] -R select[ib] -R rusage[mem=3000] -R select[rh=8] \
            "python 220517_MRICRF_Intermingling_Permutations.py $i $outDataDir $window"