In [None]:
#Use conda env minimap_conda_env.yml to create conda env for this script
#This is an example script of processing mapped MC-3C data for further analysis
#It is run from within the 'scripts' subdirectory, using the following directory structure:
#Analysis_Dir
#├── data
#    ├── permutations
#├── alignments
#├── figures
#├── scripts
#├── lsf_jobs

In [None]:
from collections import Counter
import itertools
import sys
from collections import OrderedDict
import bioframe
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib.gridspec import GridSpecFromSubplotSpec
import matplotlib.colors as colors
from matplotlib.colors import ListedColormap
import random
import seaborn as sns
import scipy
import pickle

In [None]:
# Input/output locations used by the cells below. The right-hand sides are
# comment placeholders only, so this cell is a SyntaxError until each one is
# replaced with a real path string. Later cells build paths as
# f'{dir}/<file name>', so give the directories without a trailing slash.
#   alignmentDir       - directory holding the '<long name>.hg38.minimap2.output.paf' files
#   outDataDir         - base directory for saved output (pickles go to '<outDataDir>/data/')
#   compartmentDataDir - directory holding the '...eigs.cis.vecs.txt' eigenvector tables
alignmentDir = #path_to_alignments
outDataDir = #out_data_path
compartmentDataDir = #path_to_compartment_eigenvector

#### This section (below) will change depending on specific samples ####
# Short condition label -> library name used as the prefix of its alignment file.
# The insertion order here is the order in which conditions are processed.
long_names = {
    't0Mit_R1_T1': 'TI-MC3C-Dpn-t0Mit-4-30',
    't2_R1': 'TI-MC3C-Dpn-t2-4-30',
    't4DMSO_R1': 'TI-MC3C-Dpn-t4DMSO-4-30',
    't4ICRF_R1': 'TI-MC3C-Dpn-t4ICRF-4-30',
    't8DMSO_R1': 'TI-MC3C-Dpn-t8DMSO-4-30',
    't8ICRF_R1': 'TI-MC3C-Dpn-t8ICRF-4-30',
    't0Mit_R1_T2': 'TI-MC3C-Dpn-t0Mit-4-30-T2',
    't0Mit_R2': 'TI-MC3C-Dpn-t0Mit-4-39',
    't2_R2': 'TI-MC3C-Dpn-t2-4-39',
    't4DMSO_R2': 'TI-MC3C-Dpn-t4DMSO-4-39',
    't4ICRF_R2': 'TI-MC3C-Dpn-t4ICRF-4-39',
    't8DMSO_R2': 'TI-MC3C-Dpn-t8DMSO-4-39',
    't8ICRF_R2': 'TI-MC3C-Dpn-t8ICRF-4-39',
    't0Mit_R3': 'TI-MC3C-Dpn-t0Mit-R3-5-14',
    't2_R3': 'TI-MC3C-Dpn-t2-R3-5-14',
    't4DMSO_R3': 'TI-MC3C-Dpn-t4DMSO-R3-5-14',
    't4ICRF_R3': 'TI-MC3C-Dpn-t4ICRF-R3-5-14',
    't8DMSO_R3': 'TI-MC3C-Dpn-t8DMSO-R3-5-14',
    't8ICRF_R3': 'TI-MC3C-Dpn-t8ICRF-R3-5-14',
}

# Ordered condition labels; every downstream cell loops over this list.
conditions = list(long_names)

#All using t8DMSO R1R2 eigen
# Eigenvector label -> name of the Hi-C library whose eigenvector file is loaded.
eigs_long_names = {
    't8DMSO_R1R2': 'TI-HiC-Dpn-HeLa-MitoticRelease-t8hr-DMSO-6hr-G1Sort-R1R2',
}
eigs_conds = list(eigs_long_names)

# Condition label -> eigenvector label used to assign compartments for that
# condition. Here every condition uses the same eigenvector track.
eigs_to_conditions = dict.fromkeys(conditions, 't8DMSO_R1R2')
#### This section (above) will change depending on specific samples ####

In [None]:
#load alignments
#Completely unaligned walks are dropped from this output, but walks may have unaligned steps which are just not shown...
#Only primary alignments are shown in this output

# Labels for the 18 columns of each alignment table: the first twelve get
# descriptive names, the remaining six keep the integer labels 13-18.
paf_columns = [
    'Query_Name', 'Query_Length', 'Query_Start', 'Query_End',
    'Strand', 'chrom', 'Target_ChrSize', 'start',
    'end', 'Match_Length', 'Alignment_Length', 'Mapping_Quality',
    13, 14, 15, 16, 17, 18,
]

# One alignment file per condition, named after the condition's long name.
output_filenames = {
    cond: f'{alignmentDir}/{long_names[cond]}.hg38.minimap2.output.paf'
    for cond in conditions
}

raw_walk_dfs = {}   # alignments in file order, with column names applied
sorted_dfs = {}     # same rows ordered by walk, then by position within the walk
for cond in conditions:
    #Read in walks as pandas dataframes (tab separated, no header row)
    alignments = pd.read_csv(output_filenames[cond], sep = '\t', header = None)
    alignments.columns = paf_columns
    raw_walk_dfs[cond] = alignments

    #Nicer to sort by position in walk instead of position in genome - fast! Don't need to group.
    sorted_dfs[cond] = (
        alignments
        .sort_values(['Query_Name', 'Query_Start'])
        .reset_index(drop = True)
    )


In [None]:
#Add the annotations needed for splitting walk by first_x read length
#  Fragment_Index  - 0-based position of the row within its walk (rows are already sorted by Query_Start)
#  Fragment_Number - total number of rows belonging to the walk
annotated_dfs = {}
for cond in conditions:
    # Keep only the first twelve (named) alignment columns.
    walks = sorted_dfs[cond].iloc[:, :12].copy()
    by_walk = walks.groupby(by = 'Query_Name')
    walks['Fragment_Index'] = by_walk.cumcount()

    # Per-walk row count, indexed by Query_Name, attached to every row of that walk.
    fragment_counts = by_walk.size().rename('Fragment_Number')
    annotated_dfs[cond] = walks.merge(fragment_counts, left_on = 'Query_Name', right_index = True)

In [None]:
#Split full walks based on number of fragments, run summary separately for each set
#Only using first 2, 3, 4, 5, 6, 7, 8, 9, 10 fragments of all walks
fragment_nums = list(range(2, 11))

# real_walks_firstx_length[cond]['length_<num>'] holds, for every walk with at
# least <num> fragments, only its first <num> fragments.
real_walks_firstx_length = {}
for cond in conditions:
    annotated = annotated_dfs[cond].copy()
    real_walks_firstx_length[cond] = {
        f'length_{num}': annotated[
            (annotated['Fragment_Number'] >= num) & (annotated['Fragment_Index'] < num)
        ]
        for num in fragment_nums
    }

In [None]:
#Annotate walks with largest step size - to use for intermingling/entanglement analysis - first X walks
#
# For every condition and first-X subset this adds, per fragment (row):
#   mid, dist, Abs_Dist   - fragment midpoint and (absolute) distance from the previous fragment;
#                           NaN on the first fragment of a walk and on chromosome changes
#   Trans_Step            - True when the step from the previous fragment changes chromosome
#   Largest_Step          - largest Abs_Dist within the walk
#   Largest_Step_Fragment_Start/End - whether the fragment is the first/second fragment of that step
#   mid_Largest_Step_Start/End, Distance_To_Largest_Step_Start/End
# Walks whose largest step is not unique (or that have no cis step at all) are dropped.
#
# All masking is done with whole-column expressions instead of
# `df[col].iloc[...] = value`: that chained form only works by writing through
# a cached view and silently does nothing under pandas Copy-on-Write.
stepsize_dfs = {}

for cond in conditions:
    stepsize_dfs[cond] = {}
    for num in fragment_nums:
        df = real_walks_firstx_length[cond][f'length_{num}'].copy()

        # Row-level boundaries relative to the previous row.
        starts_walk = df['Query_Name'] != df['Query_Name'].shift()
        changes_chrom = df['chrom'] != df['chrom'].shift()

        df['mid'] = (df['start'] + df['end'])//2
        # A distance is only defined between consecutive fragments of the same
        # walk that lie on the same chromosome.
        df['dist'] = df['mid'].diff().where(~(starts_walk | changes_chrom))

        #Adding in whether a step changes chromosomes (never on the first fragment of a walk)
        df['Trans_Step'] = changes_chrom & ~starts_walk

        #adding absolute distance of step as well
        df['Abs_Dist'] = df['dist'].abs()

        #size of largest step in walk
        largest_step = df.groupby(by = 'Query_Name')['Abs_Dist'].max().rename('Largest_Step')
        df = df.join(largest_step, on = 'Query_Name')

        #whether a fragment is part of largest step
        is_largest_step_end = df['Abs_Dist'] == df['Largest_Step']
        # The fragment before the end of the largest step is its start. The row
        # after a walk's last fragment is the first fragment of the next walk,
        # whose Abs_Dist is NaN, so nothing leaks between walks. fill_value
        # keeps the column boolean instead of leaving a NaN in the final row.
        df['Largest_Step_Fragment_Start'] = is_largest_step_end.shift(-1, fill_value = False)
        df['Largest_Step_Fragment_End'] = is_largest_step_end

        #Filter out reads where more than one step is the same size as the largest step - there are a few of these
        num_largest_steps = (
            df.groupby(by = 'Query_Name')['Largest_Step_Fragment_Start']
            .sum()
            .astype(int)
            .rename('Num_Largest_Steps')
        )
        df = df.join(num_largest_steps, on = 'Query_Name')
        df = df[df['Num_Largest_Steps'] == 1]

        #midpoint of start of largest step fragment (exactly one row per remaining walk)
        midpoint_start = (
            df.loc[df['Largest_Step_Fragment_Start'], ['Query_Name', 'mid']]
            .set_index('Query_Name', drop = True)['mid']
            .rename('mid_Largest_Step_Start')
        )

        #midpoint of end of largest step fragment
        midpoint_end = (
            df.loc[df['Largest_Step_Fragment_End'], ['Query_Name', 'mid']]
            .set_index('Query_Name', drop = True)['mid']
            .rename('mid_Largest_Step_End')
        )

        df = df.join(midpoint_start, on = 'Query_Name')
        df = df.join(midpoint_end, on = 'Query_Name')

        #absolute distance from each fragment to start of largest step
        df['Distance_To_Largest_Step_Start'] = (df['mid_Largest_Step_Start'] - df['mid']).abs()

        #absolute distance from each fragment to end of largest step
        df['Distance_To_Largest_Step_End'] = (df['mid_Largest_Step_End'] - df['mid']).abs()

        stepsize_dfs[cond][f'length_{num}'] = df
    
    

In [None]:
#Annotate walks - add compartment info for each fragment - t8 DMSO R1R2 eigens, from Hi-C of 2 replicates combined
#100kb eigens
#
# For each eigenvector label this builds:
#   eigs          - the eigenvector table as read from disk
#   eigsA / eigsB - bins with E1 > 0 / E1 < 0 (bins with E1 == 0 or missing are in neither)
#   eigsAB_Ranges - adjacent same-type bins merged into regions, with a 'comp_index'
#                   that is unique across A and B regions together
#   eigsA_Ranges / eigsB_Ranges - the A-only / B-only rows of eigsAB_Ranges
eigs = {}
eigsA = {}
eigsB = {}
eigsAB_Ranges = {}
eigsA_Ranges = {}
eigsB_Ranges = {}

for cond in eigs_conds:
    eigs[cond] = pd.read_csv(f'{compartmentDataDir}/{eigs_long_names[cond]}.100kb.mapq30.byarm.allchroms.eigs.cis.vecs.txt',
                  sep = '\t')

    #calling A vs B compartments in each sample
    eigsA[cond] = eigs[cond][eigs[cond]['E1'] > 0]
    eigsB[cond] = eigs[cond][eigs[cond]['E1'] < 0]

    #Merge adjacent bins into one compartment region
    merged_A = bioframe.merge(eigsA[cond][['chrom', 'start', 'end']], min_dist = 0).reset_index(drop = True)
    merged_A['comp_type'] = 'A'

    merged_B = bioframe.merge(eigsB[cond][['chrom', 'start', 'end']], min_dist = 0).reset_index(drop = True)
    merged_B['comp_type'] = 'B'

    #Combine A and B to get unique indices, then split again for overlaps.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The fresh 0..n-1 index becomes the 'comp_index' column.
    combined = pd.concat([merged_A, merged_B], ignore_index = True).reset_index()
    combined.columns = ['comp_index', 'chrom', 'start', 'end', 'n_intervals', 'comp_type']
    eigsAB_Ranges[cond] = combined

    eigsA_Ranges[cond] = combined[combined['comp_type'] == 'A']
    eigsB_Ranges[cond] = combined[combined['comp_type'] == 'B']

In [None]:
#Overlap A and B ranges with cwalk steps - assign each step as A or B based on overlap, change '0' to np.nan
#Add compartment index
#
# Per fragment this adds:
#   Frag_Comp_Type  - 'A' or 'B', or the integer 0 when the fragment overlaps neither
#                     (a fragment overlapping both ends up as 'B', which is assigned last)
#   Frag_Comp_Index - comp_index of the overlapped region, NaN when there is none
# and per walk:
#   Walk_Comp_Type  - 'A', 'B' or 'AB'; NaN if any fragment of the walk has type 0

overlap_dfs = {} #slow, but better here than doing compartment type again for each iteration
for cond in conditions:
    overlap_dfs[cond] = {}
    eig_cond = eigs_to_conditions[cond]
    for num in fragment_nums:
        df = stepsize_dfs[cond][f'length_{num}'].copy()
        overlapA = bioframe.overlap(df, eigsA_Ranges[eig_cond], how = 'left', suffixes = ('', '_A'),
                                keep_order = True)
        overlapAB = bioframe.overlap(overlapA, eigsB_Ranges[eig_cond], how = 'left', suffixes = ('', '_B'),
                                     keep_order = True)

        in_A = overlapAB['comp_index_A'].notna()
        in_B = overlapAB['comp_index_B'].notna()

        # Start from an object column so writing 'A'/'B' next to the 0
        # placeholder is not an incompatible-dtype assignment into an integer
        # column (deprecated in recent pandas). The stored values are unchanged.
        overlapAB['Frag_Comp_Type'] = pd.Series(0, index = overlapAB.index, dtype = object)
        overlapAB.loc[in_A, 'Frag_Comp_Type'] = 'A'
        overlapAB.loc[in_B, 'Frag_Comp_Type'] = 'B'

        overlapAB['Frag_Comp_Index'] = np.nan
        overlapAB.loc[in_A, 'Frag_Comp_Index'] = overlapAB['comp_index_A']
        overlapAB.loc[in_B, 'Frag_Comp_Index'] = overlapAB['comp_index_B']

        # Drop the helper columns contributed by the two overlaps.
        df = overlapAB.drop(labels = ['comp_index_A', 'chrom_A', 'start_A', 'end_A', 'n_intervals_A', 'comp_index_B', 'chrom_B',
                                      'start_B', 'end_B', 'n_intervals_B', 'comp_type_A', 'comp_type_B'], axis = 1)

        #Compartment Type of entire walk - A, B, AB, NA
        # Sorted, concatenated set of fragment types per walk, e.g. 'A', 'AB', '0B'.
        walk_types = (
            df.groupby(by = 'Query_Name')['Frag_Comp_Type']
            .agg(lambda types: ''.join(sorted(str(t) for t in types.unique())))
            .rename('Walk_Comp_Type')
            .reset_index()
        )
        # Any walk containing an unassigned fragment gets no walk-level type.
        walk_types.loc[walk_types['Walk_Comp_Type'].isin(['0', '0A', '0B', '0AB']), 'Walk_Comp_Type'] = np.nan
        df = df.merge(walk_types, on = 'Query_Name')
        overlap_dfs[cond][f'length_{num}'] = df

In [None]:
#next add annotations to each fragment and summarize walks
#
# Inter_Comp_Type_Step / Inter_Comp_Index_Step flag whether the step from the
# previous fragment changed compartment type / compartment index. The first
# fragment of each walk has no incoming step and gets NaN.
# Series.where replaces the chained `df[col].iloc[...] = np.nan`, which only
# works by writing through a cached view and is a silent no-op under pandas
# Copy-on-Write.
overlap_dfs2 = {}
for cond in conditions:
    overlap_dfs2[cond] = {}
    for num in fragment_nums:
        df = overlap_dfs[cond][f'length_{num}'].copy()

        starts_walk = df['Query_Name'] != df['Query_Name'].shift()

        #Adding in whether a step changes compartment type, or compartment index
        type_changed = df['Frag_Comp_Type'] != df['Frag_Comp_Type'].shift()
        df['Inter_Comp_Type_Step'] = type_changed.where(~starts_walk)

        # NOTE(review): NaN != NaN is True, so two consecutive fragments without
        # a compartment index count as an index change - confirm this is intended.
        index_changed = df['Frag_Comp_Index'] != df['Frag_Comp_Index'].shift()
        df['Inter_Comp_Index_Step'] = index_changed.where(~starts_walk)

        overlap_dfs2[cond][f'length_{num}'] = df


In [None]:
#next add annotations to each fragment based on how far fragment is from either side of largest step
#
# Near_Largest_Step_Start_Step / Near_Largest_Step_End_Step: the fragment lies
# within a quarter of the largest step size (floor division) of that side.
# Inter_Largest_Step_Side_Step: whether the near-start flag differs from the
# previous fragment's; NaN for the first fragment of a walk and for fragments
# that are near neither side.
# Series.where replaces the chained `df[col].iloc[...] = np.nan`, which only
# works by writing through a cached view and is a silent no-op under pandas
# Copy-on-Write.
overlap_dfs3 = {}
for cond in conditions:
    overlap_dfs3[cond] = {}
    for num in fragment_nums:
        df = overlap_dfs2[cond][f'length_{num}'].copy()

        quarter_step = df['Largest_Step']//4

        #is the step within 1/4 of largest step size from start of the largest step?
        near_start = df['Distance_To_Largest_Step_Start'] < quarter_step
        df['Near_Largest_Step_Start_Step'] = near_start

        #is the step within 1/4 of largest step size from end of the largest step?
        near_end = df['Distance_To_Largest_Step_End'] < quarter_step
        df['Near_Largest_Step_End_Step'] = near_end

        #If the fragment is close to one end of the largest step, was the step between the two regions, or within one region?
        starts_walk = df['Query_Name'] != df['Query_Name'].shift()
        side_changed = near_start != near_start.shift()
        df['Inter_Largest_Step_Side_Step'] = side_changed.where(~starts_walk & (near_start | near_end))

        overlap_dfs3[cond][f'length_{num}'] = df

In [None]:
def _summarize_walk_fragments(fragments):
    """Collapse a per-fragment table into one summary row per walk.

    `fragments` is one of the annotated first-X tables; the returned frame is
    indexed by Query_Name and holds the per-walk counts, span and step totals
    listed in the inline comments below.
    """
    by_walk = fragments.groupby('Query_Name')
    summary = pd.DataFrame()

    #Number of fragments in walk close to the two sides of the largest step
    summary['Near_Largest_Step_Start_Frag_Num'] = by_walk['Near_Largest_Step_Start_Step'].sum().astype(int)
    summary['Near_Largest_Step_End_Frag_Num'] = by_walk['Near_Largest_Step_End_Step'].sum().astype(int)

    #Number of steps between sides of largest step
    summary['Inter_Largest_Step_Side_Step_Num'] = by_walk['Inter_Largest_Step_Side_Step'].sum().astype(int)

    #Indicate how many fragments are close to one of the two sides
    summary['Near_Largest_Step_Either_Side_Frag_Num'] = (
        summary['Near_Largest_Step_Start_Frag_Num'] + summary['Near_Largest_Step_End_Frag_Num']
    )

    #Number of chromosomes, compartment types and compartment indices visited
    summary['Chrom_Number'] = by_walk['chrom'].nunique()
    summary['Comp_Type_Number'] = by_walk['Frag_Comp_Type'].nunique()
    summary['Comp_Index_Number'] = by_walk['Frag_Comp_Index'].nunique()

    # Walks touching more than one chromosome have no meaningful span or
    # summed distance; those entries are blanked below.
    multi_chrom = summary['Chrom_Number'] > 1

    #span, cis walks only
    summary['MaxCoord'] = by_walk['end'].max()
    summary['MinCoord'] = by_walk['start'].min()
    summary['Span'] = summary['MaxCoord'] - summary['MinCoord']
    summary.loc[multi_chrom, 'Span'] = np.nan

    #Sum of distances, cis walks only
    summary['Sum_Dists'] = by_walk['dist'].apply(lambda steps: np.sum(np.abs(steps)))
    summary.loc[multi_chrom, 'Sum_Dists'] = np.nan

    #Number of inter chromosomal steps
    summary['Trans_Steps'] = by_walk['Trans_Step'].sum().astype(int)

    #Number of inter compartment type steps
    summary['Inter_Compartment_Type_Steps'] = by_walk['Inter_Comp_Type_Step'].sum().astype(int)

    #Number of inter compartment index steps in walk - also includes switches between compartment types
    summary['Inter_Compartment_Index_Steps'] = by_walk['Inter_Comp_Index_Step'].sum().astype(int)

    return summary


# summarized_walks[cond]['length_<num>'] is the per-walk summary of the
# matching annotated fragment table.
summarized_walks = {
    cond: {
        f'length_{num}': _summarize_walk_fragments(overlap_dfs3[cond][f'length_{num}'])
        for num in fragment_nums
    }
    for cond in conditions
}


In [None]:
#Merge full walks with summarized walks for all walk lengths to save
# Every fragment row gets the summary columns of its walk, matched on Query_Name.
full_walks_with_summary = {
    cond: {
        f'length_{num}': overlap_dfs3[cond][f'length_{num}'].merge(
            summarized_walks[cond][f'length_{num}'], on = 'Query_Name'
        )
        for num in fragment_nums
    }
    for cond in conditions
}


In [None]:
def _largest_group_fraction(fragments, group_col, num, out_col):
    """Per walk, the largest number of fragments sharing one `group_col` value, divided by `num`.

    Returns a two-column frame: 'Query_Name' and `out_col`.
    """
    per_group = fragments.groupby(['Query_Name', group_col]).agg(
        {'Query_Length' : 'count',
         'Fragment_Number' : 'mean'})
    per_walk = per_group.reset_index().groupby('Query_Name').max().reset_index()
    per_walk[out_col] = per_walk['Query_Length']/num
    return per_walk[['Query_Name', out_col]]


# Adds three per-walk fraction columns to every fragment table. Each merge is
# on Query_Name with the default join, so only walks present in every helper
# table are kept.
real_walks_with_summary_firstx_length_fractions = {}
for cond in conditions:
    real_walks_with_summary_firstx_length_fractions[cond] = {}
    for num in fragment_nums:
        walks = full_walks_with_summary[cond][f'length_{num}']

        #Add max fraction of fragments within one compartment index
        comp_index_fraction = _largest_group_fraction(
            walks, 'Frag_Comp_Index', num, 'Max_OneCompIndex_FracOfFragments')

        #Add max fraction of fragments within one chromosome
        chrom_fraction = _largest_group_fraction(
            walks, 'chrom', num, 'Max_OneChrom_FracOfFragments')

        #add max fraction of fragments near to one side of largest step
        side_counts = walks.groupby(['Query_Name']).agg({
            'Near_Largest_Step_Start_Frag_Num' : 'mean',
            'Near_Largest_Step_End_Frag_Num' : 'mean'}).reset_index()
        side_counts['Max_NearOneLargestStepEnd_FracOfFragments'] = side_counts[
            ['Near_Largest_Step_Start_Frag_Num', 'Near_Largest_Step_End_Frag_Num']].max(axis = 1)/num

        real_walks_with_summary_firstx_length_fractions[cond][f'length_{num}'] = (
            walks
            .merge(comp_index_fraction, on = 'Query_Name')
            .merge(chrom_fraction, on = 'Query_Name')
            .merge(side_counts[['Query_Name', 'Max_NearOneLargestStepEnd_FracOfFragments']], on = 'Query_Name')
        )

In [None]:
#Make the walk-level summary from real_walks_with_summary_firstx_length_fractions:
#keep the columns below and drop duplicate rows
summary_columns = [
    'Query_Name',
    'chrom',
    'Fragment_Number',
    'Chrom_Number',
    'Comp_Type_Number',
    'Comp_Index_Number',
    'Near_Largest_Step_Either_Side_Frag_Num',
    'MaxCoord',
    'MinCoord',
    'Largest_Step',
    'Span',
    'Sum_Dists',
    'Walk_Comp_Type',
    'Trans_Steps',
    'Inter_Compartment_Type_Steps',
    'Inter_Compartment_Index_Steps',
    'Inter_Largest_Step_Side_Step_Num',
    'Max_OneCompIndex_FracOfFragments',
    'Max_OneChrom_FracOfFragments',
    'Max_NearOneLargestStepEnd_FracOfFragments',
]

real_walks_summarized_firstx_length = {
    cond: {
        f'length_{num}': (
            real_walks_with_summary_firstx_length_fractions[cond][f'length_{num}'][summary_columns]
            .drop_duplicates()
        )
        for num in fragment_nums
    }
    for cond in conditions
}

In [None]:
#Per-fragment tables restricted to the columns below, taken from
#real_walks_with_summary_firstx_length_fractions (saved later for permutations)
permutation_columns = [
    'Query_Name', 'Query_Length', 'Query_Start', 'Query_End', 'Strand',
    'chrom', 'Target_ChrSize', 'start', 'end', 'Match_Length',
    'Alignment_Length', 'Mapping_Quality', 'mid', 'Frag_Comp_Type', 'Frag_Comp_Index',
    'Walk_Comp_Type', 'Chrom_Number', 'Comp_Type_Number', 'Comp_Index_Number', 'MaxCoord', 'MinCoord',
    'Span', 'Max_OneCompIndex_FracOfFragments',
    'Max_OneChrom_FracOfFragments',
]

overlap_df_for_permutations = {
    cond: {
        f'length_{num}': real_walks_with_summary_firstx_length_fractions[cond][f'length_{num}'][permutation_columns]
        for num in fragment_nums
    }
    for cond in conditions
}

In [None]:
#Save these so I don't have to make them again...as pickle, entire dict
# `with` closes each file even if pickle.dump raises.

with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_with_summary_firstx_length_fractions_dict_100kbEigs.pkl', 'wb') as f:
    pickle.dump(real_walks_with_summary_firstx_length_fractions, f)


#save overlap dfs of fragments with compartment and bin index overlaps here - these will be used for permutations

with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_full_overlap_df_dict_100kbEigs.pkl', 'wb') as f:
    pickle.dump(overlap_df_for_permutations, f)

In [None]:
# Save the per-walk summary dict; `with` closes the file even if pickle.dump raises.
with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_filtered_summarized_firstx_length_dict_100kbEigs.pkl', 'wb') as f:
    pickle.dump(real_walks_summarized_firstx_length, f)

In [None]:
#make annotated txt file for GEO upload

In [None]:
#Read in pickled files
# The save cell writes this file under '{outDataDir}/data/', so it is read
# from the same place (the previous path omitted 'data/' and did not match).
# pickle.load can execute arbitrary code - only load files produced by this notebook.

with open(f'{outDataDir}/data/220517_MRICRF_R1R2R3_real_walks_with_summary_firstx_length_fractions_dict_100kbEigs.pkl', 'rb') as f:
    real_walks_with_summary_firstx = pickle.load(f)

In [None]:
# Stack every (condition, first-X length) table into one long table, tagging
# each row with the table it came from. The tag columns are written onto the
# loaded tables themselves, as before.
# The frames are collected and concatenated once: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copies all rows each time.
walk_frames = []

for cond in conditions:
    for length, walks in real_walks_with_summary_firstx[cond].items():
        walks['FirstXLength'] = length
        walks['Condition'] = cond
        walk_frames.append(walks)

# pd.concat raises on an empty list; keep the empty-frame result in that case.
all_walks_df = pd.concat(walk_frames, ignore_index = True) if walk_frames else pd.DataFrame()

In [None]:
# Write the combined per-fragment table as tab-separated text without the index column.
annotated_table_path = f'{outDataDir}/220517_MRICRF_MC3C_FirstXFragmentsPerWalk_Annotated.txt'
all_walks_df.to_csv(annotated_table_path, index = False, sep = '\t')

In [None]:
# Compress the annotated text table written by the previous cell (same path).
# `!` runs the command in a system shell and `$df_file` is filled in by IPython,
# so this line only works inside IPython/Jupyter, not as plain Python.
# NOTE(review): the path is passed unquoted, and gzip presumably refuses to
# overwrite an existing .gz on a re-run - verify before relying on this cell.
df_file = f'{outDataDir}/220517_MRICRF_MC3C_FirstXFragmentsPerWalk_Annotated.txt'
!gzip $df_file