In [2]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_colwidth', None)

In [3]:
def bc(df_s1,df_s2):
    diff = abs(df_s1 - df_s2)
    sm = df_s1 + df_s2
    bc = diff.sum()/sm.sum()
    return bc 

### BC distance metric
Creates multiple similarity matrices based on percentage of unique barcode indel pairs to use

In [36]:
import math
# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]
            #Calculate the BC distance
            matrix_r1.loc[id,col] = bc(barcode_indel_frq1_filtered_r1,barcode_indel_frq2_filtered_r1)
    frames.append(matrix_r1)

In [37]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,0.0,0.453451,0.324267,0.359891,0.407607,0.519511,0.4422,0.61417,0.357024,0.391087,0.368008
C1-1F_R1_001,0.405297,0.0,0.269368,0.25427,0.339892,0.449869,0.297995,0.643667,0.35371,0.403372,0.31808
C1-1Re_R1_001,0.338116,0.277197,0.0,0.072223,0.439873,0.501426,0.264863,0.622775,0.296904,0.361022,0.301709
C1-1_R1_001,0.370347,0.257746,0.0726,0.0,0.349532,0.504314,0.261502,0.620164,0.312979,0.351576,0.276085
C1-5_R1_001,0.418701,0.325573,0.439548,0.330413,0.0,0.601374,0.43899,0.686899,0.3276,0.488079,0.370811
C2-10_R1_001,0.561962,0.463347,0.492716,0.488306,0.56684,0.0,0.548094,0.542068,0.481714,0.74266,0.521739
C2-1_R1_001,0.363267,0.285367,0.230847,0.257929,0.398235,0.538809,0.0,0.619496,0.31241,0.394539,0.39786
C2-5_R1_001,0.616284,0.637917,0.617905,0.616753,0.672524,0.551407,0.638207,0.0,0.649229,0.726309,0.647361
D1-10_R1_001,0.371933,0.353941,0.288431,0.30261,0.325239,0.488831,0.294669,0.625551,0.0,0.489726,0.343749
D1-1_R1_001,0.332606,0.370007,0.342994,0.339572,0.414356,0.730905,0.399463,0.719508,0.490618,0.0,0.407752


### Aggregate BC distance metric
Version of previous that is more resistant to noise in the data. Smooths data through aggregating.

In [2]:
from weighted_levenshtein import levenshtein
levenshtein("abc","bca")

2.0

In [5]:
import math
# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

agg = 5

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq1 = barcode_indel_frq1[0:math.floor(len(barcode_indel_frq1)*percent*0.01)]
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            barcode_indel_frq2 = barcode_indel_frq2[0:math.floor(len(barcode_indel_frq2)*percent*0.01)]
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Create a new index with aggregating every four indices
            new_index = []
            for i in range(len(barcode_indel_frq1_filtered_r1.index)):
                if i % agg == 0:
                    # Turn mulitindex into string
                    new_index.append(barcode_indel_frq1_filtered_r1.index[i][0]) # turning multiindex into string
            # Create dataframe with new index initialized with 0
            aggregate_barcode_indel_frq1_filtered = pd.DataFrame(0, index=new_index, columns=['frequency'])
            aggregate_barcode_indel_frq2_filtered = pd.DataFrame(0, index=new_index, columns=['frequency'])
            for i in range(len(new_index)):
                value1 = 0
                value2 = 0
                for j in range(agg):
                    if agg*i+j < len(barcode_indel_frq1_filtered_r1):
                        value1 += barcode_indel_frq1_filtered_r1.iloc[agg*i+j]
                        value2 += barcode_indel_frq2_filtered_r1.iloc[agg*i+j]
                aggregate_barcode_indel_frq1_filtered.iloc[i] = value1
                aggregate_barcode_indel_frq2_filtered.iloc[i] = value2
            #Calculate the BC distance
            bc_value = bc(aggregate_barcode_indel_frq1_filtered,aggregate_barcode_indel_frq2_filtered)[0]
            matrix_r1.loc[id,col] = bc_value
    frames.append(matrix_r1)

In [6]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,0.0,0.431943,0.397286,0.409579,0.538661,0.346191,0.46841,0.401452,0.440582,0.577002,0.375383
C1-1F_R1_001,0.294089,0.0,0.242367,0.257716,0.404992,0.336444,0.26909,0.451433,0.304605,0.471261,0.268862
C1-1Re_R1_001,0.277876,0.171508,0.0,0.032131,0.462459,0.257859,0.23841,0.489486,0.343485,0.439015,0.262574
C1-1_R1_001,0.264428,0.143339,0.032069,0.0,0.399548,0.289235,0.214352,0.501699,0.327126,0.431548,0.22451
C1-5_R1_001,0.277549,0.247798,0.193374,0.217094,0.0,0.254271,0.346532,0.423134,0.324721,0.414108,0.187476
C2-10_R1_001,0.519463,0.494769,0.530593,0.540064,0.629791,0.0,0.55963,0.255599,0.545659,0.775125,0.522233
C2-1_R1_001,0.37522,0.130713,0.184679,0.192125,0.45214,0.390216,0.0,0.528008,0.204976,0.380313,0.267099
C2-5_R1_001,0.58731,0.626661,0.569709,0.581387,0.730881,0.484392,0.605532,0.0,0.671189,0.711602,0.604019
D1-10_R1_001,0.279659,0.152269,0.235734,0.260127,0.275499,0.283609,0.181474,0.494635,0.0,0.457521,0.223269
D1-1_R1_001,0.350374,0.272468,0.201334,0.203713,0.221364,0.528934,0.175132,0.607789,0.319028,0.0,0.325331


### Kendall Tau
Metric for comparing two rankings

In [5]:
import math
import pandas as pd
import seaborn as sns
from scipy.stats import weightedtau

# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]

            #Calculate the weighted tau
            matrix_r1.loc[id,col] = weightedtau(barcode_indel_frq1_filtered_r1,barcode_indel_frq2_filtered_r1)[0]
    frames.append(matrix_r1)

In [109]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,1.0,0.472849,0.64026,0.6572,0.800416,0.674595,0.486637,0.621126,0.686787,0.738402,0.44885
C1-1F_R1_001,0.615845,1.0,0.751011,0.75772,0.855738,0.807627,0.742339,0.700151,0.780522,0.860652,0.695535
C1-1Re_R1_001,0.652664,0.760967,1.0,0.916399,0.657629,0.762372,0.772991,0.658046,0.746461,0.851396,0.724536
C1-1_R1_001,0.654864,0.744182,0.92267,1.0,0.78346,0.781478,0.731862,0.683366,0.73877,0.854597,0.732366
C1-5_R1_001,0.72059,0.770203,0.589381,0.747912,1.0,0.5678,0.674427,0.491958,0.752676,0.588598,0.635074
C2-10_R1_001,0.587072,0.674352,0.66453,0.750109,0.457047,1.0,0.683617,0.614105,0.747208,0.518691,0.534587
C2-1_R1_001,0.547011,0.748528,0.77834,0.745242,0.805473,0.726581,1.0,0.607423,0.780899,0.834816,0.706101
C2-5_R1_001,0.760721,0.722588,0.75067,0.750818,0.736674,0.713442,0.648692,1.0,0.655079,0.830726,0.723113
D1-10_R1_001,0.712094,0.75055,0.753712,0.715985,0.834579,0.777621,0.808779,0.58001,1.0,0.819621,0.708913
D1-1_R1_001,0.64764,0.732489,0.769195,0.767235,0.522683,0.571582,0.713077,0.638539,0.663216,1.0,0.730677


### weighted RBO

In [54]:
import math
import numpy as np

def rbo(S,T, p= 0.9):
    """ Takes two lists S and T of any lengths and gives out the RBO Score
    Parameters
    ----------
    S, T : Lists (str, integers)
    p : Weight parameter, giving the influence of the first d
        elements on the final score. p<0<1. Default 0.9 give the top 10 
        elements 86% of the contribution in the final score.
    
    Returns
    -------
    Float of RBO score
    """
    
    # Fixed Terms
    k = max(len(S), len(T))
    x_k = len(set(S).intersection(set(T)))
    
    summation_term = 0

    # Loop for summation
    # k+1 for the loop to reach the last element (at k) in the bigger list    
    for d in range (1, k+1): 
            # Create sets from the lists
            set1 = set(S[:d]) if d < len(S) else set(S)
            set2 = set(T[:d]) if d < len(T) else set(T)
            
            # Intersection at depth d
            x_d = len(set1.intersection(set2))

            # Agreement at depth d
            a_d = x_d/d   
            
            # Summation
            summation_term = summation_term + math.pow(p, d) * a_d

    # Rank Biased Overlap - extrapolated
    rbo_ext = (x_k/k) * math.pow(p, k) + ((1-p)/p * summation_term)

    return rbo_ext

def weightage_calculator(p,d):
    """ 
    Takes values of p and d
    ----------
    p : Weight parameter, giving the influence of the first d
        elements on the final score. p<0<1.
    d : depth at which the weight has to be calculated
    
    Returns
    -------
    Float of Weightage Wrbo at depth d
    """

    summation_term = 0

    for i in range (1, d): # taking d here will loop upto the value d-1 
        summation_term = summation_term + math.pow(p,i)/i


    Wrbo_1_d = 1 - math.pow(p, d-1) + (((1-p)/p) * d *(np.log(1/(1-p)) - summation_term))

    return Wrbo_1_d

In [67]:
import math
import pandas as pd
import seaborn as sns
from scipy.stats import weightedtau

# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.sort_values(ascending=False)
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Calculate the weighted tau
            matrix_r1.loc[id,col] = rbo(barcode_indel_frq1_filtered_r1.index,barcode_indel_frq2_filtered_r1.index)
    frames.append(matrix_r1)

In [68]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,1.0,0.512796,0.513822,0.517843,0.433245,0.399854,0.494464,0.466412,0.483566,0.438293,0.462574
C1-1F_R1_001,0.471879,1.0,0.588837,0.640916,0.532742,0.580843,0.58431,0.480106,0.513868,0.570107,0.564045
C1-1Re_R1_001,0.675959,0.588994,1.0,0.916047,0.379262,0.620069,0.693025,0.57123,0.637834,0.648632,0.718184
C1-1_R1_001,0.608525,0.6353,0.915409,1.0,0.486458,0.660267,0.679877,0.581656,0.632697,0.608701,0.694824
C1-5_R1_001,0.343767,0.423246,0.43993,0.435862,1.0,0.340338,0.406673,0.365482,0.443717,0.411912,0.393134
C2-10_R1_001,0.547796,0.645823,0.715519,0.76213,0.675871,1.0,0.634184,0.63686,0.640755,0.587051,0.607946
C2-1_R1_001,0.592425,0.556473,0.72625,0.705361,0.41204,0.523114,1.0,0.587972,0.691683,0.617754,0.626164
C2-5_R1_001,0.538514,0.428056,0.545324,0.535432,0.553132,0.559898,0.572447,1.0,0.559492,0.407146,0.546176
D1-10_R1_001,0.596673,0.534211,0.692437,0.690117,0.491553,0.566602,0.69537,0.620128,1.0,0.485058,0.643105
D1-1_R1_001,0.574961,0.582565,0.644544,0.586843,0.476607,0.484814,0.589219,0.501517,0.475093,1.0,0.588107


In [None]:
import math
import pandas as pd
import seaborn as sns
from scipy.stats import weightedtau

# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.sort_values(ascending=False)
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Calculate the weighted tau
            matrix_r1.loc[id,col] = rbo(barcode_indel_frq1_filtered_r1.index,barcode_indel_frq2_filtered_r1.index)
    frames.append(matrix_r1)



In [None]:
frames[0]

In [None]:
# https://github.com/zauri/clustering look at this for clustering