In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_colwidth', None)

In [2]:
def bc(df_s1,df_s2):
    diff = abs(df_s1 - df_s2)
    sm = df_s1 + df_s2
    bc = diff.sum()/sm.sum()
    return bc 

### BC distance metric
Creates multiple similarity matrices based on percentage of unique barcode indel pairs to use

In [36]:
import math
# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            #Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter based on percent
            barcode_indel_frq1 = barcode_indel_frq1[0:math.floor(len(barcode_indel_frq1)*percent*0.01)]
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            #Calculate the BC distance
            matrix_r1.loc[id,col] = bc(barcode_indel_frq1_filtered_r1,barcode_indel_frq2_filtered_r1)
    frames.append(matrix_r1)

In [37]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,0.0,0.453451,0.324267,0.359891,0.407607,0.519511,0.4422,0.61417,0.357024,0.391087,0.368008
C1-1F_R1_001,0.405297,0.0,0.269368,0.25427,0.339892,0.449869,0.297995,0.643667,0.35371,0.403372,0.31808
C1-1Re_R1_001,0.338116,0.277197,0.0,0.072223,0.439873,0.501426,0.264863,0.622775,0.296904,0.361022,0.301709
C1-1_R1_001,0.370347,0.257746,0.0726,0.0,0.349532,0.504314,0.261502,0.620164,0.312979,0.351576,0.276085
C1-5_R1_001,0.418701,0.325573,0.439548,0.330413,0.0,0.601374,0.43899,0.686899,0.3276,0.488079,0.370811
C2-10_R1_001,0.561962,0.463347,0.492716,0.488306,0.56684,0.0,0.548094,0.542068,0.481714,0.74266,0.521739
C2-1_R1_001,0.363267,0.285367,0.230847,0.257929,0.398235,0.538809,0.0,0.619496,0.31241,0.394539,0.39786
C2-5_R1_001,0.616284,0.637917,0.617905,0.616753,0.672524,0.551407,0.638207,0.0,0.649229,0.726309,0.647361
D1-10_R1_001,0.371933,0.353941,0.288431,0.30261,0.325239,0.488831,0.294669,0.625551,0.0,0.489726,0.343749
D1-1_R1_001,0.332606,0.370007,0.342994,0.339572,0.414356,0.730905,0.399463,0.719508,0.490618,0.0,0.407752


Optional:
Change index of matrix to user defined index

In [None]:
names = list(frames[0].index)
names

In [None]:
# Change order from output above
names = [
 'C1-10_R1_001',
 'C1-1F_R1_001',
 'C1-1Re_R1_001',
 'C1-1_R1_001',
 'C1-5_R1_001',
 'C2-10_R1_001',
 'C2-1_R1_001',
 'C2-5_R1_001',
 'D1-10_R1_001',
 'D1-1_R1_001',
 'D1-5_R1_001']
# Set index and column names to new order
frames[0].set_axis(names, axis=0, inplace=True)
frames[0].set_axis(names, axis=1, inplace=True)
frames[0]

### Aggregate BC distance metric
Version of previous that is more resistant to noise in the data. Smooths data through aggregating.

In [3]:
import math
# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

agg = 4

for percent in range(15,20,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq1 = barcode_indel_frq1[0:math.floor(len(barcode_indel_frq1)*percent*0.01)]
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            barcode_indel_frq2 = barcode_indel_frq2[0:math.floor(len(barcode_indel_frq2)*percent*0.01)]
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter based on percent
            barcode_indel_frq1 = barcode_indel_frq1[0:math.floor(len(barcode_indel_frq1)*percent*0.01)]
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Create a new index with aggregating every four indices
            new_index = []
            for i in range(len(barcode_indel_frq1_filtered_r1.index)):
                if i % agg == 0:
                    # Turn mulitindex into string
                    new_index.append(barcode_indel_frq1_filtered_r1.index[i][0]) # turning multiindex into string
            # Create dataframe with new index initialized with 0
            aggregate_barcode_indel_frq1_filtered = pd.DataFrame(0, index=new_index, columns=['frequency'])
            aggregate_barcode_indel_frq2_filtered = pd.DataFrame(0, index=new_index, columns=['frequency'])
            for i in range(len(new_index)):
                value1 = 0
                value2 = 0
                for j in range(agg):
                    if agg*i+j < len(barcode_indel_frq1_filtered_r1):
                        value1 += barcode_indel_frq1_filtered_r1.iloc[agg*i+j]
                        value2 += barcode_indel_frq2_filtered_r1.iloc[agg*i+j]
                aggregate_barcode_indel_frq1_filtered.iloc[i] = value1
                aggregate_barcode_indel_frq2_filtered.iloc[i] = value2
            #Calculate the BC distance
            bc_value = bc(aggregate_barcode_indel_frq1_filtered,aggregate_barcode_indel_frq2_filtered)[0]
            matrix_r1.loc[id,col] = bc_value
    frames.append(matrix_r1)

In [4]:
frames[0]

Unnamed: 0,F0-FT-repeat_R1_001,F0-FT_R1_001,F0-Pe-repeat_R1_001,F0-Pe_R1_001,G0-FT-repeat_R1_001,G0-FT_R1_001,G0-Pe-repeat_R1_001,G0-Pe_R1_001,H0-FT-repeat_R1_001,H0-FT_R1_001,H0-Pe-repeat_R1_001,H0-Pe_R1_001
F0-FT-repeat_R1_001,0.0,0.181447,0.349681,0.324166,0.382393,0.388518,0.353905,0.36095,0.300522,0.310651,0.321713,0.330972
F0-FT_R1_001,0.179618,0.0,0.335042,0.328797,0.412079,0.398358,0.351619,0.349796,0.340914,0.330677,0.339755,0.352222
F0-Pe-repeat_R1_001,0.276051,0.293362,0.0,0.144013,0.339777,0.320899,0.296844,0.277556,0.271117,0.284968,0.279078,0.275808
F0-Pe_R1_001,0.307989,0.325879,0.16454,0.0,0.359719,0.359519,0.276411,0.270759,0.263995,0.273946,0.244804,0.257032
G0-FT-repeat_R1_001,0.323521,0.333161,0.310228,0.294768,0.0,0.201506,0.299841,0.283957,0.289708,0.283397,0.261176,0.256901
G0-FT_R1_001,0.348711,0.367096,0.312639,0.283852,0.190492,0.0,0.29247,0.281264,0.288925,0.272061,0.273483,0.274197
G0-Pe-repeat_R1_001,0.372962,0.378745,0.344857,0.310444,0.396423,0.390072,0.0,0.20046,0.260353,0.274029,0.312432,0.321
G0-Pe_R1_001,0.34984,0.353855,0.317526,0.282538,0.385404,0.372829,0.203461,0.0,0.261807,0.258322,0.292555,0.298966
H0-FT-repeat_R1_001,0.469665,0.466926,0.453979,0.426769,0.509281,0.493219,0.410422,0.425771,0.0,0.127034,0.368942,0.379565
H0-FT_R1_001,0.478243,0.473244,0.469816,0.433022,0.511901,0.498292,0.43557,0.438148,0.134492,0.0,0.400177,0.414449


In [5]:
frames[0].to_csv('bc_distance_15.csv')

### Kendall Tau
Metric for comparing two rankings

In [5]:
import math
import pandas as pd
import seaborn as sns
from scipy.stats import weightedtau

# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]

            #Calculate the weighted tau
            matrix_r1.loc[id,col] = weightedtau(barcode_indel_frq1_filtered_r1,barcode_indel_frq2_filtered_r1)[0]
    frames.append(matrix_r1)

In [109]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,1.0,0.472849,0.64026,0.6572,0.800416,0.674595,0.486637,0.621126,0.686787,0.738402,0.44885
C1-1F_R1_001,0.615845,1.0,0.751011,0.75772,0.855738,0.807627,0.742339,0.700151,0.780522,0.860652,0.695535
C1-1Re_R1_001,0.652664,0.760967,1.0,0.916399,0.657629,0.762372,0.772991,0.658046,0.746461,0.851396,0.724536
C1-1_R1_001,0.654864,0.744182,0.92267,1.0,0.78346,0.781478,0.731862,0.683366,0.73877,0.854597,0.732366
C1-5_R1_001,0.72059,0.770203,0.589381,0.747912,1.0,0.5678,0.674427,0.491958,0.752676,0.588598,0.635074
C2-10_R1_001,0.587072,0.674352,0.66453,0.750109,0.457047,1.0,0.683617,0.614105,0.747208,0.518691,0.534587
C2-1_R1_001,0.547011,0.748528,0.77834,0.745242,0.805473,0.726581,1.0,0.607423,0.780899,0.834816,0.706101
C2-5_R1_001,0.760721,0.722588,0.75067,0.750818,0.736674,0.713442,0.648692,1.0,0.655079,0.830726,0.723113
D1-10_R1_001,0.712094,0.75055,0.753712,0.715985,0.834579,0.777621,0.808779,0.58001,1.0,0.819621,0.708913
D1-1_R1_001,0.64764,0.732489,0.769195,0.767235,0.522683,0.571582,0.713077,0.638539,0.663216,1.0,0.730677


### weighted RBO

In [54]:
import math
import numpy as np

def rbo(S,T, p= 0.9):
    """ Takes two lists S and T of any lengths and gives out the RBO Score
    Parameters
    ----------
    S, T : Lists (str, integers)
    p : Weight parameter, giving the influence of the first d
        elements on the final score. p<0<1. Default 0.9 give the top 10 
        elements 86% of the contribution in the final score.
    
    Returns
    -------
    Float of RBO score
    """
    
    # Fixed Terms
    k = max(len(S), len(T))
    x_k = len(set(S).intersection(set(T)))
    
    summation_term = 0

    # Loop for summation
    # k+1 for the loop to reach the last element (at k) in the bigger list    
    for d in range (1, k+1): 
            # Create sets from the lists
            set1 = set(S[:d]) if d < len(S) else set(S)
            set2 = set(T[:d]) if d < len(T) else set(T)
            
            # Intersection at depth d
            x_d = len(set1.intersection(set2))

            # Agreement at depth d
            a_d = x_d/d   
            
            # Summation
            summation_term = summation_term + math.pow(p, d) * a_d

    # Rank Biased Overlap - extrapolated
    rbo_ext = (x_k/k) * math.pow(p, k) + ((1-p)/p * summation_term)

    return rbo_ext

def weightage_calculator(p,d):
    """ 
    Takes values of p and d
    ----------
    p : Weight parameter, giving the influence of the first d
        elements on the final score. p<0<1.
    d : depth at which the weight has to be calculated
    
    Returns
    -------
    Float of Weightage Wrbo at depth d
    """

    summation_term = 0

    for i in range (1, d): # taking d here will loop upto the value d-1 
        summation_term = summation_term + math.pow(p,i)/i


    Wrbo_1_d = 1 - math.pow(p, d-1) + (((1-p)/p) * d *(np.log(1/(1-p)) - summation_term))

    return Wrbo_1_d

In [67]:
import math
import pandas as pd
import seaborn as sns
from scipy.stats import weightedtau

# Get all file names in Step8
samples = [f[:f.find('.fastq')] for f in listdir('./Step8/') if isfile(join('./Step8/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

for percent in range(20,25,5):
    names.append(f'{percent}%')
    # Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
    matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
    #Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
    for id in matrix_r1.index:
        for col in matrix_r1.columns:
            #Get the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1 = pd.read_csv(f'./Step8/{id}.fastq',header=None).value_counts()
            barcode_indel_frq2 = pd.read_csv(f'./Step8/{col}.fastq',header=None).value_counts()
            #Normalize the frequency of each indel and barcode combination for each sample using the total number of reads in Step5
            barcode_indel_frq1 = barcode_indel_frq1/len(pd.read_csv(f'./Step5/{id}.fastq',header=None).index)*1000000
            barcode_indel_frq2 = barcode_indel_frq2/len(pd.read_csv(f'./Step5/{col}.fastq',header=None).index)*1000000
            # Turn multiindex to index
            barcode_indel_frq1.index = barcode_indel_frq1.index.map(lambda x: x[0])
            barcode_indel_frq2.index = barcode_indel_frq2.index.map(lambda x: x[0])
            #Filter the frequency of each indel and barcode combination for each sample
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2.loc[barcode_indel_frq1.index.intersection(barcode_indel_frq2.index)]
            only_frq1 = barcode_indel_frq1_filtered_r1.index.difference(barcode_indel_frq2_filtered_r1.index)
            new_index = pd.Index([index for index in barcode_indel_frq2_filtered_r1.index] + [index for index in only_frq1])
            #Add 0 to the missing values using only_frq1 
            barcode_indel_frq2_filtered_r1 = pd.Series(np.append(barcode_indel_frq2_filtered_r1.values,np.zeros(len(only_frq1))),index=new_index)
            #Sort the index for frq1 by frequency values
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Make the order of the index for frq2 the same as frq1
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.reindex(barcode_indel_frq1_filtered_r1.index)
            # Filter based on percent
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1[0:math.floor(len(barcode_indel_frq1_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1[0:math.floor(len(barcode_indel_frq2_filtered_r1)*percent*0.01)]
            barcode_indel_frq2_filtered_r1 = barcode_indel_frq2_filtered_r1.sort_values(ascending=False)
            barcode_indel_frq1_filtered_r1 = barcode_indel_frq1_filtered_r1.sort_values(ascending=False)
            #Calculate the weighted tau
            matrix_r1.loc[id,col] = rbo(barcode_indel_frq1_filtered_r1.index,barcode_indel_frq2_filtered_r1.index)
    frames.append(matrix_r1)

In [68]:
frames[0]

Unnamed: 0,C1-10_R1_001,C1-1F_R1_001,C1-1Re_R1_001,C1-1_R1_001,C1-5_R1_001,C2-10_R1_001,C2-1_R1_001,C2-5_R1_001,D1-10_R1_001,D1-1_R1_001,D1-5_R1_001
C1-10_R1_001,1.0,0.512796,0.513822,0.517843,0.433245,0.399854,0.494464,0.466412,0.483566,0.438293,0.462574
C1-1F_R1_001,0.471879,1.0,0.588837,0.640916,0.532742,0.580843,0.58431,0.480106,0.513868,0.570107,0.564045
C1-1Re_R1_001,0.675959,0.588994,1.0,0.916047,0.379262,0.620069,0.693025,0.57123,0.637834,0.648632,0.718184
C1-1_R1_001,0.608525,0.6353,0.915409,1.0,0.486458,0.660267,0.679877,0.581656,0.632697,0.608701,0.694824
C1-5_R1_001,0.343767,0.423246,0.43993,0.435862,1.0,0.340338,0.406673,0.365482,0.443717,0.411912,0.393134
C2-10_R1_001,0.547796,0.645823,0.715519,0.76213,0.675871,1.0,0.634184,0.63686,0.640755,0.587051,0.607946
C2-1_R1_001,0.592425,0.556473,0.72625,0.705361,0.41204,0.523114,1.0,0.587972,0.691683,0.617754,0.626164
C2-5_R1_001,0.538514,0.428056,0.545324,0.535432,0.553132,0.559898,0.572447,1.0,0.559492,0.407146,0.546176
D1-10_R1_001,0.596673,0.534211,0.692437,0.690117,0.491553,0.566602,0.69537,0.620128,1.0,0.485058,0.643105
D1-1_R1_001,0.574961,0.582565,0.644544,0.586843,0.476607,0.484814,0.589219,0.501517,0.475093,1.0,0.588107


### Comparing difference in Normalized Matrix

In [41]:
import math
# Get all file names inNormalized_Matrix 
samples = [f[:f.find('.fastq')] for f in listdir('./Normalized_Matrix/') if isfile(join('./Normalized_Matrix/', f)) and f.find("R1") != -1]
samples.sort()
frames = []
names = []

# Make matrix of 0's using the index of barcode_indel_frq1_filtered_r1
matrix_r1 = pd.DataFrame(0, index=samples, columns=samples)
#Iterate through the matrix and calculate the BC distance. Use the indices in the outer loop.
for id in matrix_r1.index:
    for col in matrix_r1.columns:
        norm_matrix1 = pd.read_csv(f'./Normalized_Matrix/{id}.fastq.csv',header=None)
        norm_matrix2 = pd.read_csv(f'./Normalized_Matrix/{col}.fastq.csv',header=None)
        # Get rid of column and row names
        norm_matrix1 = norm_matrix1.iloc[1:,1:]
        norm_matrix2 = norm_matrix2.iloc[1:,1:]
        # Get top 200 of each matrix
        norm_matrix1 = norm_matrix1.iloc[0:200,0:200]
        norm_matrix2 = norm_matrix2.iloc[0:200,0:200]
        # Turn to float
        norm_matrix1 = norm_matrix1.astype(float)
        norm_matrix2 = norm_matrix2.astype(float)
        # Difference of the two matrices
        norm_matrix_diff = abs(norm_matrix1 - norm_matrix2)
        # Turn the elements that are 10 or less to 0
        norm_matrix_diff[norm_matrix_diff <= 10] = 0
        # Take sum of each element in the matrix
        norm_matrix_diff = norm_matrix_diff.sum().sum()
        matrix_r1.loc[id,col] = norm_matrix_diff
frames.append(matrix_r1)

In [42]:
frames[0]

Unnamed: 0,F0-FT-repeat_R1_001,F0-FT_R1_001,F0-Pe-repeat_R1_001,F0-Pe_R1_001,G0-FT-repeat_R1_001,G0-FT_R1_001,G0-Pe-repeat_R1_001,G0-Pe_R1_001,H0-FT-repeat_R1_001,H0-FT_R1_001,H0-Pe-repeat_R1_001,H0-Pe_R1_001
F0-FT-repeat_R1_001,0.0,30027.486604,30631.226023,31413.64939,45194.6047,44948.527509,47773.591232,47741.721435,42327.393744,40921.435883,42813.146887,40557.663798
F0-FT_R1_001,30027.486604,0.0,29672.212872,31946.14108,44449.381526,44543.962654,49336.344899,47532.86082,41357.875087,40825.869017,42322.330915,41513.724092
F0-Pe-repeat_R1_001,30631.226023,29672.212872,0.0,25622.482655,42108.345378,42512.88022,47746.195456,44390.972213,38905.588057,38619.337019,39291.908374,39397.777216
F0-Pe_R1_001,31413.64939,31946.14108,25622.482655,0.0,43198.938831,44006.36361,48468.631904,47448.686124,39020.659687,38196.956329,40759.071008,38663.985743
G0-FT-repeat_R1_001,45194.6047,44449.381526,42108.345378,43198.938831,0.0,54077.315154,63425.48792,61419.312035,54333.608578,54717.793241,55402.37266,53181.841974
G0-FT_R1_001,44948.527509,44543.962654,42512.88022,44006.36361,54077.315154,0.0,62754.892328,61322.766969,52831.247188,53182.972469,53781.367473,52834.094671
G0-Pe-repeat_R1_001,47773.591232,49336.344899,47746.195456,48468.631904,63425.48792,62754.892328,0.0,63489.568879,56762.687047,56834.516821,57827.1879,56365.084332
G0-Pe_R1_001,47741.721435,47532.86082,44390.972213,47448.686124,61419.312035,61322.766969,63489.568879,0.0,53843.067429,53941.81039,58713.11674,55275.786092
H0-FT-repeat_R1_001,42327.393744,41357.875087,38905.588057,39020.659687,54333.608578,52831.247188,56762.687047,53843.067429,0.0,39118.003908,45774.379691,45423.126913
H0-FT_R1_001,40921.435883,40825.869017,38619.337019,38196.956329,54717.793241,53182.972469,56834.516821,53941.81039,39118.003908,0.0,45094.052249,45029.354129


In [None]:
# https://github.com/zauri/clustering look at this for clustering