# Some ideas for the next round of analysis:
>- process.sh currently requires 64bp of homology, which might be too strict; we could use half of that or less
>- suggestion from polo: match on only the first and last 8bp of the 64bp

# For generating the matrix count, directly count the BC and output a csv in the following format

||ref_DNA1|ref_RNA1|m5_DNA1|m5_RNA1|...|
|:-:|:-:|:-:|:-:|:-:|:-:|
|STR1|count|count|count|count|...|
|...|...|...|...|...|...|

# Import Packages and Set Up

In [25]:
import scipy.stats as stats
import pandas as pd
import numpy as np
import copy

# Root directory of the sequencing run; loadData() reads from its
# "Processed/" subfolder.
q_dir = "/storage/q5gong/MPRA-Susan/Next-seq/2022-06-07/"

# Sample stems in group order:
#   1: 6h_6_well, 2: ON_12_well_non_PDL, 3: ON_12_well, 4: ON_6_well
_cDNA_stems = (
    "cDNA_6h_6_well_S19",
    "cDNA_ON_12_well_non_PDL_S15",
    "cDNA_ON_12_well_S14",
    "cDNA_ON_6_well_S13",
)
_gDNA_stems = (
    "gDNA_6h_6_well_S20",
    "gDNA_ON_12_well_non_PDL_S18",
    "gDNA_ON_12_well_S17",
    "gDNA_ON_6_well_S16",
)


def _proc_files(stems, variant=""):
    """Return the processed-read file name for every sample stem.

    ``variant`` is the suffix that follows "-proc" in the file name
    ("" for the default run, "-40", "-20", "-10" or "-5" otherwise).
    """
    return [f"{stem}_R1_001-proc{variant}.tsv" for stem in stems]


# data (default processing)
cDNA = _proc_files(_cDNA_stems)
gDNA = _proc_files(_gDNA_stems)

# data2 - "proc-40" files
# NOTE(review): the number presumably is the match length used in process.sh - confirm
cDNA_40 = _proc_files(_cDNA_stems, "-40")
gDNA_40 = _proc_files(_gDNA_stems, "-40")

# data3 - "proc-20" files
cDNA_20 = _proc_files(_cDNA_stems, "-20")
gDNA_20 = _proc_files(_gDNA_stems, "-20")

# data4 - "proc-10" files
cDNA_10 = _proc_files(_cDNA_stems, "-10")
gDNA_10 = _proc_files(_gDNA_stems, "-10")

# data5 - "proc-5" files
cDNA_5 = _proc_files(_cDNA_stems, "-5")
gDNA_5 = _proc_files(_gDNA_stems, "-5")

# Column layout shared by the merged tables: identifiers first, then the
# gDNA/cDNA count pair of each group 1-4.
col_order = ["barcode", "STR"] + [
    f"{kind}{group}" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Helper Functions

In [2]:
def loadData(data):
    """Read one processed-read table and derive the barcode of every read.

    Parameters
    ----------
    data : str
        File name inside ``q_dir + "Processed/"``. The file is parsed as
        tab-separated text without a header row.

    Returns
    -------
    pandas.DataFrame
        Columns "read_id" and "sequence" as read from the file, plus
        "barcode", which holds the first 20 characters of "sequence".
    """
    reads = pd.read_csv(f"{q_dir}Processed/{data}", sep="\t", header=None)
    # The assignment raises if the file does not have exactly two columns.
    reads.columns = ["read_id", "sequence"]
    reads["barcode"] = reads["sequence"].str.slice(0, 20)

    return reads

In [3]:
def bc_count(seq_type, df):
    """Count how often each barcode occurs in ``df``.

    Parameters
    ----------
    seq_type : str
        Name given to the count column (e.g. "gDNA1").
    df : pandas.DataFrame
        Table with a "barcode" column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct barcode with columns "barcode" and
        ``seq_type``, in the order produced by ``value_counts``.
    """
    occurrences = df["barcode"].value_counts()
    table = occurrences.to_frame().reset_index()
    # Column names coming out of reset_index vary, so set them explicitly.
    table.columns = ["barcode", seq_type]

    return table

In [4]:
def count_filter(in_df, group_num, threshold):
    """Keep the rows whose cDNA and gDNA counts both reach ``threshold``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Table holding the columns "cDNA<group_num>" and "gDNA<group_num>".
        It is not modified.
    group_num : int
        Group number used to build the two column names.
    threshold : int
        Minimum count (inclusive) required in both columns.

    Returns
    -------
    pandas.DataFrame
        The qualifying rows with their original index labels. Rows with a
        missing count fail the comparison and are dropped.
    """
    table = copy.deepcopy(in_df)

    cdna_col = f"cDNA{group_num}"
    gdna_col = f"gDNA{group_num}"

    enough_counts = (table[cdna_col] >= threshold) & (table[gdna_col] >= threshold)

    return table[enough_counts]

In [5]:
def STR_split(in_df):
    """Split the "STR" column into an STR name and a trailing type tag.

    Every "STR" value is split once at its last underscore: the part
    before it replaces "STR", the part after it goes into a new "type"
    column.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Table with a string column "STR". It is not modified.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``in_df`` with "STR" rewritten and "type" added.
        A value without an underscore keeps its full text in "STR" and
        gets a missing "type".
    """
    df = copy.deepcopy(in_df)

    # ``n`` has to be a keyword: pandas >= 2.0 rejects it as a positional
    # argument. The reindex guarantees that both result columns exist even
    # when the table is empty or no value contains an underscore.
    parts = df["STR"].str.rsplit("_", n=1, expand=True).reindex(columns=[0, 1])
    df["STR"] = parts[0]
    df["type"] = parts[1]

    return df

def STR_dfs(group):
    """Split one group table into its ref, m5, p5 and p3 sub-tables.

    Rows are selected by whether their "STR" value contains "ref", "_m5",
    "_p5" or "_p3" (``str.contains``, so the pattern may match anywhere in
    the value). Each selection is then passed through ``STR_split``.

    Parameters
    ----------
    group : pandas.DataFrame
        Table with a string column "STR".

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ref, minus, plus5, plus3)`` in that order.
    """
    str_names = group["STR"]
    ref, minus, plus5, plus3 = (
        STR_split(group[str_names.str.contains(pattern)])
        for pattern in ("ref", "_m5", "_p5", "_p3")
    )

    return ref, minus, plus5, plus3

# Associate DNA(gDNA) and RNA(cDNA)

In [6]:
# For every group, pair the gDNA barcode counts with the cDNA barcode counts.
# The left merge keeps every gDNA barcode; a barcode without cDNA reads gets
# a missing cDNA count.
index = 0
# group 1 - 6h_6_well
group_1 = bc_count("gDNA1", loadData(gDNA[index])).merge(
    bc_count("cDNA1", loadData(cDNA[index])), how="left", on="barcode"
)

index += 1
# group 2 - ON_12_well_non_PDL
group_2 = bc_count("gDNA2", loadData(gDNA[index])).merge(
    bc_count("cDNA2", loadData(cDNA[index])), how="left", on="barcode"
)

index += 1
# group 3 - ON_12_well
group_3 = bc_count("gDNA3", loadData(gDNA[index])).merge(
    bc_count("cDNA3", loadData(cDNA[index])), how="left", on="barcode"
)

index += 1
# group 4 - ON_6_well
group_4 = bc_count("gDNA4", loadData(gDNA[index])).merge(
    bc_count("cDNA4", loadData(cDNA[index])), how="left", on="barcode"
)

# Filter out barcodes with fewer than 10 counts

In [7]:
# Keep only barcodes with at least 10 counts in both gDNA and cDNA.

# group 1 - 6h_6_well
group_1 = count_filter(group_1, group_num=1, threshold=10)

# group 2 - ON_12_well_non_PDL
group_2 = count_filter(group_2, group_num=2, threshold=10)

# group 3 - ON_12_well
group_3 = count_filter(group_3, group_num=3, threshold=10)

# group 4 - ON_6_well
group_4 = count_filter(group_4, group_num=4, threshold=10)

# Associate Barcode with STR 
>- drop the barcodes that do not have a matching STR

In [8]:
# STR-BC association: maps barcode (first tab-separated field of each line)
# to STR (second field). A barcode that occurs more than once keeps the STR
# of its last occurrence.
association = {}

# The context manager closes the file on every exit path (end of file,
# blank line, or an exception such as a line with fewer than two fields).
with open("/storage/q5gong/MPRA-Susan/lz0504/association.tsv", "r") as file:
    for line in file:
        record = line.rstrip()
        if not record:
            # Reading stops at the first blank line.
            break

        info = record.split("\t")
        barcode = info[0]
        STR = info[1]

        association[barcode] = STR

In [9]:
# Attach the STR of every barcode as a new "STR" column. dropna() then
# removes every row with a missing value, which includes barcodes that are
# not present in `association`.

# group 1 - 6h_6_well
group_1 = group_1.assign(STR=group_1["barcode"].map(association)).dropna()

# group 2 - ON_12_well_non_PDL
group_2 = group_2.assign(STR=group_2["barcode"].map(association)).dropna()

# group 3 - ON_12_well
group_3 = group_3.assign(STR=group_3["barcode"].map(association)).dropna()

# group 4 - ON_6_well
group_4 = group_4.assign(STR=group_4["barcode"].map(association)).dropna()

# Separate into STR name + type (ref/m/p)

In [10]:
# Split every group into its ref / m5 / p5 / p3 tables (in that order).

# group 1 - 6h_6_well
(ref_1, m5_1, p5_1, p3_1) = STR_dfs(group=group_1)

# group 2 - ON_12_well_non_PDL
(ref_2, m5_2, p5_2, p3_2) = STR_dfs(group=group_2)

# group 3 - ON_12_well
(ref_3, m5_3, p5_3, p3_3) = STR_dfs(group=group_3)

# group 4 - ON_6_well
(ref_4, m5_4, p5_4, p3_4) = STR_dfs(group=group_4)

__Group 2 has the most barcodes associated with STRs, so merge each type onto group 2 by barcode, and drop the nulls__

In [20]:
def multi_merge_on_bc(*args, base_group):
    """Outer-merge several tables onto ``base_group`` by barcode and STR.

    Parameters
    ----------
    *args : pandas.DataFrame
        Tables to merge in, one after another, in the order given. Each
        needs the columns "barcode" and "STR".
    base_group : pandas.DataFrame
        Starting table (keyword-only). It is not modified.

    Returns
    -------
    pandas.DataFrame
        The accumulated outer merge on ["barcode", "STR"]. Because the
        merge is an outer join, a barcode/STR pair that is absent from one
        table has missing values in that table's columns. With no ``args``
        a deep copy of ``base_group`` is returned.
    """
    merged = copy.deepcopy(base_group)
    for other in args:
        merged = merged.merge(other, how="outer", on=["barcode", "STR"])

    return merged

In [21]:
# Outer-merge the ref tables of groups 1, 3 and 4 onto group 2, arrange the
# columns as in col_order, and tag every count column with "_ref".
refs = multi_merge_on_bc(
    ref_1[["barcode", "STR", "gDNA1", "cDNA1"]],
    ref_3[["barcode", "STR", "gDNA3", "cDNA3"]],
    ref_4[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=ref_2,
)[col_order]
refs.columns = ["barcode", "STR"] + [
    f"{kind}{group}_ref" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(refs)
display(refs.isna().sum())
display(refs["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_ref,cDNA1_ref,gDNA2_ref,cDNA2_ref,gDNA3_ref,cDNA3_ref,gDNA4_ref,cDNA4_ref
0,ATCTGAAGCTTTTAGCATGG,Human_STR_1482658,273.0,152.0,7533.0,4147.0,6744.0,6977.0,7863.0,8597.0
1,CTATTGACTCCTTTTATCTC,Human_STR_99870,155.0,93.0,5423.0,2352.0,5446.0,183.0,5611.0,4543.0
2,CAATTCGCCTAACCTTTTCA,Human_STR_207538,,,4104.0,401.0,,,4104.0,1650.0
3,CTTACATGCTCAGAACAGGT,Human_STR_690192,104.0,69.0,3537.0,557.0,,,3474.0,516.0
4,CAAACATTCCCGTTTGTTCC,Human_STR_7555,85.0,298.0,3493.0,1498.0,2532.0,688.0,3655.0,2363.0
...,...,...,...,...,...,...,...,...,...,...
1432,TATTTTGATTTGGTCAAACC,Human_STR_43049,,,,,,,46.0,687.0
1433,TTGTTTTGCTTAGCGGTGAC,Human_STR_664415,,,,,,,41.0,1137.0
1434,TTGTTTACGGATCCCCTAAA,Human_STR_1467202,,,,,,,35.0,101.0
1435,TCTTCTCTCCAGGTGTTAAT,Human_STR_352608,,,,,,,26.0,708.0


barcode        0
STR            0
gDNA1_ref    898
cDNA1_ref    898
gDNA2_ref    306
cDNA2_ref    306
gDNA3_ref    760
cDNA3_ref    760
gDNA4_ref    562
cDNA4_ref    562
dtype: int64

False

In [22]:
# Outer-merge the m5 tables of groups 1, 3 and 4 onto group 2, arrange the
# columns as in col_order, and tag every count column with "_m5".
minus5s = multi_merge_on_bc(
    m5_1[["barcode", "STR", "gDNA1", "cDNA1"]],
    m5_3[["barcode", "STR", "gDNA3", "cDNA3"]],
    m5_4[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=m5_2,
)[col_order]
minus5s.columns = ["barcode", "STR"] + [
    f"{kind}{group}_m5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(minus5s)
display(minus5s.isna().sum())
display(minus5s["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_m5,cDNA1_m5,gDNA2_m5,cDNA2_m5,gDNA3_m5,cDNA3_m5,gDNA4_m5,cDNA4_m5
0,CTAATAACTGCATTTGTATT,Human_STR_856874,119.0,74.0,4715.0,732.0,4418.0,800.0,4754.0,550.0
1,TAAGATTTTTATGTACATCC,Human_STR_1367136,222.0,167.0,4475.0,334.0,4456.0,491.0,5070.0,289.0
2,CACATGAATCCTCCGTTTTC,Human_STR_1182272,89.0,63.0,3999.0,1154.0,4344.0,940.0,,
3,GCGCATACTGCAATTTGTTC,Human_STR_635874,,,3942.0,1027.0,,,3764.0,2046.0
4,CTATTCCCTATCCATCGATC,Human_STR_1039074,65.0,80.0,3375.0,1898.0,2771.0,2106.0,3121.0,4048.0
...,...,...,...,...,...,...,...,...,...,...
1577,GTCGTTACTGTTTATAGGCT,Human_STR_338524,,,,,,,47.0,1612.0
1578,TTCTCCTGTTTATTGTCCCT,Human_STR_1407401,,,,,,,43.0,117.0
1579,AAGCACTTATATTTCGAGTT,Human_STR_1382615,,,,,,,42.0,21.0
1580,CGCAACGAGTATCGAGCACT,Human_STR_493979,,,,,,,18.0,22.0


barcode       0
STR           0
gDNA1_m5    963
cDNA1_m5    963
gDNA2_m5    368
cDNA2_m5    368
gDNA3_m5    864
cDNA3_m5    864
gDNA4_m5    665
cDNA4_m5    665
dtype: int64

False

In [23]:
# Outer-merge the p5 tables of groups 1, 3 and 4 onto group 2, arrange the
# columns as in col_order, and tag every count column with "_p5".
plus5s = multi_merge_on_bc(
    p5_1[["barcode", "STR", "gDNA1", "cDNA1"]],
    p5_3[["barcode", "STR", "gDNA3", "cDNA3"]],
    p5_4[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p5_2,
)[col_order]
plus5s.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus5s)
display(plus5s.isna().sum())
display(plus5s["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p5,cDNA1_p5,gDNA2_p5,cDNA2_p5,gDNA3_p5,cDNA3_p5,gDNA4_p5,cDNA4_p5
0,ACAAATGCAACCCGATCTTG,Human_STR_1186275,154.0,241.0,6429.0,2323.0,5711.0,6636.0,6823.0,6854.0
1,CCACACTATCAAGTATTGCG,Human_STR_1187476,145.0,200.0,5416.0,2949.0,4365.0,3264.0,4206.0,5824.0
2,ACGGTCTTAGCATATCCATC,Human_STR_556341,177.0,217.0,5405.0,1667.0,6169.0,2586.0,6683.0,10644.0
3,AATCCTGGCCTTAGTTTATC,Human_STR_234793,133.0,199.0,4411.0,1778.0,4031.0,549.0,4128.0,554.0
4,AACTATTTCATCTTTTCTTG,Human_STR_688386,122.0,51.0,4363.0,1289.0,3223.0,813.0,3816.0,3679.0
...,...,...,...,...,...,...,...,...,...,...
1638,AATGCCTTCTGTTGCATCCT,Human_STR_237448,,,,,,,24.0,618.0
1639,TATATCAGATCCAAGTGTCG,Human_STR_715963,,,,,,,23.0,200.0
1640,AAATGTGGGGTTAGGTATTT,Human_STR_544744,,,,,,,14.0,428.0
1641,ATCTGATAGCTACAGTTCTG,Human_STR_474423,,,,,,,13.0,484.0


barcode       0
STR           0
gDNA1_p5    965
cDNA1_p5    965
gDNA2_p5    325
cDNA2_p5    325
gDNA3_p5    879
cDNA3_p5    879
gDNA4_p5    624
cDNA4_p5    624
dtype: int64

False

In [24]:
# Outer-merge the p3 tables of groups 1, 3 and 4 onto group 2, arrange the
# columns as in col_order, and tag every count column with "_p3".
plus3s = multi_merge_on_bc(
    p3_1[["barcode", "STR", "gDNA1", "cDNA1"]],
    p3_3[["barcode", "STR", "gDNA3", "cDNA3"]],
    p3_4[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p3_2,
)[col_order]
plus3s.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p3" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus3s)
display(plus3s.isna().sum())
display(plus3s["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p3,cDNA1_p3,gDNA2_p3,cDNA2_p3,gDNA3_p3,cDNA3_p3,gDNA4_p3,cDNA4_p3
0,CCTCAACGACCTTTCTCTGG,Human_STR_399685,,,3895.0,916.0,3160.0,880.0,3842.0,4028.0
1,AACATTTTGCACTACTTTTG,Human_STR_691804,127.0,153.0,3809.0,100.0,,,3838.0,7076.0
2,CCGCTCGCACGGCTTCGGTG,Human_STR_1526069,65.0,254.0,2905.0,7539.0,3002.0,2782.0,2928.0,4984.0
3,TCATATACTTCTGTCTTGTT,Human_STR_1423138,82.0,650.0,2862.0,19605.0,2395.0,27622.0,3114.0,19510.0
4,ACGGATGTGCTGGCCCTTCC,Human_STR_1180537,86.0,114.0,2470.0,507.0,2160.0,1851.0,2446.0,1314.0
...,...,...,...,...,...,...,...,...,...,...
317,AGTATTTTCGACGTCTCAAT,Human_STR_65622,,,,,,,277.0,408.0
318,TAATGATTTGAGGTTCTATC,Human_STR_55694,,,,,,,239.0,135.0
319,TCTTCATTATGGGCTCCCTT,Human_STR_663357,,,,,,,154.0,1510.0
320,TTCTTTATCTATTTGAACTA,Human_STR_1159196,,,,,,,113.0,170.0


barcode       0
STR           0
gDNA1_p3    189
cDNA1_p3    189
gDNA2_p3     64
cDNA2_p3     64
gDNA3_p3    146
cDNA3_p3    146
gDNA4_p3    122
cDNA4_p3    122
dtype: int64

False

# Repeat same process above for proc-40, proc-20, proc-10, and proc-5

## proc-40

In [26]:
# --- proc-40: pair gDNA and cDNA barcode counts per group ---
# The left merge keeps every gDNA barcode; a barcode without cDNA reads gets
# a missing cDNA count.
index = 0
# group 1 - 6h_6_well
group_1_40 = bc_count("gDNA1", loadData(gDNA_40[index])).merge(
    bc_count("cDNA1", loadData(cDNA_40[index])), how="left", on="barcode"
)

index += 1
# group 2 - ON_12_well_non_PDL
group_2_40 = bc_count("gDNA2", loadData(gDNA_40[index])).merge(
    bc_count("cDNA2", loadData(cDNA_40[index])), how="left", on="barcode"
)

index += 1
# group 3 - ON_12_well
group_3_40 = bc_count("gDNA3", loadData(gDNA_40[index])).merge(
    bc_count("cDNA3", loadData(cDNA_40[index])), how="left", on="barcode"
)

index += 1
# group 4 - ON_6_well
group_4_40 = bc_count("gDNA4", loadData(gDNA_40[index])).merge(
    bc_count("cDNA4", loadData(cDNA_40[index])), how="left", on="barcode"
)


# --- keep only barcodes with at least 10 counts in both gDNA and cDNA ---
# group 1 - 6h_6_well
group_1_40 = count_filter(group_1_40, group_num=1, threshold=10)

# group 2 - ON_12_well_non_PDL
group_2_40 = count_filter(group_2_40, group_num=2, threshold=10)

# group 3 - ON_12_well
group_3_40 = count_filter(group_3_40, group_num=3, threshold=10)

# group 4 - ON_6_well
group_4_40 = count_filter(group_4_40, group_num=4, threshold=10)


# --- attach the STR of every barcode ---
# dropna() removes every row with a missing value, which includes barcodes
# that are not present in `association`.
# group 1 - 6h_6_well
group_1_40 = group_1_40.assign(STR=group_1_40["barcode"].map(association)).dropna()

# group 2 - ON_12_well_non_PDL
group_2_40 = group_2_40.assign(STR=group_2_40["barcode"].map(association)).dropna()

# group 3 - ON_12_well
group_3_40 = group_3_40.assign(STR=group_3_40["barcode"].map(association)).dropna()

# group 4 - ON_6_well
group_4_40 = group_4_40.assign(STR=group_4_40["barcode"].map(association)).dropna()


# --- split every group into ref / m5 / p5 / p3 tables ---
# group 1 - 6h_6_well
(ref_1_40, m5_1_40, p5_1_40, p3_1_40) = STR_dfs(group=group_1_40)

# group 2 - ON_12_well_non_PDL
(ref_2_40, m5_2_40, p5_2_40, p3_2_40) = STR_dfs(group=group_2_40)

# group 3 - ON_12_well
(ref_3_40, m5_3_40, p5_3_40, p3_3_40) = STR_dfs(group=group_3_40)

# group 4 - ON_6_well
(ref_4_40, m5_4_40, p5_4_40, p3_4_40) = STR_dfs(group=group_4_40)

In [27]:
# proc-40: outer-merge the ref tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_ref".
refs_40 = multi_merge_on_bc(
    ref_1_40[["barcode", "STR", "gDNA1", "cDNA1"]],
    ref_3_40[["barcode", "STR", "gDNA3", "cDNA3"]],
    ref_4_40[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=ref_2_40,
)[col_order]
refs_40.columns = ["barcode", "STR"] + [
    f"{kind}{group}_ref" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(refs_40)
display(refs_40.isna().sum())
display(refs_40["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_ref,cDNA1_ref,gDNA2_ref,cDNA2_ref,gDNA3_ref,cDNA3_ref,gDNA4_ref,cDNA4_ref
0,ATCTGAAGCTTTTAGCATGG,Human_STR_1482658,285.0,159.0,7905.0,4401.0,7113.0,7372.0,8290.0,9021.0
1,CTATTGACTCCTTTTATCTC,Human_STR_99870,164.0,95.0,5776.0,2525.0,5828.0,193.0,5922.0,4817.0
2,CAATTCGCCTAACCTTTTCA,Human_STR_207538,,,4369.0,426.0,,,4395.0,1740.0
3,CTTACATGCTCAGAACAGGT,Human_STR_690192,106.0,73.0,3729.0,595.0,,,3639.0,541.0
4,CAAACATTCCCGTTTGTTCC,Human_STR_7555,92.0,310.0,3686.0,1574.0,2663.0,726.0,3835.0,2487.0
...,...,...,...,...,...,...,...,...,...,...
1437,TTGTTTTGCTTAGCGGTGAC,Human_STR_664415,,,,,,,49.0,1189.0
1438,TATTTTGATTTGGTCAAACC,Human_STR_43049,,,,,,,48.0,709.0
1439,TTGTTTACGGATCCCCTAAA,Human_STR_1467202,,,,,,,39.0,105.0
1440,TCTTCTCTCCAGGTGTTAAT,Human_STR_352608,,,,,,,26.0,733.0


barcode        0
STR            0
gDNA1_ref    891
cDNA1_ref    891
gDNA2_ref    304
cDNA2_ref    304
gDNA3_ref    762
cDNA3_ref    762
gDNA4_ref    562
cDNA4_ref    562
dtype: int64

False

In [28]:
# proc-40: outer-merge the m5 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_m5".
minus5s_40 = multi_merge_on_bc(
    m5_1_40[["barcode", "STR", "gDNA1", "cDNA1"]],
    m5_3_40[["barcode", "STR", "gDNA3", "cDNA3"]],
    m5_4_40[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=m5_2_40,
)[col_order]
minus5s_40.columns = ["barcode", "STR"] + [
    f"{kind}{group}_m5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(minus5s_40)
display(minus5s_40.isna().sum())
display(minus5s_40["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_m5,cDNA1_m5,gDNA2_m5,cDNA2_m5,gDNA3_m5,cDNA3_m5,gDNA4_m5,cDNA4_m5
0,CTAATAACTGCATTTGTATT,Human_STR_856874,123.0,80.0,4920.0,770.0,4606.0,842.0,4932.0,572.0
1,TAAGATTTTTATGTACATCC,Human_STR_1367136,231.0,177.0,4692.0,348.0,4665.0,522.0,5343.0,306.0
2,CACATGAATCCTCCGTTTTC,Human_STR_1182272,94.0,66.0,4274.0,1227.0,4657.0,993.0,,
3,GCGCATACTGCAATTTGTTC,Human_STR_635874,,,4136.0,1073.0,,,3933.0,2136.0
4,CTATTCCCTATCCATCGATC,Human_STR_1039074,68.0,80.0,3570.0,2015.0,2943.0,2236.0,3298.0,4288.0
...,...,...,...,...,...,...,...,...,...,...
1588,TTCTCCTGTTTATTGTCCCT,Human_STR_1407401,,,,,,,45.0,119.0
1589,AAGCACTTATATTTCGAGTT,Human_STR_1382615,,,,,,,45.0,21.0
1590,CGCAACGAGTATCGAGCACT,Human_STR_493979,,,,,,,19.0,22.0
1591,TTATCGTGATTGTTGTCCGT,Human_STR_340393,,,,,,,16.0,18.0


barcode       0
STR           0
gDNA1_m5    957
cDNA1_m5    957
gDNA2_m5    367
cDNA2_m5    367
gDNA3_m5    872
cDNA3_m5    872
gDNA4_m5    672
cDNA4_m5    672
dtype: int64

False

In [29]:
# proc-40: outer-merge the p5 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_p5".
plus5s_40 = multi_merge_on_bc(
    p5_1_40[["barcode", "STR", "gDNA1", "cDNA1"]],
    p5_3_40[["barcode", "STR", "gDNA3", "cDNA3"]],
    p5_4_40[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p5_2_40,
)[col_order]
plus5s_40.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus5s_40)
display(plus5s_40.isna().sum())
display(plus5s_40["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p5,cDNA1_p5,gDNA2_p5,cDNA2_p5,gDNA3_p5,cDNA3_p5,gDNA4_p5,cDNA4_p5
0,ACAAATGCAACCCGATCTTG,Human_STR_1186275,168.0,249.0,6787.0,2463.0,6041.0,7018.0,7230.0,7252.0
1,CCACACTATCAAGTATTGCG,Human_STR_1187476,153.0,213.0,5687.0,3123.0,4543.0,3424.0,4416.0,6102.0
2,ACGGTCTTAGCATATCCATC,Human_STR_556341,184.0,224.0,5655.0,1746.0,6455.0,2697.0,6972.0,11129.0
3,AATCCTGGCCTTAGTTTATC,Human_STR_234793,140.0,208.0,4656.0,1888.0,4282.0,577.0,4366.0,589.0
4,AACTATTTCATCTTTTCTTG,Human_STR_688386,127.0,55.0,4640.0,1403.0,3441.0,872.0,4075.0,3931.0
...,...,...,...,...,...,...,...,...,...,...
1646,TTACTTTCTTGCTACGATCA,Human_STR_1054434,,,,,,,31.0,33.0
1647,AATGCCTTCTGTTGCATCCT,Human_STR_237448,,,,,,,28.0,648.0
1648,AAATGTGGGGTTAGGTATTT,Human_STR_544744,,,,,,,15.0,442.0
1649,GCCGGCCTACATTAATTAAT,Human_STR_851565,,,,,,,14.0,34.0


barcode       0
STR           0
gDNA1_p5    963
cDNA1_p5    963
gDNA2_p5    324
cDNA2_p5    324
gDNA3_p5    882
cDNA3_p5    882
gDNA4_p5    627
cDNA4_p5    627
dtype: int64

False

In [30]:
# proc-40: outer-merge the p3 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_p3".
plus3s_40 = multi_merge_on_bc(
    p3_1_40[["barcode", "STR", "gDNA1", "cDNA1"]],
    p3_3_40[["barcode", "STR", "gDNA3", "cDNA3"]],
    p3_4_40[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p3_2_40,
)[col_order]
plus3s_40.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p3" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus3s_40)
display(plus3s_40.isna().sum())
display(plus3s_40["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p3,cDNA1_p3,gDNA2_p3,cDNA2_p3,gDNA3_p3,cDNA3_p3,gDNA4_p3,cDNA4_p3
0,CCTCAACGACCTTTCTCTGG,Human_STR_399685,,,4127.0,980.0,3345.0,929.0,4054.0,4241.0
1,AACATTTTGCACTACTTTTG,Human_STR_691804,131.0,157.0,4002.0,108.0,,,3999.0,7400.0
2,CCGCTCGCACGGCTTCGGTG,Human_STR_1526069,70.0,268.0,3078.0,7969.0,3146.0,2923.0,3069.0,5217.0
3,TCATATACTTCTGTCTTGTT,Human_STR_1423138,87.0,670.0,3011.0,20690.0,2525.0,29069.0,3261.0,20540.0
4,ACGGATGTGCTGGCCCTTCC,Human_STR_1180537,91.0,126.0,2640.0,548.0,2297.0,1976.0,2617.0,1400.0
...,...,...,...,...,...,...,...,...,...,...
319,AGTATTTTCGACGTCTCAAT,Human_STR_65622,,,,,,,294.0,431.0
320,TAATGATTTGAGGTTCTATC,Human_STR_55694,,,,,,,253.0,142.0
321,TCTTCATTATGGGCTCCCTT,Human_STR_663357,,,,,,,165.0,1610.0
322,TTCTTTATCTATTTGAACTA,Human_STR_1159196,,,,,,,121.0,175.0


barcode       0
STR           0
gDNA1_p3    190
cDNA1_p3    190
gDNA2_p3     64
cDNA2_p3     64
gDNA3_p3    147
cDNA3_p3    147
gDNA4_p3    123
cDNA4_p3    123
dtype: int64

False

## proc-20

In [31]:
# --- proc-20: pair gDNA and cDNA barcode counts per group ---
# The left merge keeps every gDNA barcode; a barcode without cDNA reads gets
# a missing cDNA count.
index = 0
# group 1 - 6h_6_well
group_1_20 = bc_count("gDNA1", loadData(gDNA_20[index])).merge(
    bc_count("cDNA1", loadData(cDNA_20[index])), how="left", on="barcode"
)

index += 1
# group 2 - ON_12_well_non_PDL
group_2_20 = bc_count("gDNA2", loadData(gDNA_20[index])).merge(
    bc_count("cDNA2", loadData(cDNA_20[index])), how="left", on="barcode"
)

index += 1
# group 3 - ON_12_well
group_3_20 = bc_count("gDNA3", loadData(gDNA_20[index])).merge(
    bc_count("cDNA3", loadData(cDNA_20[index])), how="left", on="barcode"
)

index += 1
# group 4 - ON_6_well
group_4_20 = bc_count("gDNA4", loadData(gDNA_20[index])).merge(
    bc_count("cDNA4", loadData(cDNA_20[index])), how="left", on="barcode"
)


# --- keep only barcodes with at least 10 counts in both gDNA and cDNA ---
# group 1 - 6h_6_well
group_1_20 = count_filter(group_1_20, group_num=1, threshold=10)

# group 2 - ON_12_well_non_PDL
group_2_20 = count_filter(group_2_20, group_num=2, threshold=10)

# group 3 - ON_12_well
group_3_20 = count_filter(group_3_20, group_num=3, threshold=10)

# group 4 - ON_6_well
group_4_20 = count_filter(group_4_20, group_num=4, threshold=10)


# --- attach the STR of every barcode ---
# dropna() removes every row with a missing value, which includes barcodes
# that are not present in `association`.
# group 1 - 6h_6_well
group_1_20 = group_1_20.assign(STR=group_1_20["barcode"].map(association)).dropna()

# group 2 - ON_12_well_non_PDL
group_2_20 = group_2_20.assign(STR=group_2_20["barcode"].map(association)).dropna()

# group 3 - ON_12_well
group_3_20 = group_3_20.assign(STR=group_3_20["barcode"].map(association)).dropna()

# group 4 - ON_6_well
group_4_20 = group_4_20.assign(STR=group_4_20["barcode"].map(association)).dropna()


# --- split every group into ref / m5 / p5 / p3 tables ---
# group 1 - 6h_6_well
(ref_1_20, m5_1_20, p5_1_20, p3_1_20) = STR_dfs(group=group_1_20)

# group 2 - ON_12_well_non_PDL
(ref_2_20, m5_2_20, p5_2_20, p3_2_20) = STR_dfs(group=group_2_20)

# group 3 - ON_12_well
(ref_3_20, m5_3_20, p5_3_20, p3_3_20) = STR_dfs(group=group_3_20)

# group 4 - ON_6_well
(ref_4_20, m5_4_20, p5_4_20, p3_4_20) = STR_dfs(group=group_4_20)

In [32]:
# proc-20: outer-merge the ref tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_ref".
refs_20 = multi_merge_on_bc(
    ref_1_20[["barcode", "STR", "gDNA1", "cDNA1"]],
    ref_3_20[["barcode", "STR", "gDNA3", "cDNA3"]],
    ref_4_20[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=ref_2_20,
)[col_order]
refs_20.columns = ["barcode", "STR"] + [
    f"{kind}{group}_ref" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(refs_20)
display(refs_20.isna().sum())
display(refs_20["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_ref,cDNA1_ref,gDNA2_ref,cDNA2_ref,gDNA3_ref,cDNA3_ref,gDNA4_ref,cDNA4_ref
0,ATCTGAAGCTTTTAGCATGG,Human_STR_1482658,291.0,159.0,8225.0,4656.0,7445.0,7698.0,8664.0,9408.0
1,CTATTGACTCCTTTTATCTC,Human_STR_99870,172.0,141.0,6067.0,2666.0,6158.0,204.0,6198.0,5039.0
2,CAATTCGCCTAACCTTTTCA,Human_STR_207538,,,4582.0,452.0,,,4603.0,1849.0
3,CTTACATGCTCAGAACAGGT,Human_STR_690192,110.0,74.0,3903.0,620.0,,,3793.0,565.0
4,CAAACATTCCCGTTTGTTCC,Human_STR_7555,93.0,409.0,3846.0,1636.0,2768.0,754.0,3981.0,2600.0
...,...,...,...,...,...,...,...,...,...,...
1441,TTGTTTTGCTTAGCGGTGAC,Human_STR_664415,,,,,,,50.0,1233.0
1442,TATTTTGATTTGGTCAAACC,Human_STR_43049,,,,,,,48.0,736.0
1443,TTGTTTACGGATCCCCTAAA,Human_STR_1467202,,,,,,,40.0,109.0
1444,TCTTCTCTCCAGGTGTTAAT,Human_STR_352608,,,,,,,27.0,765.0


barcode        0
STR            0
gDNA1_ref    888
cDNA1_ref    888
gDNA2_ref    306
cDNA2_ref    306
gDNA3_ref    762
cDNA3_ref    762
gDNA4_ref    562
cDNA4_ref    562
dtype: int64

False

In [33]:
# proc-20: outer-merge the m5 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_m5".
minus5s_20 = multi_merge_on_bc(
    m5_1_20[["barcode", "STR", "gDNA1", "cDNA1"]],
    m5_3_20[["barcode", "STR", "gDNA3", "cDNA3"]],
    m5_4_20[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=m5_2_20,
)[col_order]
minus5s_20.columns = ["barcode", "STR"] + [
    f"{kind}{group}_m5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(minus5s_20)
display(minus5s_20.isna().sum())
display(minus5s_20["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_m5,cDNA1_m5,gDNA2_m5,cDNA2_m5,gDNA3_m5,cDNA3_m5,gDNA4_m5,cDNA4_m5
0,CTAATAACTGCATTTGTATT,Human_STR_856874,127.0,82.0,5079.0,791.0,4749.0,864.0,5079.0,591.0
1,TAAGATTTTTATGTACATCC,Human_STR_1367136,236.0,183.0,4895.0,365.0,4856.0,554.0,5606.0,315.0
2,CACATGAATCCTCCGTTTTC,Human_STR_1182272,102.0,69.0,4499.0,1309.0,4921.0,1056.0,,
3,GCGCATACTGCAATTTGTTC,Human_STR_635874,,,4291.0,1647.0,,,4068.0,2201.0
4,CTATTCCCTATCCATCGATC,Human_STR_1039074,75.0,85.0,3736.0,2123.0,3099.0,2349.0,3442.0,4475.0
...,...,...,...,...,...,...,...,...,...,...
1597,GTCGTTACTGTTTATAGGCT,Human_STR_338524,,,,,,,51.0,1720.0
1598,TTCTCCTGTTTATTGTCCCT,Human_STR_1407401,,,,,,,49.0,123.0
1599,AAGCACTTATATTTCGAGTT,Human_STR_1382615,,,,,,,46.0,23.0
1600,GGGAATTCATCCTACTAAGC,Human_STR_1226231,,,,,,,20.0,40.0


barcode       0
STR           0
gDNA1_m5    960
cDNA1_m5    960
gDNA2_m5    365
cDNA2_m5    365
gDNA3_m5    878
cDNA3_m5    878
gDNA4_m5    675
cDNA4_m5    675
dtype: int64

False

In [34]:
# proc-20: outer-merge the p5 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_p5".
plus5s_20 = multi_merge_on_bc(
    p5_1_20[["barcode", "STR", "gDNA1", "cDNA1"]],
    p5_3_20[["barcode", "STR", "gDNA3", "cDNA3"]],
    p5_4_20[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p5_2_20,
)[col_order]
plus5s_20.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p5" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus5s_20)
display(plus5s_20.isna().sum())
display(plus5s_20["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p5,cDNA1_p5,gDNA2_p5,cDNA2_p5,gDNA3_p5,cDNA3_p5,gDNA4_p5,cDNA4_p5
0,ACAAATGCAACCCGATCTTG,Human_STR_1186275,179.0,263.0,7141.0,2627.0,6346.0,7371.0,7578.0,7628.0
1,CCACACTATCAAGTATTGCG,Human_STR_1187476,156.0,224.0,5927.0,3275.0,4731.0,3565.0,4627.0,6335.0
2,ACGGTCTTAGCATATCCATC,Human_STR_556341,189.0,255.0,5880.0,1821.0,6705.0,2800.0,7192.0,11547.0
3,AACTATTTCATCTTTTCTTG,Human_STR_688386,133.0,58.0,4888.0,1485.0,3624.0,917.0,4273.0,4117.0
4,ATCATATTAGTAGCAACCTC,Human_STR_931406,145.0,177.0,4879.0,1536.0,3868.0,127.0,4954.0,581.0
...,...,...,...,...,...,...,...,...,...,...
1649,TTACTTTCTTGCTACGATCA,Human_STR_1054434,,,,,,,31.0,35.0
1650,AATGCCTTCTGTTGCATCCT,Human_STR_237448,,,,,,,29.0,679.0
1651,AAATGTGGGGTTAGGTATTT,Human_STR_544744,,,,,,,16.0,463.0
1652,ATCTGATAGCTACAGTTCTG,Human_STR_474423,,,,,,,16.0,548.0


barcode       0
STR           0
gDNA1_p5    960
cDNA1_p5    960
gDNA2_p5    323
cDNA2_p5    323
gDNA3_p5    884
cDNA3_p5    884
gDNA4_p5    625
cDNA4_p5    625
dtype: int64

False

In [35]:
# proc-20: outer-merge the p3 tables of groups 1, 3 and 4 onto group 2,
# arrange the columns as in col_order, and tag every count column with "_p3".
plus3s_20 = multi_merge_on_bc(
    p3_1_20[["barcode", "STR", "gDNA1", "cDNA1"]],
    p3_3_20[["barcode", "STR", "gDNA3", "cDNA3"]],
    p3_4_20[["barcode", "STR", "gDNA4", "cDNA4"]],
    base_group=p3_2_20,
)[col_order]
plus3s_20.columns = ["barcode", "STR"] + [
    f"{kind}{group}_p3" for group in (1, 2, 3, 4) for kind in ("gDNA", "cDNA")
]

# Show the table, the missing-value count per column, and whether every STR
# appears only once.
display(plus3s_20)
display(plus3s_20.isna().sum())
display(plus3s_20["STR"].is_unique)

Unnamed: 0,barcode,STR,gDNA1_p3,cDNA1_p3,gDNA2_p3,cDNA2_p3,gDNA3_p3,cDNA3_p3,gDNA4_p3,cDNA4_p3
0,CCTCAACGACCTTTCTCTGG,Human_STR_399685,,,4310.0,1031.0,3482.0,967.0,4214.0,4439.0
1,AACATTTTGCACTACTTTTG,Human_STR_691804,135.0,163.0,4151.0,113.0,,,4128.0,7647.0
2,CCGCTCGCACGGCTTCGGTG,Human_STR_1526069,73.0,290.0,3212.0,8399.0,3282.0,3050.0,3182.0,5433.0
3,TCATATACTTCTGTCTTGTT,Human_STR_1423138,89.0,721.0,3129.0,21629.0,2622.0,30199.0,3392.0,21365.0
4,ACGGATGTGCTGGCCCTTCC,Human_STR_1180537,94.0,128.0,2809.0,572.0,2414.0,2076.0,2736.0,1475.0
...,...,...,...,...,...,...,...,...,...,...
321,AGTATTTTCGACGTCTCAAT,Human_STR_65622,,,,,,,309.0,454.0
322,TAATGATTTGAGGTTCTATC,Human_STR_55694,,,,,,,266.0,150.0
323,TCTTCATTATGGGCTCCCTT,Human_STR_663357,,,,,,,172.0,1686.0
324,TTCTTTATCTATTTGAACTA,Human_STR_1159196,,,,,,,128.0,179.0


barcode       0
STR           0
gDNA1_p3    189
cDNA1_p3    189
gDNA2_p3     64
cDNA2_p3     64
gDNA3_p3    149
cDNA3_p3    149
gDNA4_p3    121
cDNA4_p3    121
dtype: int64

False

## proc-10

In [None]:
# --- proc-10: pair gDNA and cDNA barcode counts per group ---
# The left merge keeps every gDNA barcode; a barcode without cDNA reads gets
# a missing cDNA count.
index = 0
# group 1 - 6h_6_well
group_1_10 = bc_count("gDNA1", loadData(gDNA_10[index])).merge(
    bc_count("cDNA1", loadData(cDNA_10[index])), how="left", on="barcode"
)

index += 1
# group 2 - ON_12_well_non_PDL
group_2_10 = bc_count("gDNA2", loadData(gDNA_10[index])).merge(
    bc_count("cDNA2", loadData(cDNA_10[index])), how="left", on="barcode"
)

index += 1
# group 3 - ON_12_well
group_3_10 = bc_count("gDNA3", loadData(gDNA_10[index])).merge(
    bc_count("cDNA3", loadData(cDNA_10[index])), how="left", on="barcode"
)

index += 1
# group 4 - ON_6_well
group_4_10 = bc_count("gDNA4", loadData(gDNA_10[index])).merge(
    bc_count("cDNA4", loadData(cDNA_10[index])), how="left", on="barcode"
)


# --- keep only barcodes with at least 10 counts in both gDNA and cDNA ---
# group 1 - 6h_6_well
group_1_10 = count_filter(group_1_10, group_num=1, threshold=10)

# group 2 - ON_12_well_non_PDL
group_2_10 = count_filter(group_2_10, group_num=2, threshold=10)

# group 3 - ON_12_well
group_3_10 = count_filter(group_3_10, group_num=3, threshold=10)

# group 4 - ON_6_well
group_4_10 = count_filter(group_4_10, group_num=4, threshold=10)


# --- attach the STR of every barcode ---
# dropna() removes every row with a missing value, which includes barcodes
# that are not present in `association`.
# group 1 - 6h_6_well
group_1_10 = group_1_10.assign(STR=group_1_10["barcode"].map(association)).dropna()

# group 2 - ON_12_well_non_PDL
group_2_10 = group_2_10.assign(STR=group_2_10["barcode"].map(association)).dropna()

# group 3 - ON_12_well
group_3_10 = group_3_10.assign(STR=group_3_10["barcode"].map(association)).dropna()

# group 4 - ON_6_well
group_4_10 = group_4_10.assign(STR=group_4_10["barcode"].map(association)).dropna()


# --- split every group into ref / m5 / p5 / p3 tables ---
# group 1 - 6h_6_well
(ref_1_10, m5_1_10, p5_1_10, p3_1_10) = STR_dfs(group=group_1_10)

# group 2 - ON_12_well_non_PDL
(ref_2_10, m5_2_10, p5_2_10, p3_2_10) = STR_dfs(group=group_2_10)

# group 3 - ON_12_well
(ref_3_10, m5_3_10, p5_3_10, p3_3_10) = STR_dfs(group=group_3_10)

# group 4 - ON_6_well
(ref_4_10, m5_4_10, p5_4_10, p3_4_10) = STR_dfs(group=group_4_10)

In [21]:
# Combine the ref tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
refs_10 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, ref_1_10), (3, ref_3_10), (4, ref_4_10))),
    base_group=ref_2_10,
)[col_order]

# Tag every count column with the "_ref" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
refs_10.columns = ["barcode", "STR"] + [
    f"{source}{n}_ref" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(refs_10)                    # merged table
display(refs_10.isna().sum())       # number of missing values per column
display(refs_10["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_ref,cDNA1_ref,gDNA2_ref,cDNA2_ref,gDNA3_ref,cDNA3_ref,gDNA4_ref,cDNA4_ref
0,ATCTGAAGCTTTTAGCATGG,Human_STR_1482658,273.0,152.0,7533.0,4147.0,6744.0,6977.0,7863.0,8597.0
1,CTATTGACTCCTTTTATCTC,Human_STR_99870,155.0,93.0,5423.0,2352.0,5446.0,183.0,5611.0,4543.0
2,CAATTCGCCTAACCTTTTCA,Human_STR_207538,,,4104.0,401.0,,,4104.0,1650.0
3,CTTACATGCTCAGAACAGGT,Human_STR_690192,104.0,69.0,3537.0,557.0,,,3474.0,516.0
4,CAAACATTCCCGTTTGTTCC,Human_STR_7555,85.0,298.0,3493.0,1498.0,2532.0,688.0,3655.0,2363.0
...,...,...,...,...,...,...,...,...,...,...
1432,TATTTTGATTTGGTCAAACC,Human_STR_43049,,,,,,,46.0,687.0
1433,TTGTTTTGCTTAGCGGTGAC,Human_STR_664415,,,,,,,41.0,1137.0
1434,TTGTTTACGGATCCCCTAAA,Human_STR_1467202,,,,,,,35.0,101.0
1435,TCTTCTCTCCAGGTGTTAAT,Human_STR_352608,,,,,,,26.0,708.0


barcode        0
STR            0
gDNA1_ref    898
cDNA1_ref    898
gDNA2_ref    306
cDNA2_ref    306
gDNA3_ref    760
cDNA3_ref    760
gDNA4_ref    562
cDNA4_ref    562
dtype: int64

False

In [22]:
# Combine the m5 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
minus5s_10 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, m5_1_10), (3, m5_3_10), (4, m5_4_10))),
    base_group=m5_2_10,
)[col_order]

# Tag every count column with the "_m5" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
minus5s_10.columns = ["barcode", "STR"] + [
    f"{source}{n}_m5" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(minus5s_10)                    # merged table
display(minus5s_10.isna().sum())       # number of missing values per column
display(minus5s_10["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_m5,cDNA1_m5,gDNA2_m5,cDNA2_m5,gDNA3_m5,cDNA3_m5,gDNA4_m5,cDNA4_m5
0,CTAATAACTGCATTTGTATT,Human_STR_856874,119.0,74.0,4715.0,732.0,4418.0,800.0,4754.0,550.0
1,TAAGATTTTTATGTACATCC,Human_STR_1367136,222.0,167.0,4475.0,334.0,4456.0,491.0,5070.0,289.0
2,CACATGAATCCTCCGTTTTC,Human_STR_1182272,89.0,63.0,3999.0,1154.0,4344.0,940.0,,
3,GCGCATACTGCAATTTGTTC,Human_STR_635874,,,3942.0,1027.0,,,3764.0,2046.0
4,CTATTCCCTATCCATCGATC,Human_STR_1039074,65.0,80.0,3375.0,1898.0,2771.0,2106.0,3121.0,4048.0
...,...,...,...,...,...,...,...,...,...,...
1577,GTCGTTACTGTTTATAGGCT,Human_STR_338524,,,,,,,47.0,1612.0
1578,TTCTCCTGTTTATTGTCCCT,Human_STR_1407401,,,,,,,43.0,117.0
1579,AAGCACTTATATTTCGAGTT,Human_STR_1382615,,,,,,,42.0,21.0
1580,CGCAACGAGTATCGAGCACT,Human_STR_493979,,,,,,,18.0,22.0


barcode       0
STR           0
gDNA1_m5    963
cDNA1_m5    963
gDNA2_m5    368
cDNA2_m5    368
gDNA3_m5    864
cDNA3_m5    864
gDNA4_m5    665
cDNA4_m5    665
dtype: int64

False

In [23]:
# Combine the p5 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
plus5s_10 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, p5_1_10), (3, p5_3_10), (4, p5_4_10))),
    base_group=p5_2_10,
)[col_order]

# Tag every count column with the "_p5" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
plus5s_10.columns = ["barcode", "STR"] + [
    f"{source}{n}_p5" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(plus5s_10)                    # merged table
display(plus5s_10.isna().sum())       # number of missing values per column
display(plus5s_10["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_p5,cDNA1_p5,gDNA2_p5,cDNA2_p5,gDNA3_p5,cDNA3_p5,gDNA4_p5,cDNA4_p5
0,ACAAATGCAACCCGATCTTG,Human_STR_1186275,154.0,241.0,6429.0,2323.0,5711.0,6636.0,6823.0,6854.0
1,CCACACTATCAAGTATTGCG,Human_STR_1187476,145.0,200.0,5416.0,2949.0,4365.0,3264.0,4206.0,5824.0
2,ACGGTCTTAGCATATCCATC,Human_STR_556341,177.0,217.0,5405.0,1667.0,6169.0,2586.0,6683.0,10644.0
3,AATCCTGGCCTTAGTTTATC,Human_STR_234793,133.0,199.0,4411.0,1778.0,4031.0,549.0,4128.0,554.0
4,AACTATTTCATCTTTTCTTG,Human_STR_688386,122.0,51.0,4363.0,1289.0,3223.0,813.0,3816.0,3679.0
...,...,...,...,...,...,...,...,...,...,...
1638,AATGCCTTCTGTTGCATCCT,Human_STR_237448,,,,,,,24.0,618.0
1639,TATATCAGATCCAAGTGTCG,Human_STR_715963,,,,,,,23.0,200.0
1640,AAATGTGGGGTTAGGTATTT,Human_STR_544744,,,,,,,14.0,428.0
1641,ATCTGATAGCTACAGTTCTG,Human_STR_474423,,,,,,,13.0,484.0


barcode       0
STR           0
gDNA1_p5    965
cDNA1_p5    965
gDNA2_p5    325
cDNA2_p5    325
gDNA3_p5    879
cDNA3_p5    879
gDNA4_p5    624
cDNA4_p5    624
dtype: int64

False

In [24]:
# Combine the p3 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
plus3s_10 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, p3_1_10), (3, p3_3_10), (4, p3_4_10))),
    base_group=p3_2_10,
)[col_order]

# Tag every count column with the "_p3" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
plus3s_10.columns = ["barcode", "STR"] + [
    f"{source}{n}_p3" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(plus3s_10)                    # merged table
display(plus3s_10.isna().sum())       # number of missing values per column
display(plus3s_10["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_p3,cDNA1_p3,gDNA2_p3,cDNA2_p3,gDNA3_p3,cDNA3_p3,gDNA4_p3,cDNA4_p3
0,CCTCAACGACCTTTCTCTGG,Human_STR_399685,,,3895.0,916.0,3160.0,880.0,3842.0,4028.0
1,AACATTTTGCACTACTTTTG,Human_STR_691804,127.0,153.0,3809.0,100.0,,,3838.0,7076.0
2,CCGCTCGCACGGCTTCGGTG,Human_STR_1526069,65.0,254.0,2905.0,7539.0,3002.0,2782.0,2928.0,4984.0
3,TCATATACTTCTGTCTTGTT,Human_STR_1423138,82.0,650.0,2862.0,19605.0,2395.0,27622.0,3114.0,19510.0
4,ACGGATGTGCTGGCCCTTCC,Human_STR_1180537,86.0,114.0,2470.0,507.0,2160.0,1851.0,2446.0,1314.0
...,...,...,...,...,...,...,...,...,...,...
317,AGTATTTTCGACGTCTCAAT,Human_STR_65622,,,,,,,277.0,408.0
318,TAATGATTTGAGGTTCTATC,Human_STR_55694,,,,,,,239.0,135.0
319,TCTTCATTATGGGCTCCCTT,Human_STR_663357,,,,,,,154.0,1510.0
320,TTCTTTATCTATTTGAACTA,Human_STR_1159196,,,,,,,113.0,170.0


barcode       0
STR           0
gDNA1_p3    189
cDNA1_p3    189
gDNA2_p3     64
cDNA2_p3     64
gDNA3_p3    146
cDNA3_p3    146
gDNA4_p3    122
cDNA4_p3    122
dtype: int64

False

## proc-5

In [None]:
# Pair the gDNA (DNA) barcode counts of every sample with the cDNA (RNA)
# counts of the same sample. gDNA_5 / cDNA_5 are walked in list order:
#   group 1 - 6h_6_well
#   group 2 - ON_12_well_non_PDL
#   group 3 - ON_12_well
#   group 4 - ON_6_well
_frames = []
for index in range(4):
    # left join: the gDNA table is the left side, so all of its rows are kept
    _frames.append(
        pd.merge(left=bc_count(f"gDNA{index + 1}", loadData(gDNA_5[index])),
                 right=bc_count(f"cDNA{index + 1}", loadData(cDNA_5[index])),
                 how='left', left_on='barcode',
                 right_on='barcode'))
group_1_5, group_2_5, group_3_5, group_4_5 = _frames


# Run count_filter on every group (group number, threshold argument 10).
# Per the original note this removes reads with fewer than 10 counts;
# count_filter itself is defined earlier in the notebook.
group_1_5, group_2_5, group_3_5, group_4_5 = (
    count_filter(group, number, 10)
    for number, group in enumerate(
        (group_1_5, group_2_5, group_3_5, group_4_5), start=1)
)


# STR-BC association: look every barcode up in `association`, then drop the
# rows that still contain a missing value (e.g. barcodes without an STR).
_frames = []
for _group in (group_1_5, group_2_5, group_3_5, group_4_5):
    _group["STR"] = _group["barcode"].map(association)
    _frames.append(_group.dropna())
group_1_5, group_2_5, group_3_5, group_4_5 = _frames


# Split each group into its four per-type tables (ref / m5 / p5 / p3),
# in the order returned by STR_dfs.
(
    (ref_1_5, m5_1_5, p5_1_5, p3_1_5),
    (ref_2_5, m5_2_5, p5_2_5, p3_2_5),
    (ref_3_5, m5_3_5, p5_3_5, p3_3_5),
    (ref_4_5, m5_4_5, p5_4_5, p3_4_5),
) = (
    STR_dfs(group)
    for group in (group_1_5, group_2_5, group_3_5, group_4_5)
)

In [21]:
# Combine the ref tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
refs_5 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, ref_1_5), (3, ref_3_5), (4, ref_4_5))),
    base_group=ref_2_5,
)[col_order]

# Tag every count column with the "_ref" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
refs_5.columns = ["barcode", "STR"] + [
    f"{source}{n}_ref" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(refs_5)                    # merged table
display(refs_5.isna().sum())       # number of missing values per column
display(refs_5["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_ref,cDNA1_ref,gDNA2_ref,cDNA2_ref,gDNA3_ref,cDNA3_ref,gDNA4_ref,cDNA4_ref
0,ATCTGAAGCTTTTAGCATGG,Human_STR_1482658,273.0,152.0,7533.0,4147.0,6744.0,6977.0,7863.0,8597.0
1,CTATTGACTCCTTTTATCTC,Human_STR_99870,155.0,93.0,5423.0,2352.0,5446.0,183.0,5611.0,4543.0
2,CAATTCGCCTAACCTTTTCA,Human_STR_207538,,,4104.0,401.0,,,4104.0,1650.0
3,CTTACATGCTCAGAACAGGT,Human_STR_690192,104.0,69.0,3537.0,557.0,,,3474.0,516.0
4,CAAACATTCCCGTTTGTTCC,Human_STR_7555,85.0,298.0,3493.0,1498.0,2532.0,688.0,3655.0,2363.0
...,...,...,...,...,...,...,...,...,...,...
1432,TATTTTGATTTGGTCAAACC,Human_STR_43049,,,,,,,46.0,687.0
1433,TTGTTTTGCTTAGCGGTGAC,Human_STR_664415,,,,,,,41.0,1137.0
1434,TTGTTTACGGATCCCCTAAA,Human_STR_1467202,,,,,,,35.0,101.0
1435,TCTTCTCTCCAGGTGTTAAT,Human_STR_352608,,,,,,,26.0,708.0


barcode        0
STR            0
gDNA1_ref    898
cDNA1_ref    898
gDNA2_ref    306
cDNA2_ref    306
gDNA3_ref    760
cDNA3_ref    760
gDNA4_ref    562
cDNA4_ref    562
dtype: int64

False

In [22]:
# Combine the m5 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
minus5s_5 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, m5_1_5), (3, m5_3_5), (4, m5_4_5))),
    base_group=m5_2_5,
)[col_order]

# Tag every count column with the "_m5" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
minus5s_5.columns = ["barcode", "STR"] + [
    f"{source}{n}_m5" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(minus5s_5)                    # merged table
display(minus5s_5.isna().sum())       # number of missing values per column
display(minus5s_5["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_m5,cDNA1_m5,gDNA2_m5,cDNA2_m5,gDNA3_m5,cDNA3_m5,gDNA4_m5,cDNA4_m5
0,CTAATAACTGCATTTGTATT,Human_STR_856874,119.0,74.0,4715.0,732.0,4418.0,800.0,4754.0,550.0
1,TAAGATTTTTATGTACATCC,Human_STR_1367136,222.0,167.0,4475.0,334.0,4456.0,491.0,5070.0,289.0
2,CACATGAATCCTCCGTTTTC,Human_STR_1182272,89.0,63.0,3999.0,1154.0,4344.0,940.0,,
3,GCGCATACTGCAATTTGTTC,Human_STR_635874,,,3942.0,1027.0,,,3764.0,2046.0
4,CTATTCCCTATCCATCGATC,Human_STR_1039074,65.0,80.0,3375.0,1898.0,2771.0,2106.0,3121.0,4048.0
...,...,...,...,...,...,...,...,...,...,...
1577,GTCGTTACTGTTTATAGGCT,Human_STR_338524,,,,,,,47.0,1612.0
1578,TTCTCCTGTTTATTGTCCCT,Human_STR_1407401,,,,,,,43.0,117.0
1579,AAGCACTTATATTTCGAGTT,Human_STR_1382615,,,,,,,42.0,21.0
1580,CGCAACGAGTATCGAGCACT,Human_STR_493979,,,,,,,18.0,22.0


barcode       0
STR           0
gDNA1_m5    963
cDNA1_m5    963
gDNA2_m5    368
cDNA2_m5    368
gDNA3_m5    864
cDNA3_m5    864
gDNA4_m5    665
cDNA4_m5    665
dtype: int64

False

In [23]:
# Combine the p5 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
plus5s_5 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, p5_1_5), (3, p5_3_5), (4, p5_4_5))),
    base_group=p5_2_5,
)[col_order]

# Tag every count column with the "_p5" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
plus5s_5.columns = ["barcode", "STR"] + [
    f"{source}{n}_p5" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(plus5s_5)                    # merged table
display(plus5s_5.isna().sum())       # number of missing values per column
display(plus5s_5["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_p5,cDNA1_p5,gDNA2_p5,cDNA2_p5,gDNA3_p5,cDNA3_p5,gDNA4_p5,cDNA4_p5
0,ACAAATGCAACCCGATCTTG,Human_STR_1186275,154.0,241.0,6429.0,2323.0,5711.0,6636.0,6823.0,6854.0
1,CCACACTATCAAGTATTGCG,Human_STR_1187476,145.0,200.0,5416.0,2949.0,4365.0,3264.0,4206.0,5824.0
2,ACGGTCTTAGCATATCCATC,Human_STR_556341,177.0,217.0,5405.0,1667.0,6169.0,2586.0,6683.0,10644.0
3,AATCCTGGCCTTAGTTTATC,Human_STR_234793,133.0,199.0,4411.0,1778.0,4031.0,549.0,4128.0,554.0
4,AACTATTTCATCTTTTCTTG,Human_STR_688386,122.0,51.0,4363.0,1289.0,3223.0,813.0,3816.0,3679.0
...,...,...,...,...,...,...,...,...,...,...
1638,AATGCCTTCTGTTGCATCCT,Human_STR_237448,,,,,,,24.0,618.0
1639,TATATCAGATCCAAGTGTCG,Human_STR_715963,,,,,,,23.0,200.0
1640,AAATGTGGGGTTAGGTATTT,Human_STR_544744,,,,,,,14.0,428.0
1641,ATCTGATAGCTACAGTTCTG,Human_STR_474423,,,,,,,13.0,484.0


barcode       0
STR           0
gDNA1_p5    965
cDNA1_p5    965
gDNA2_p5    325
cDNA2_p5    325
gDNA3_p5    879
cDNA3_p5    879
gDNA4_p5    624
cDNA4_p5    624
dtype: int64

False

In [24]:
# Combine the p3 tables of the four samples: sample 2 is the base table and
# samples 1, 3 and 4 contribute their barcode / STR / count columns. The
# result is then put into the column order given by col_order.
plus3s_5 = multi_merge_on_bc(
    *(table[["barcode", "STR", f"gDNA{n}", f"cDNA{n}"]]
      for n, table in ((1, p3_1_5), (3, p3_3_5), (4, p3_4_5))),
    base_group=p3_2_5,
)[col_order]

# Tag every count column with the "_p3" suffix
# (assumes col_order lists barcode, STR, then gDNA/cDNA for samples 1-4).
plus3s_5.columns = ["barcode", "STR"] + [
    f"{source}{n}_p3" for n in (1, 2, 3, 4) for source in ("gDNA", "cDNA")
]

display(plus3s_5)                    # merged table
display(plus3s_5.isna().sum())       # number of missing values per column
display(plus3s_5["STR"].is_unique)   # True only if no STR appears twice

Unnamed: 0,barcode,STR,gDNA1_p3,cDNA1_p3,gDNA2_p3,cDNA2_p3,gDNA3_p3,cDNA3_p3,gDNA4_p3,cDNA4_p3
0,CCTCAACGACCTTTCTCTGG,Human_STR_399685,,,3895.0,916.0,3160.0,880.0,3842.0,4028.0
1,AACATTTTGCACTACTTTTG,Human_STR_691804,127.0,153.0,3809.0,100.0,,,3838.0,7076.0
2,CCGCTCGCACGGCTTCGGTG,Human_STR_1526069,65.0,254.0,2905.0,7539.0,3002.0,2782.0,2928.0,4984.0
3,TCATATACTTCTGTCTTGTT,Human_STR_1423138,82.0,650.0,2862.0,19605.0,2395.0,27622.0,3114.0,19510.0
4,ACGGATGTGCTGGCCCTTCC,Human_STR_1180537,86.0,114.0,2470.0,507.0,2160.0,1851.0,2446.0,1314.0
...,...,...,...,...,...,...,...,...,...,...
317,AGTATTTTCGACGTCTCAAT,Human_STR_65622,,,,,,,277.0,408.0
318,TAATGATTTGAGGTTCTATC,Human_STR_55694,,,,,,,239.0,135.0
319,TCTTCATTATGGGCTCCCTT,Human_STR_663357,,,,,,,154.0,1510.0
320,TTCTTTATCTATTTGAACTA,Human_STR_1159196,,,,,,,113.0,170.0


barcode       0
STR           0
gDNA1_p3    189
cDNA1_p3    189
gDNA2_p3     64
cDNA2_p3     64
gDNA3_p3    146
cDNA3_p3    146
gDNA4_p3    122
cDNA4_p3    122
dtype: int64

False