In [None]:
import os, sys
import csv
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Path of the peptide/aptamer pairs CSV that the loading cells below open.
file = "20191204_peptide_aptamer_pairs_correct.csv"

## General metrics about the file

In [None]:
# Generate a dictionary from peptide --> [(aptamer, read count)]
# `all_aptamers` collects the aptamer of every row that is kept (duplicates included).
all_aptamers = []
dataset = {}
line_count = 0
# newline='' is how the csv module documentation says to open files for csv.reader.
with open(file, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        # Count the row as soon as it is read. Counting at the bottom of the
        # loop body would miss every row that is filtered out with `continue`,
        # making the reported total too small.
        line_count += 1
        if line_count == 1:
            print(f'Column names are {", ".join(row)}')
            continue
        # csv.reader yields [] for blank lines; there is nothing to index there.
        if not row:
            continue

        peptide = row[0]
        aptamer = row[1]
        # Controls
        if "RRRRRR" in peptide:
            continue
        if aptamer == "CTTTGTAATTGGTTCTGAGTTCCGTTGTGGGAGGAACATG":
            continue
        if len(aptamer) < 40:
            continue

        # Reformat the peptide: keep only the part before the first underscore
        # (a peptide without an underscore is left unchanged by split).
        peptide = peptide.split('_')[0]
        # Drop peptides shorter than 8 residues. Longer ones are still kept
        # here; the exact length-8 filter happens in a later cell.
        if len(peptide) < 8:
            continue

        # The read count (third column) is stored as the raw string from the CSV.
        dataset.setdefault(peptide, []).append((aptamer, row[2]))
        all_aptamers.append(aptamer)
print("Number of lines: ", line_count)

In [None]:
# Distinct peptides kept by the loading cell, followed by a tally of how many
# peptides have each length from 3 to 8 inclusive.
peptides = list(dataset)
print(str(len(peptides)))
lengths = list(map(len, peptides))
for size in range(3, 9):
    print("count of ", size, ":", lengths.count(size))

## Split the dataset into binding affinity groups

In [None]:
# Count (peptide, aptamer) pairs per read-count bucket. The bucket order
# matches the labels in `x`: > 50, > 10, > 5, and everything else.
affinity_groups = [0, 0, 0, 0]
x = ['high (> 50)', 'medium (>10)', 'medium-ish (>5)', 'low (<= 5)']
for pairs in dataset.values():
    for _, read_count in pairs:
        # Read counts are stored as strings when the CSV is loaded.
        read_count = int(read_count)
        if read_count > 50:
            affinity_groups[0] += 1
        elif read_count > 10:
            affinity_groups[1] += 1
        elif read_count > 5:
            affinity_groups[2] += 1
        else:
            affinity_groups[3] += 1

print("Affinity Groups: ", affinity_groups)
plt.title("Aptamer dataset binding affinity groups (based on read count)")
plt.xlabel("Affinity groups (read count)")
plt.ylabel("Number of samples")
# Pass x/y by keyword: recent seaborn releases no longer accept the data
# vectors positionally, while the keyword form works on old and new versions.
sns.barplot(x=x, y=affinity_groups)

## Write the dataset into a json file

In [None]:
# Build the reverse mapping aptamer --> [(peptide, read count), ...] from the
# same CSV and write it to 'aptamer_dataset.json'.
aptamer_dataset = {}
with open(file) as csv_file:
    for row_number, row in enumerate(csv.reader(csv_file, delimiter=',')):
        # The first row holds the column names.
        if row_number == 0:
            print(f'Column names are {", ".join(row)}')
            continue

        peptide, aptamer, rc = row[0], row[1], row[2]

        # Reformat the peptide: when it contains an underscore keep only the
        # leading part, and drop the row if that part is shorter than 8.
        if '_' in peptide:
            peptide = peptide.split('_')[0]
            if len(peptide) < 8:
                continue

        aptamer_dataset.setdefault(aptamer, []).append((peptide, rc))

with open('aptamer_dataset.json', 'w') as f:
    json.dump(aptamer_dataset, f)
    print("Wrote to dataset file")

## Test to see if the datasets are uniform

In [None]:
# Keep only peptides of exactly 8 characters and aptamers of exactly 40
# characters, then report total and distinct counts for each collection.
all_peptides = list(filter(lambda pep: len(pep) == 8, peptides))
all_aptamers = list(filter(lambda apt: len(apt) == 40, all_aptamers))

for sequences in (all_peptides, all_aptamers):
    print(len(sequences))
    print(len(set(sequences)))

## Test if each position of peptides is uniformly distributed

In [None]:
# For peptide positions 1..7 (position 0 is not examined by this loop), count
# how often each character occurs among the distinct peptides and plot it.
unique_peptides = set(all_peptides)  # loop-invariant, so build it once
for i in range(1,8):
    char = dict()
    for p in unique_peptides:
        char[p[i]] = char.get(p[i], 0) + 1
    print("-------------Position ", i, "--------------")
    print(char)
    # Derive labels and heights from the same sorted key order. Sorting only
    # the keys while taking values in insertion order would attach counts to
    # the wrong letters.
    letters = sorted(char)
    plt.bar(letters, [char[letter] for letter in letters], color='g')
    plt.show()
# Characters seen at the last position examined.
print(sorted(list(char.keys())))

In [None]:
# For each of the 40 aptamer positions, count how often each character occurs
# among the distinct aptamers and plot it.
unique_aptamers = set(all_aptamers)  # loop-invariant, so build it once
for i in range(40):
    char = dict()
    for a in unique_aptamers:
        char[a[i]] = char.get(a[i], 0) + 1
    print("-------------Position ", i, "--------------")
    print(char)
    # Derive labels and heights from the same sorted key order. Sorting only
    # the keys while taking values in insertion order would attach counts to
    # the wrong letters.
    letters = sorted(char)
    plt.bar(letters, [char[letter] for letter in letters], color='g')
    plt.show()
# Characters seen at the last position examined.
print(sorted(char.keys()))

## Test pairwise interactions 

In [None]:
import numpy as np


# Alphabets used to index the rows/columns of the pairwise matrices:
# 4 one-letter codes for aptamers and 20 one-letter codes for peptides.
na_list = list('ACGT')
aa_list = list('ACDEFGHIKLMNPQRSTVWY')
# Default number of sequences drawn per analysis.
num_samples = 1000


# Generate uniform random samples without replacement
def get_samples(kind="pep",num=num_samples):
    """Draw `num` sequences uniformly at random, without replacement.

    Args:
        kind: "apt" samples from `all_aptamers`; any other value samples from
            `all_peptides`.
        num: how many sequences to draw. Defaults to the value `num_samples`
            had when this function was defined.

    Returns:
        A list of `num` sequences. Distinct list *indices* are drawn, so the
        result can still contain equal sequences if the pool has duplicates.

    Raises:
        ValueError: raised by `np.random.choice` when `num` exceeds the pool
            size (sampling is without replacement).
    """
    pool = all_aptamers if kind == "apt" else all_peptides
    # Honour the `num` argument; reading the global `num_samples` here would
    # silently ignore whatever the caller asked for.
    chosen = np.random.choice(len(pool), num, replace=False)
    return [pool[i] for i in chosen]


# Index list for pairwise interaction
def get_index(kind="pep"):
    """Return the list of (i, j) position pairs to compare.

    For kind == "apt" the 40 positions are randomly permuted and grouped into
    20 disjoint pairs. For any other kind the result is every pair i < j with
    both positions in 1..7, ordered by i and then by j.
    """
    if kind == "apt":
        shuffled = np.random.choice(40, 40, replace=False)
        # Pair consecutive entries of the permutation: (0,1), (2,3), ...
        return list(zip(shuffled[0::2], shuffled[1::2]))
    return [(first, second)
            for first in range(1, 7)
            for second in range(first + 1, 8)]


# Plot matrix of relative probabilities
def get_pair_mat(kind="pep"):
    """Plot one heatmap of observed pairwise frequencies per position pair.

    For each (i, j) from `get_index(kind)`, counts how often each unordered
    pair of characters occurs at positions i and j across the sampled
    sequences. Each sample adds one count at [a, b] and one at [b, a], so the
    matrix is symmetric. Counts are divided by the number of samples and shown
    as a seaborn heatmap centred on 2 / dim**2.

    Args:
        kind: "apt" uses the `na_list` alphabet and aptamer samples; any other
            value uses `aa_list` and peptide samples.

    Returns:
        None. The heatmaps are displayed as a side effect.
    """
    char_list = na_list if kind == "apt" else aa_list
    dim = len(char_list)
    # Character -> row/column lookup, built once instead of list.index per hit.
    position = {ch: k for k, ch in enumerate(char_list)}
    index = get_index(kind)
    samples = get_samples(kind)
    # Normalise by the number of sequences actually drawn; guard the empty case.
    denom = max(len(samples), 1)
    ct = 2/(dim**2)
    # Loop through all possible pairwise combination
    for (i,j) in index:
        # Start from a fresh matrix for every position pair. Reusing one matrix
        # would carry the previous pair's already-normalised values into this
        # pair's raw counts.
        M = np.zeros((dim,dim))
        for s in samples:
            r, c = position.get(s[i]), position.get(s[j])
            # Skip characters outside the alphabet, as get_indep_mat does,
            # rather than failing on the lookup.
            if r is None or c is None:
                continue
            M[r, c] += 1
            M[c, r] += 1
        print("-----------Positions: ",(i,j),"Heatmap of probability matrix: ---------")
        M = np.true_divide(M, denom)
        sns.heatmap(M,center=ct)
        plt.show()
        

# Plot matrix of relative probabilities, assume independence
def get_indep_mat(kind="pep"):
    """Plot one heatmap per position pair of the frequencies expected under independence.

    For each (i, j) from `get_index(kind)`, the characters at position i and
    at position j are tallied separately over the sampled sequences
    (characters outside the alphabet are ignored). Entry [r, c] of the
    displayed matrix is count_i[r] * count_j[c] / num_samples**2. The colour
    scale runs from 0 to 4 / dim**2.

    Args:
        kind: "apt" uses the `na_list` alphabet and aptamer samples; any other
            value uses `aa_list` and peptide samples.

    Returns:
        None. The heatmaps are displayed as a side effect.
    """
    char_list = aa_list if kind != "apt" else na_list
    dim = len(char_list)
    slot = {ch: k for k, ch in enumerate(char_list)}
    index = get_index(kind)
    samples = get_samples(kind)
    p_range = 4/(dim**2)
    # Loop through all possible pairwise combination
    for pair in index:
        i, j = pair
        # Per-letter tallies at each of the two positions
        i_counts = [0] * dim
        j_counts = [0] * dim
        for s in samples:
            at_i = slot.get(s[i])
            if at_i is not None:
                i_counts[at_i] += 1
            at_j = slot.get(s[j])
            if at_j is not None:
                j_counts[at_j] += 1
        # Each entry is the product of the two marginal counts, scaled by num_samples**2
        M = np.outer(i_counts, j_counts) / (num_samples**2)
        print("-----------Positions: ",pair,"Heatmap of probability matrix: ---------")
        sns.heatmap(M, vmin=0, vmax=p_range)
        plt.show()

In [None]:
# Peptides: observed pairwise frequency heatmaps
get_pair_mat(kind="pep")

In [None]:
# Peptides: heatmaps expected if the two positions were independent
get_indep_mat(kind="pep")

In [None]:
# Aptamers: observed pairwise frequency heatmaps
get_pair_mat(kind="apt")

In [None]:
# Aptamers: heatmaps expected if the two positions were independent
get_indep_mat(kind="apt")