## Code for exercises, small projects or variations

In [38]:
# Dichotomous search with indexing (Chapter 1) [Correct, as of 8.3.2023]

## Part 1. Indexing ecoli genome

### reading of file
file = open('files/ecoli.fa', 'r')
ecoli = ''
count = 0
for line in file:
    count += 1
    if (count > 1): # the first line contains the non-sequence header so we discard it 
        ecoli += line.replace("\n", "") # we string the newline character from the end of each line


### Creating a sort list of all k-mers in the genome 
k = 10
# note that position i is attached to kmer
kmers = [ecoli[i:i+k]+":"+str(i) for i in range(len(ecoli)-k+1)] 

dictkmers = {}

for k in kmers:
    kstring = k.split(":")[0]
    dictkmers[kstring] = ""

### Recasting the kmer list to a dictionary with kmers as keys and position list as values
for k in kmers:
    kstring = k.split(":")[0]
    dictkmers[kstring] = dictkmers[kstring]+","+str(k.split(":")[1])

list_of_kmers = list(dictkmers.keys()) # full list
list_of_kmers.sort()


In [39]:
## Part 2. Dichotomous Search for k-mers [Correct, as of 8.3.2023]

### Using time to measure time of execution
import time
start_time = time.time()

### Pattern search

pattern = 'AAAAAACGAG' # non-mer
pattern = 'ATGCACGACC' # existing

matches = 0

iter = 0
min = 1
max = len(list_of_kmers)

midpoint = int((max+min)/2)

import math 

while iter <= math.log2(len(list_of_kmers)) + 1:
    kmer = list_of_kmers[midpoint]
    iter += 1
    if (pattern == kmer): # case is a success
        matches = len(dictkmers[kmer][1:].split(",")) # number of matches equals length of positions
        positions = dictkmers[kmer][1:].split(",")
        print("Pattern", pattern," matched ", matches, " times in position(s):", positions)
        break
    if (pattern > kmer):
        min = midpoint
        midpoint = int((max+min)/2)
    if (pattern < kmer ):
        max = midpoint
        midpoint = int((max+min)/2)
if (matches == 0):
    if (pattern == list_of_kmers[0]):
        kmer = list_of_kmers[0]
        matches = len(dictkmers[kmer][1:].split(",")) # number of matches equals length of positions
        positions = dictkmers[kmer][1:].split(",")         
        print("Pattern", pattern," matched ", matches, " times in position(s):", positions)
    if (pattern == list_of_kmers[-1]):
        kmer = list_of_kmers[-1]
        matches = len(dictkmers[kmer][1:].split(",")) # number of matches equals length of positions
        positions = dictkmers[kmer][1:].split(",")         
        print("Pattern", pattern," matched ", matches, " times in position(s):", positions)
    else:
        print("No matches found")

print("--- %s seconds ---" % (time.time() - start_time))


Pattern ATGCACGACC  matched  1  times in position(s): ['1826110']
--- 0.0010075569152832031 seconds ---


Notice how the above strategy of _dichotomous search_ is actually one order of magnitude faster than a regular expression search performed with a python generator function

In [42]:
# Pattern search with a generator function
import re
import regex

import time
start_time = time.time()

keystring = 'ATGCACGACC'
# generator function with re.finditer()
dictkmers[keystring] = [m.start() for m in re.finditer(f'(?={keystring})', ecoli)]

matches = len(dictkmers[keystring]) 
print(keystring, " was found ", matches, " times at positions:", dictkmers[keystring])
print("--- %s seconds ---" % (time.time() - start_time))


ATGCACGACC  was found  1  times at positions: [1826110]
--- 0.06594705581665039 seconds ---


In [67]:
## Clustering of a dataset 

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

# Load the dataframe and assign values/labels
df = pd.read_csv('files/GCContent_simple.csv')
dvalues = df['GCContent'].values.reshape(-1,1)
dlabels = list(df['Genome'])

# Calculate the distances
distances = pdist(dvalues)

# Convert the pairwise distances into a square distance matrix
distance_matrix = squareform(distances)
print(distance_matrix)

# Calculate the linkage matrix using Ward's method
linkage_matrix = linkage(distance_matrix, method='ward')
print(linkage_matrix)

# Plot the dendrogram
sns.set_style('white')
dendrogram(linkage_matrix, labels=dlabels, color_threshold=0, orientation='left')

# Show the plot
plt.show()

[[0.    0.261 0.334 0.075 0.207 0.018 0.02  0.084 0.1   0.293]
 [0.261 0.    0.073 0.186 0.054 0.243 0.241 0.177 0.161 0.032]
 [0.334 0.073 0.    0.259 0.127 0.316 0.314 0.25  0.234 0.041]
 [0.075 0.186 0.259 0.    0.132 0.057 0.055 0.009 0.025 0.218]
 [0.207 0.054 0.127 0.132 0.    0.189 0.187 0.123 0.107 0.086]
 [0.018 0.243 0.316 0.057 0.189 0.    0.002 0.066 0.082 0.275]
 [0.02  0.241 0.314 0.055 0.187 0.002 0.    0.064 0.08  0.273]
 [0.084 0.177 0.25  0.009 0.123 0.066 0.064 0.    0.016 0.209]
 [0.1   0.161 0.234 0.025 0.107 0.082 0.08  0.016 0.    0.193]
 [0.293 0.032 0.041 0.218 0.086 0.275 0.273 0.209 0.193 0.   ]]
[[5.00000000e+00 6.00000000e+00 6.32455532e-03 2.00000000e+00]
 [3.00000000e+00 7.00000000e+00 2.84604989e-02 2.00000000e+00]
 [0.00000000e+00 1.00000000e+01 6.86828460e-02 3.00000000e+00]
 [8.00000000e+00 1.10000000e+01 7.22449537e-02 3.00000000e+00]
 [1.00000000e+00 9.00000000e+00 1.01192885e-01 2.00000000e+00]
 [2.00000000e+00 1.40000000e+01 1.99552833e-01 3.00000

  linkage_matrix = linkage(distance_matrix, method='ward')


ValueError: Dimensions of Z and labels must be consistent.

### Scrapbook
#### Bits of code to use for help

In [27]:
print(list_of_kmers[1:100])

['AAAAAAAAAG', 'AAAAAAAAAT', 'AAAAAAAACA', 'AAAAAAAACC', 'AAAAAAAACG', 'AAAAAAAACT', 'AAAAAAAAGA', 'AAAAAAAAGC', 'AAAAAAAAGG', 'AAAAAAAAGT', 'AAAAAAAATA', 'AAAAAAAATC', 'AAAAAAAATG', 'AAAAAAAATT', 'AAAAAAACAA', 'AAAAAAACAC', 'AAAAAAACAG', 'AAAAAAACAT', 'AAAAAAACCA', 'AAAAAAACCC', 'AAAAAAACCG', 'AAAAAAACCT', 'AAAAAAACGA', 'AAAAAAACGC', 'AAAAAAACGG', 'AAAAAAACGT', 'AAAAAAACTA', 'AAAAAAACTC', 'AAAAAAACTG', 'AAAAAAACTT', 'AAAAAAAGAA', 'AAAAAAAGAC', 'AAAAAAAGAG', 'AAAAAAAGAT', 'AAAAAAAGCA', 'AAAAAAAGCC', 'AAAAAAAGCG', 'AAAAAAAGCT', 'AAAAAAAGGA', 'AAAAAAAGGC', 'AAAAAAAGGG', 'AAAAAAAGGT', 'AAAAAAAGTA', 'AAAAAAAGTC', 'AAAAAAAGTG', 'AAAAAAAGTT', 'AAAAAAATAA', 'AAAAAAATAC', 'AAAAAAATAG', 'AAAAAAATAT', 'AAAAAAATCA', 'AAAAAAATCC', 'AAAAAAATCG', 'AAAAAAATCT', 'AAAAAAATGA', 'AAAAAAATGC', 'AAAAAAATGG', 'AAAAAAATGT', 'AAAAAAATTA', 'AAAAAAATTC', 'AAAAAAATTG', 'AAAAAAATTT', 'AAAAAACAAA', 'AAAAAACAAC', 'AAAAAACAAG', 'AAAAAACAAT', 'AAAAAACACA', 'AAAAAACACC', 'AAAAAACACG', 'AAAAAACACT', 'AAAAAACAGA', 'AAAA

In [98]:
print(kmers[0:10])
list_of_kmers.count('AAAAAAAAAC')

print(ecoli[:1000])

keystring = 'AAAAAAAAAC'
matches = re.findall(keystring, str(ecoli))
print(matches)

positions = [m.start() for m in re.finditer(f'(?={keystring})', ecoli)]
print(positions)

#string='atha'
#[m.start() for m in re.finditer(f'(?={string})', 'athathaathan')]

    

['AAAAAAAAAC', 'AAAAAAAAAC', 'AAAAAAAAAC', 'AAAAAAAAAC', 'AAAAAAAAAC', 'AAAAAAAAAG', 'AAAAAAAAAG', 'AAAAAAAAAG', 'AAAAAAAAAT', 'AAAAAAAACA']
TTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGGGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCATCTGGTAGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTCTGACGGGACTCGCCGCCGCCCAGCCGGGATTTCCGCTGGCACAATTGAAAACTTTCGTCGACCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATCAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTGTTAGAAGCGCGTGGTCACAACGTTACCGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGTCATTACCTCGAATCTACCGTTGATATTGCTGAATCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGACCACATGGTGCTGATGGCTGGTTTCACTGCCGGTAATGAAAAAGGCGAGCTGGTGGTTCTGGGACGCAACGGTTCCGACTACTCCGCTGCGGTGCTGGCGGCCTGTTTACGCGCCGATTGTTGCGAGATCTGGACGGATGTTGACGGTGTTTATACCTGCGATCCGCGTCAGGTGCCCGATGCGAGGTTGTTGAAGTCGATGTCCTATCAGGAAGCGATGGAGCTTTCTTACTTCGGCGCTAAAGTTCTTCACCCCCGCACCATTACCCCCATCGCCCAG

### Appendix
#### Bits of code that can be useful

In [40]:
# the following line searches a substring on a longer string and records ALL positions where it is found
# and stores them in a list
import re
import regex

import time
start_time = time.time()

keystring = 'ATGCACGACC'
dictkmers[keystring] = [m.start() for m in re.finditer(f'(?={keystring})', ecoli)]

print(keystring, dictkmers[keystring])
print("--- %s seconds ---" % (time.time() - start_time))

# Sorting a dictionary by key value
#sorted_dictkmers = {key: value for key, value in sorted(dictkmers.items())}


ATGCACGACC [1826110]
--- 0.05128931999206543 seconds ---
