In [2]:
#Imports needed for coding
from collections import Counter
import pandas as pd
import numpy as np

# 1. What is the time complexity of the following sorting algorithm? Explain the reasoning behind this time complexity, and then write code (pseudocode is okay) for a sorting algorithm that runs in O(nlogn) time.

This sorting algorithm is known as "insertion sort" and runs at best O(n), and at worst $O(n^2)$
This is because in the best case scenario, everything is already sorted and the algorithm needs to only check each set of numbers once. In the worst case scenario, all of the numbers are in reverse order and you have to iterate through the while loop 1 more time than the previous each time as i increases. For example, if one were to run the code on the array of numbers [4,3,2,1]: This table describes how the algorithm runs through the array each time. 

| i =  | j =  | array[j-1] | < or > | array[j] | Swap? | 0 | 1 | 2 | 3 |
|------|------|------------|--------|----------|-------|---|---|---|---|
| 0    | 0    |            |        |          | N/A   | 4 | 3 | 2 | 1 |
| 1    | 1    | 4          | >      | 3        | Yes   | 3 | 4 | 2 | 1 |
| 1    | 0    | -          | -      | -        | N/A   | - | - | - | - |
| 2    | 2    | 4          | >      | 2        | Yes   | 3 | 2 | 4 | 1 |
| 2    | 1    | 3          | >      | 1        | Yes   | 2 | 3 | 4 | 1 |
| 3    | 3    | 4          | >      | 1        | Yes   | 2 | 3 | 1 | 4 |
| 3    | 2    | 3          | >      | 1        | Yes   | 2 | 1 | 3 | 4 |
| 3    | 1    | 2          | >      | 1        | Yes   | 1 | 2 | 3 | 4 |

In this case, every time i increases by 1, the number of times you must run through the while loop increases by 1. 



## O(nlogn) time pseudocode:

For this, I am writing a pseudocode for the Heapsort algorithm, at best and worst case scenario it is O(nlogn)


    HeapSort(A)
    1. Build-MAX-HEAP(A) (MAX-HEAPIFY)
    
    #MAX-HEAPIFY is a way of organizing the array such that all "parent nodes" are greater than the "children nodes." To execute MAX-HEAPIFY, the parent closest to the bottom of the tree is compared with its children. If the child is greater than the parent, then the parent and child are swapped. This is continued up the tree at each parent node until the apex of tree is reached. Once the apex is reached, you start at bottom parent again and check that all parents are greater than children nodes. When tree is in "Maxheapify" format, continue onto next step. 
    
    2. For i <-- length[A] downto 2
    3.       do exchange A[1] <-> A[i]
    4.              heap-size[A] <-- heapsize[A]-1
    
    #Steps 2-4: Next, swap the first and last number in array (first number should be the greatest number in the array.) Then, make the heapsize 1 smaller so that the last number of the array is no longer in the heap (It will now remain untouched during the next rounds of re-organizing the array)
    
    5.              MAX-HEAPIFY(A,1)
    
    #Now, you repeat the entire process with the new, smaller by 1 heap. It will move the 2nd largest number in array to the 2nd to last position. Process is repeated iteratively until all numbers have been sorted. 
    


# 3

## 3A Given the sequences below: Calculate the observed and expected frequency of each possible 4-mer assuming each nucleotide appears at a probability of ¼

In [3]:
def allpossiblekmers(length):
    """This function takes an input length (length of your k-mer) 
    and tells you how many combinations of nucleotide bases (A,T,G,C) there are
    """
    
    allkmers = 4**(length)
    return allkmers

allpossible4mers = allpossiblekmers(4)
print(allpossible4mers)

256


## 3B Compute the frequency of all possible 4-mers in this sequence

In [76]:
def color_unexpected_red(val):
    """
    Takes a pandas array and colors it red if the observed freq is greater than 0.016129
    """
    if val > 0.0161291 :
        color = 'red'
    else:
        color = 'black'
    return 'color: %s' % color


In [79]:
def listofallkmers(len_of_kmer, sequence):
    """
    This fxn takes a sequence and returns a list of all of the k-mers that exist
    """
    seq_len = len(sequence)
    
    kmers = []
    for i in range (0, seq_len):
        kmer = sequence[i:i+len_of_kmer]
        if len(kmer) == len_of_kmer:
            kmers.append(kmer)
    return kmers

def freq_of_kmers(kmer_list):
    """
    This fxn takes a list of kmers and counts the number of each different kmer, then calculates its frequency 
    of occurence in the sequence. It then takes that dictionary of frequencies and turns it into a Pandas dataframe. 
    The Pandas dataframe is sorted for easy viewing. 
    """
    
    len_kmer_list = len(kmer_list)
    
    kmer_dict = dict(Counter(kmer_list))
    
    for key, value in kmer_dict.items():
        kmer_dict[key] = value / len_kmer_list
    
    df = pd.DataFrame.from_dict(kmer_dict, orient='index', dtype=None)
    df = df.reset_index()
    df = df.rename(index=str, columns={ 0:'Observed_Frequency', 'index':'Sequence'})
    df = df.sort_values(by = 'Observed_Frequency', ascending = False)
    df['Expected_Frequency_at_random'] = 1/62
    
    df = df.style.applymap(color_negative_red, subset=['Observed_Frequency'])
              
    return df


In [80]:
seqquestion3 = 'AGTCGTACGTGACAGTAGACGTGCCGACGTGAGATACGTGAACGGAGTACGTTCGTGACGGTGAT'
list_seqquestion3_kmers = listofallkmers(4, seqquestion3)
print(len(list_seqquestion3_kmers))
freq_4mers_question3 = freq_of_kmers(list_seqquestion3_kmers)
freq_4mers_question3

62


Unnamed: 0,Sequence,Observed_Frequency,Expected_Frequency_at_random
1,ACGT,0.0806452,0.016129
34,CGTG,0.0806452,0.016129
32,GTGA,0.0806452,0.016129
4,GACG,0.0483871,0.016129
16,TACG,0.0483871,0.016129
24,TCGT,0.0322581,0.016129
36,TGAC,0.0322581,0.016129
27,ACGG,0.0322581,0.016129
30,AGTA,0.0322581,0.016129
13,GTAC,0.0322581,0.016129


# 4 Given a file of sequences (sequences.txt, on our course website)

## 4A Implement a simple method that scores the Hamming distance for a pattern against each subsequence

In [104]:
def Hamming_distance(Dna_str, pattern):
    """
    This code computes the hamming distance for all kmers of pattern length
    """
    len_pattern = len(pattern)
    seq_len = len(Dna_str)
    
    hamming_distances = []
    for i in range(0, seq_len):
        kmer = Dna_str[i:i+len_pattern]
        diffs = 0
        for ch1, ch2 in zip(kmer, pattern):
            if ch1 != ch2:
                diffs += 1
        hamming_distances.append(diffs)
    hamming_distances = np.array(hamming_distances)
    return hamming_distances

test_seq = 'ATGCGGCGA'
test_pattern = 'ATCG'
testing = Hamming_distance(test_seq, test_pattern)
testing

#Importing the sequences.txt
sequences = pd.read_csv('atom_sequences', sep=',', header = None)
# sequences

array([2, 2, 3, 4, 2, 4, 3, 2, 0])

In [115]:
## This is not done. I need to figure out how to run through each sequence and make an array 
##of the hamming distances. Also, double check and see if this is a good way to attack problem. 

# def Hamming_distances(all_sequences, pattern):
    
#     for i, seq in enumerate(all_sequences):
#         hd_array = Hamming_distance(seq, pattern)
        
#         hd_array = np.concatenate((hd_array, next_hd_array), axis = 0)
#     return hd_array

# big_hd_array = Hamming_distances(sequences, 'TTGTAGG')
# big_hd_array
        
        
        

SyntaxError: invalid syntax (<ipython-input-115-789f669a575a>, line 4)

# 9
## 9A In any language of your choice, create a suffix array for Gorgonio and print out the results (index, element) of your array in ascending order.

In [47]:
##I am not entirely sure what to do with this anymore. 
# def suffix_array(str): 
#     return sorted(range(len(str)), key=lambda i: str[i:]) 

# test = 'abaaba$'
# testing = get_suffix_array(test)
# print(testing)



[6, 5, 2, 3, 0, 4, 1]


In [62]:
#New idea: Construct a Panda dataframe which each index is connected with a word and as the index increases, 
#the number of letters is decreased by one. 
#Note, I am not entirely sure if Pandas is the way to store this data. But it worked to build the array I think

def build_suffix_array(string):
    """
    This function takes a string and converts it into a sorted pandas dataframe. 
    
    """
    SA_array = []
    for i in range(0, len(string)): 
        SA_array.append(string[i:])
    df_SA = pd.DataFrame(SA_array)
    df_SA.index = df_SA.index + 1
    df_SA = df_SA.rename(index=str, columns={ 0:'Element'})
    df_SA = df_SA.sort_values(by = 'Element', ascending = True)
    
    return df_SA

test = "abaaba$"
testy = build_suffix_array(test)
print(testy)
        

   Element
7        $
6       a$
3    aaba$
4     aba$
1  abaaba$
5      ba$
2   baaba$


## 9B Next, implement a “Query” method for your suffix array using the binary search method. Please include your well-documented code in your submission.

In [None]:
##I don't think this is working. Think more on binary search. 
# def binary_search(df_SA, search_str):
#     if search_str <= df_SA[1]:
#         value = 1
#     else if search_str > df_SA[-1]:
#         value
#     else 
        