# Sequences Lab

### by Jonathan Fischer and Courtney Rauchman

In [10]:
from datascience import *
import numpy as np
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
import scipy.stats as stats
import scipy as scipy
import pandas as pd
plt.style.use('fivethirtyeight')
#from client.api.notebook import Notebook

## Reference Assembly

In [11]:
# Let's imagine we've performed an experiment and obtained the set of 8 reads below. We happen to know 
# that these correspond to the coding sequence of a gene of length 45 nucleotides.

# The 8 reads (10 nt each) obtained from the experiment, in the order they were reported.
reads = [
    'ATGCTAGAAA',
    'GGAGACTGCT',
    'AGTTAGTCAT',
    'CCATAGCTGA',
    'AGAAACGGCT',
    'CGGCTAGTTA',
    'CTGCTCCATA',
    'GTCATGGAGA',
]

In [13]:
# We want to use these reads to obtain the actual sequence of the entire gene. Let's assume that each end of a read
# overlaps with at most one other read, by exactly 5 bp. These overlaps occur in the first and last 5 nt of each read.
# By stitching these reads together, we can thus obtain the entire gene.

# Assemble the reads to yield the sequence of the gene. Call this value full_seq

# Split every read into its prefix (first 5 nt) and suffix (last 5 nt of a 10-nt read).
# These are the only regions in which two reads are allowed to overlap.
prefixes = [read[0:5] for read in reads]
suffixes = [read[5:10] for read in reads]

# match_mat[i, j] is 1 when the prefix of read i equals the suffix of read j
# (i.e. read i directly follows read j in the gene) and 0 otherwise.
match_mat = np.zeros((len(prefixes), len(suffixes)))
for i, prefix in enumerate(prefixes):
    for j, suffix in enumerate(suffixes):
        match_mat[i, j] = (prefix == suffix)

print(match_mat)

# Identify which read is the first in the sequence. This is the only row with all zero values
# because no suffixes match the prefix of this read.
# np.flatnonzero returns a 1-D array of row indices, so we can index it directly instead of
# calling int() on the 2-D output of np.argwhere (array-to-scalar conversion of arrays with
# ndim > 0 is deprecated in recent NumPy versions).
start_rows = np.flatnonzero(np.sum(match_mat, axis=1) == 0)
if start_rows.size != 1:
    # Fail with a clear message if the reads do not form a single chain
    raise ValueError(
        "Expected exactly one read whose prefix matches no suffix, found "
        + str(start_rows.size)
    )
ind = int(start_rows[0])

# Begin constructing the sequence with the first read
full_seq = reads[ind]

# We now need to use the matrix to find the order of reads and glue them together.

# Your code should look like the following
# Iterate over the number of times we need to find the next read (using a for loop)
# Since we've already found one, this should be np.arange(0,7):
#      Starting from a given read, find the index of the next one by seeing which entry in the corresponding 
#      column equals 1.
#      Hint: You need to update ind using int(np.argwhere(match_mat[:,ind] == 1))
#      Concatenate the last 5 nucleotides of the identified read to full_seq (strings can be concatenated with +)
#      # You can concatenate a string x to a string y using y += x. This is equivalent to y = y+x


## Read mapping

In [4]:
# Now let's assume we already have the reference sequences for two different genes and want to quantify
# their expression levels. We perform an experiment and get a set of reads, some of which come from these 
# genes. We can estimate their expression by counting the number of reads which match the sequence
# in the respective references. We only want to count reads which uniquely align to one gene or the other.

# Load the data

# Reference sequences
# Each reference is a 45-nt string; reads are matched against these as plain substrings.
gene_1 = 'ATGCTAGAAACGGCTAGTTATTCATGGAGACTGCTCCATAGCTGA'
gene_2 = 'ATGTATGATCTCCAGGTATTCACGCAGTGCTCGCCTTACTTATAG'

# Reads from experiment
# A flat list of 5-nt reads. The list contains repeated reads, so identical strings
# appear more than once and each occurrence is a separate list element.
experimental_reads = [
 'TCATG','CGGCT','TGGAG','GCTCC','ACTGC','TTATT','TATTC','AGAAA','AACGG','AGCTG','CTAGT','GCTCC','ATGGA',
 'AACGG','TAGAA','ACGGC','ATAGC','GGAGA','TAGTT','CGGCT','ACTGC','GGCTA','GCTAG','GCTAG','AGTTA','TGCTA',
 'TTATT','TGCTA','GCTAG','AGAAA','AACGG','ATAGC','TTATT','TCCAG','TACTT','TGATC','CTTAT','TGATC','TTATA',
 'TCGCC','CTCCA','TGATC','CAGTG','TTATA','GCAGT','AGGTA','GCAGT','TGTAT','GCCTT','ATTCA','ATGAT','CTCCA',
 'CCTTA','GATCT','CAGGT','TCCAG','ACTTA','AATAT','ATGAA','TGTGG','GGTGC','GCAAG','CTACC','CACCG','TGGGG',
 'TAGGT','CTGCC','AGGCT','GGTAG','GGCGT','AGTGA','AATGT','CGTCG','CGTCC','GTACC','AGGGA','ATCGG','CTCTT',
 'CCGGA','AATGG','CTTTC','TAAAT','TATAA','GGTTA','AAAGG','TCAGG','GCATT']


In [5]:
# Write a function (read_map_check) that takes two sequences, one test and one reference, as input and 
# checks whether the test sequence aligns to the reference.
# The inputs should be called test_seq and ref_seq

def read_map_check(test_seq, ref_seq):
    """Report whether test_seq aligns to ref_seq.

    The check is a plain membership test (`test_seq in ref_seq`); for string
    inputs this is True exactly when test_seq occurs as a contiguous substring
    of ref_seq.

    Parameters:
        test_seq: the read to look for.
        ref_seq: the reference sequence to search in.

    Returns:
        True if test_seq is found in ref_seq, otherwise False.
    """
    return test_seq in ref_seq

In [6]:
# Now apply your function to the set of experimental reads to produce the expression counts for each gene.
# Remember that only reads mapping uniquely to a given sequence will get counted.
# Please return the values for gene_1_hits and gene_2_hits, the number of reads uniquely mapping to each gene

# Start both hit counters at zero
gene_1_hits, gene_2_hits = 0, 0

# Your code should look something like the following:
# for loop which iterates over reads in experimental_reads:
#     read_map_check for the read and gene 1
#     read_map_check for the read and gene 2
#     check uniquely maps to gene 1:
#          if so, add one to gene_1_hits
#     check uniquely maps to gene 2:
#          if so, add one to gene_2_hits

# may be helpful - Make composite Booleans with and, e.g. True and True is True, True and False is False
# Negate Booleans with not, e.g. not True is False and not False is True
# You can add x to a value y using y += x. This is equivalent to y = y+x

# Display both counts as the output of this cell
gene_1_hits, gene_2_hits

## Comparing genetic sequences

In [7]:
# A simple way to compare the similarity of genetic sequences is to compute a quantity known as the Hamming distance.
# This measures the number of mismatches between pairs of sequences; e.g., d_H('ACG', 'ATG') = 1, 
# d_H('TAG', 'TAG') = 0, etc. 

# We will now compare the differences in the same gene observed in two "populations". First, load the provided data.

# Gene sequences observed in five individuals from population 1 (45 nt each)
data_pop_1 = [
    'ATGCTAGAAAGGGCTAGTTAGACATGGAGACGGCTCCATAGCTGA',
    'ATGCTAGTAACGGCTAGTTTGTCATGGAGACTGCTCCATACCTGA',
    'ATGCTAGATACGGCTAGTTACTCATGGAGAGAGCTCCATAGCTGA',
    'ATGCTAGTTACGGCTACTTAGTCATCGAGACTGCACCATAGCTGA',
    'ATGCTAGAATCGGGTAGATAGTCATGGAGACAGCTCCATAGCTGA',
]
# Gene sequences observed in five individuals from population 2 (45 nt each)
data_pop_2 = [
    'ATGCAAGCAACGGCTTGTTTTTCATGGACACTGATCCATTGCTGA',
    'ATGCATGGAAGGGCTAGTTATTCTTGCAGACTGATCCATTGCTGA',
    'ATGCAAGCAACGGCTAGTTATTCTTGGACACTGATGCATTGCTGA',
    'ATGCAAGGAACGGCAAGTTAATCATGGTGACTCATCCATTGCTGA',
    'ATGCAAGGAAGGGCTAGATATTCGTGGAGAGTGATCGATTGCTGA',
]


In [1]:
# Write a function which takes two sequences as input and outputs the Hamming distance between them.
# As the Hamming distance is the number of differences between the sequences, we're going to iterate
# through the nucleotides of the sequences and count how many times they don't match.

# Name your function compute_hamming_dist. Have it take arguments seq_1 and seq_2.

# First, we'll want to initialize the count of differences to be 0

# Next, we need to iterate through each sequence simultaneously. We can do this with 
# a for loop from 0 to the length of the sequences and referring to the nucleotide at the corresponding
# index for each iteration

# Each time the nucleotides don't match, we should increment our counter by 1. Hint: use !=

# Finally, return the value of the counter. It's also not a bad idea to wrap this all in an 
# if/else statement which verifies that the sequences are the same length before checking the
# distance between them.


In [3]:
# Construct a matrix D in which D_{ij} = d_H(s_i, s_j) for sequences s_i and s_j.

# We want to compute the Hamming distance in the sequence of the given gene for this set of 10 individuals.
# 5 are from population 1 and 5 from population 2.

# We should iterate over all the individuals at once to get the matrix, so start by combining the lists for
# populations 1 and 2. (Can combine lists using +)

# Let's also initialize the matrix D in which we want to store the distances.
# Hint: np.zeros((len(x), len(x))) for the correct object x

# Now let's fill the entries of the matrix. 
# Use for loops to iterate through the combined data and compute the Hamming distance between
# every pair using your function.
# Hint: You probably want to iterate through the list by the numerical indices
# For the ith and jth individuals, change the value of D[i,j] to the distance between sequences i and j
# Hint: Should the matrix be symmetric? Can we also assign D[j,i] at this step?


In [5]:
# Use Multidimensional Scaling (MDS) to examine whether the individuals cluster in any noticeable way.
# Like PCA, MDS is a method to compress data into a form which can be visualized. This algorithm seeks
# to preserve the computed distances between points in the original data when embedding them into 
# two dimensions

# You don't need to edit this part
# Embed the individuals in two dimensions (n_components=2) with MDS.
# dissimilarity="precomputed" means D is passed in as an already-computed distance
# matrix rather than raw feature vectors; D is not defined in this cell and must
# already exist from the previous cell. random_state is fixed to a constant seed.
from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state = 100200300, dissimilarity="precomputed")
# presumably proj has one row per individual and one column per MDS coordinate -- TODO confirm
proj = mds.fit_transform(D)

# Now make a scatter plot of the MDS projection (proj) with the points colored by population

# Scatter of first five individuals    plt.scatter(df[rows, col1], df[rows, col2])
# Scatter of second five individuals   plt.scatter(df[rows, col1], df[rows, col2])
# xlabel of Coordinate 1               plt.xlabel('xlabel')
# ylabel of Coordinate 2               plt.ylabel('ylabel')
# title of MDS of individuals based on genetic sequences  plt.title('title')
# legend with labels 'Pop 1', and 'Pop 2' in the lower right plt.legend(['Label 1', 'Label 2'], loc = 'location')
# show plot     plt.show()


# BONUS - Obtaining protein sequences

In [None]:
# Genetic code directly maps to specific amino acids via "codons", or sets of three nucleotides in coding regions.
# We can thus construct the sequence of amino acids for each protein based purely on the genetic sequence.
# First, load the provided codon -> amino acid conversion table.

# If you're comfortable with dictionaries, you can use amino_acids_dict
# Maps each 3-nt DNA codon (key) to a one-letter amino acid code (value).
# The three stop codons (TAA, TAG, TGA) map to '_'.
amino_acids_dict = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}

# Otherwise, this constructs an array from the dictionary for you to use
# The codons (keys) and amino acids (values) are passed as two parallel lists in matching order.
# presumably make_array (from the datascience package) stacks them into a 2-row array with
# codons in row 0 and amino acids in row 1 -- TODO confirm
amino_acids_array = make_array(list(amino_acids_dict.keys()), list(amino_acids_dict.values()))

# Source: https://pythonforbiologists.com/dictionaries

In [2]:
# Write a function which takes a genetic sequence as input and writes the sequence of amino acids as a string.
# Call this function seq_to_AA and name its input seq

# check if length of seq is a multiple of three. If not, return a statement saying you can't 
# convert the sequence because of this (hint: use x % y for modular division)
# If the length is okay, split the sequence into its codons (i.e., split into units of length 3)
# Initialize your amino acids sequence as an empty string
# Iterate through these codons with a for loop and find the appropriate amino acid for each one.
# Add this to your running sequence of amino acids. 
# Hint: If you're using the array, what does amino_acids_array[1,amino_acids_array[0,:] == codon][0] give you?
# Hint: If you're using the dict, what does amino_acids_dict[codon] give you?


In [None]:
# Obtain AA sequences for gene_1 and gene_2. Name them protein_1 and protein_2
# Just run your seq_to_AA function on gene_1 and gene_2

## To submit

In [None]:
# Create the submission client from the assignment's .ok file and authenticate.
# NOTE(review): Notebook is not imported anywhere visible in this notebook -- the line
# `from client.api.notebook import Notebook` in the first cell is commented out -- so this
# cell presumably raises NameError on a fresh kernel unless that import is restored. TODO confirm.
ok = Notebook('Lab03_sequences.ok')
# The result is bound to `_` so it is not echoed as the cell's output
_ = ok.auth(inline=True)

In [None]:
# Submit the assignment.
# Uses the `ok` object created in the preceding cell; the result is bound to `_`
# so it is not echoed as the cell's output.
_ = ok.submit()