# <center>All I want is a pairwise sequence-identity matrix, to find sequences that are either really similar or really different</center>

In [48]:
import pylev
import Levenshtein
import tqdm
import matplotlib.pyplot as plt
import numpy as np
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
# Input FASTA files: the aligned sequences (MSA) and the unaligned sequence dump.
aligned_sequences_file_name = "thioviridamide_msa.txt"
unaligned_sequences_file_name = "thioviridamide_seqdump.txt"

In [70]:
def get_sequences(file_name):
    """Read every FASTA record in ``file_name`` and return them as a list.

    ``SeqIO.parse`` hands back an iterable; collecting it into a list lets
    callers index, slice and take ``len()`` of the records.
    """
    return list(SeqIO.parse(file_name, "fasta"))

def print_alignment(seq1, seq2, match=2, mismatch=-1, gap_open=-3, gap_extend=-0.1):
    """Print the first global pairwise alignment of ``seq1`` and ``seq2``.

    The alignment is computed with ``pairwise2.align.globalms`` and only the
    first alignment it returns is printed (via ``format_alignment``).

    Args:
        seq1: First sequence to align.
        seq2: Second sequence to align.
        match: Score for identical characters.
        mismatch: Score for non-identical characters.
        gap_open: Score for opening a gap. Author's note: a value of -5
            seemed to work well at preventing gaps from forming; the default
            stays at -3, the value this notebook actually ran with.
        gap_extend: Score for extending an existing gap.

    Raises:
        IndexError: if ``globalms`` returns no alignment.
    """
    alignments = pairwise2.align.globalms(
        seq1, seq2, match, mismatch, gap_open, gap_extend
    )
    print(format_alignment(*alignments[0]))
  

In [76]:
# Load both input files into lists of records.
aligned_sequences = get_sequences(aligned_sequences_file_name)
unaligned_sequences = get_sequences(unaligned_sequences_file_name)

# Sanity check: globally align the first two unaligned sequences and print the result.
print_alignment(unaligned_sequences[0].seq,unaligned_sequences[1].seq)

M-------KLRNS-------VPKVAGGGPHR----ERTTEETWRTIQPYLRRV-------GVTRVADITGLDRIGIPVYNAIV----PKS-SDLISVYNGKG----ASHLDAKTSAVME------AVER------FA-----AWQPRTPDVVGSVDDLRRDGIRVVHPDSINIERFKQYRD--------HFPISWVMGTELISGEDIAVP-----------------QYLAGYYQSFHETPPFPICTTNGIASGNSVEEATCHALCELIERDDWTMAEVISNRL--------------S-----------RAVTKGTVAPGIPETVEQWFMERNRSIDQET----LPAPHRRLIERYRAANLSVELKSIMSHNGIPSFLC--VVSEDLGPTFS-RSHQGLGTHPDRDVAALRALSEAAQSRVVDIQAMRED---------------------I---------SLPDEDVPKYMLHIKRSAAFNPQAWANYRTQRQTDFQSLPTYLSADVMEDTR--------------RMIRNLQATGIEEVAVV-DL------SPKWLPVSVVRVVVPGIESWA--------IDRGRLGFRAAAVWEENLGLLRDAL----AE-AAHRQEALR
|       |...|       |      |.||    |||           |.||       |.||||..||||.|||||    |    |.| |  .||..|||    |    ||.|.|||      | ||      ||     .|   |..|| .||.|.|                             |.||.|..|..|..|....||                 ..|||   |            ||.||||...||..||..||.|||..|     ..||              |           |..|.| ||.|..||               |    |||                           

In [77]:
# Write the first 10 unaligned records to a small FASTA file.
# The value displayed below the cell (10) is what SeqIO.write returned.
file_location = "first_10_sequences_thioviridamide.txt"
SeqIO.write(unaligned_sequences[:10], file_location, "fasta")

10

In [80]:
# Compare the number of sequences with the number of distinct sequence strings;
# equal counts mean the input contains no exact duplicates.
content = [str(seq_obj.seq) for seq_obj in unaligned_sequences]
len(content), len(set(content))

(3789, 3789)

In [1]:
from snapy import MinHash, LSH

In [72]:
# Show the first unaligned sequence as a plain string.
str(unaligned_sequences[0].seq)

'MKLRNSVPKVAGGGPHRERTTEETWRTIQPYLRRVGVTRVADITGLDRIGIPVYNAIVPKSSDLISVYNGKGASHLDAKTSAVMEAVERFAAWQPRTPDVVGSVDDLRRDGIRVVHPDSINIERFKQYRDHFPISWVMGTELISGEDIAVPQYLAGYYQSFHETPPFPICTTNGIASGNSVEEATCHALCELIERDDWTMAEVISNRLSRAVTKGTVAPGIPETVEQWFMERNRSIDQETLPAPHRRLIERYRAANLSVELKSIMSHNGIPSFLCVVSEDLGPTFSRSHQGLGTHPDRDVAALRALSEAAQSRVVDIQAMREDISLPDEDVPKYMLHIKRSAAFNPQAWANYRTQRQTDFQSLPTYLSADVMEDTRRMIRNLQATGIEEVAVVDLSPKWLPVSVVRVVVPGIESWAIDRGRLGFRAAAVWEENLGLLRDALAEAAHRQEALR'

In [74]:
# Show the first aligned sequence as a plain string; the recorded output is mostly '-' characters.
str(aligned_sequences[0].seq)

'-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MKLRN----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------SVP-KVA------------------------------------------------------

In [57]:
# Only the first 200 unaligned sequences are hashed here.
# NOTE(review): this rebinds `content`, which an earlier cell set to the full sequence list.
content = [str(seq_obj.seq) for seq_obj in unaligned_sequences[:200]]

# Integer labels 0 .. len(content)-1, one per sequence, in input order.
labels = range(len(content))
seed = 3


# Create MinHash object.
# n_gram=9 with n_gram_type='char': presumably 9-character shingles -- confirm against the snapy docs.
minhash = MinHash(content, n_gram=9, n_gram_type='char', permutations=100, hash_bits=64, seed=seed)


# Create LSH model.
lsh = LSH(minhash, labels, no_of_bands=50)


# Query to find near duplicates for text 1.
# The recorded output, [0], is the list returned for label 1 with min_jaccard=0.1.
print(lsh.query(1, min_jaccard=0.1))

[0]


In [83]:
def distout_parser(distout_file):
    """Parse a hat2-style pairwise distance file into a pair -> identity dict.

    The layout this parser relies on:

    * line 2 holds the number of sequences ``n``;
    * lines 4 .. 3+n each hold one ``<number>. =<header>`` entry;
    * every later line holds whitespace-separated distances, consumed in the
      order (1st, 2nd), (1st, 3rd), ..., (2nd, 3rd), ... of the header entries.

    Args:
        distout_file: Path of the distance file to read.

    Returns:
        A dict mapping ``tuple(sorted((header_a, header_b)))`` to
        ``(1 - distance, 0)``. The first element stands in for the sequence
        identity; the second is always 0. An empty dict is returned when the
        file cannot be opened or when the number of distances read is not
        ``n * (n - 1) / 2``.

    Raises:
        ValueError: if the sequence count or a sequence number cannot be
            converted to an integer.
    """
    try:
        hat2_handle = open(distout_file, 'r')
    except IOError:
        return {}

    seqsdict = {}
    distances = []  # expected length: numberof_seqs * (numberof_seqs - 1) / 2
    numberof_seqs = 0

    # The context manager guarantees the handle is closed, even on a parse error.
    with hat2_handle:
        for linecounter, line in enumerate(hat2_handle, start=1):
            if linecounter == 2:  # contains number of sequences
                numberof_seqs = int(line.replace(" ", "").strip())

            elif 4 <= linecounter <= 3 + numberof_seqs:
                # Split on the FIRST '=' only, so a header that itself
                # contains '=' is kept intact.
                number_field, _, header = line.partition("=")
                try:
                    seq_number = int(number_field.replace(" ", "").replace(".", ""))
                except ValueError as err:
                    # int() raises ValueError; re-raise with the file name so
                    # the failing input is identifiable.
                    raise ValueError(
                        "something went wrong during the import of distout file: "
                        + str(distout_file)
                    ) from err
                seqsdict[seq_number] = header.strip()

            elif linecounter > 3 + numberof_seqs:
                # split() without an argument collapses runs of whitespace and
                # yields nothing for blank lines, so no empty tokens are counted.
                distances += line.split()

    if len(distances) != numberof_seqs * (numberof_seqs - 1) // 2:
        print("something went horribly wrong in importing the distance matrix")
        return {}
    print("distance matrix imported correctly")

    # Headers in file order; pairs are generated in the same order in which
    # the distances were read.
    headers = list(seqsdict.values())
    header_pairs = (
        (headers[i], headers[j])
        for i in range(len(headers))
        for j in range(i + 1, len(headers))
    )

    ##    { ('specific_domain_name_1',
    ##    'specific_domain_name_2'): (sequence_identity, alignment_length), ... }
    # 1-distance is a representation of the sequence_identity
    domain_pairs_dict = {}
    for (first, second), distance in zip(header_pairs, distances):
        domain_pairs_dict[tuple(sorted((first, second)))] = (1 - float(distance), 0)

    return domain_pairs_dict


In [89]:
# Parse the precomputed pairwise distance file into {(header_a, header_b): (1 - distance, 0)}.
pairs = distout_parser('../processed_sequences/unique_unaligned_sequences.txt.hat2')

distance matrix imported correctly


In [90]:
for 

{('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'WP_129307703.1 YcaO-like family protein [Streptomyces sp. L2]'): (0.935, 0),
 ('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'WP_184984777.1 YcaO-like family protein [Streptomyces caelestis]'): (0.11499999999999999,
  0),
 ('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'WP_031090932.1 MULTISPECIES: YcaO-like family protein [unclassified Streptomyces]'): (0.10399999999999998,
  0),
 ('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'WP_065848698.1 YcaO-like family protein [Streptomyces mutomycini]'): (0.10399999999999998,
  0),
 ('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'WP_044449639.1 YcaO-like family protein [Mastigocladus laminosus]'): (0.09499999999999997,
  0),
 ('BAN83923.1 hypothetical protein [Streptomyces olivoviridis]',
  'BBC15204.1 YcaO-like protein [Streptomyces sp.]'): (0.124, 0),
 ('BAN83923.1 hypothetical protein [Streptomyces 