## Create function to count kmers 

In [151]:
import collections
import pandas as pd 
import numpy as np

In [147]:
def kmer_counter(sequence, kmer_length):
    """Count every overlapping k-mer in a sequence.

    A window of ``kmer_length`` characters is slid along ``sequence`` one
    position at a time and each window's text is tallied.

    Parameters
    ----------
    sequence : str
        Sequence to scan (a string in this notebook; any sliceable value
        whose slices are hashable behaves the same way).
    kmer_length : int
        Window size k; must be at least 1.

    Returns
    -------
    dict
        Maps each distinct k-mer to the number of windows that contain it.
        Empty when ``sequence`` is shorter than ``kmer_length``.

    Raises
    ------
    ValueError
        If ``kmer_length`` is smaller than 1. A zero or negative window
        would otherwise yield meaningless empty or wrapped-around slices.
    """
    if kmer_length < 1:
        raise ValueError(
            "kmer_length must be at least 1, got {!r}".format(kmer_length)
        )

    # range() of a non-positive number is empty, so short sequences give {}.
    number_of_kmers = len(sequence) - kmer_length + 1

    kmer_counts_dict = {}
    for start in range(number_of_kmers):
        kmer = sequence[start:start + kmer_length]
        kmer_counts_dict[kmer] = kmer_counts_dict.get(kmer, 0) + 1
    return kmer_counts_dict

# Worked example: tally the 4-mers of a short A/G test sequence.
kmer_example2 = kmer_counter(
    sequence="AAAAAAAAAGGGGGGGAAAAAAAAAGGG",
    kmer_length=4,
)

# Display the (kmer, count) pairs as a list.
list(kmer_example2.items())

[('AAAA', 12),
 ('AAAG', 2),
 ('AAGG', 2),
 ('AGGG', 2),
 ('GGGG', 4),
 ('GGGA', 1),
 ('GGAA', 1),
 ('GAAA', 1)]

#### Make a small example function to list kmers with their locations

In [126]:
def kmer_location_list(sequence, kmer_length, cut_site_offset=7):
    """Label every overlapping k-mer with its location relative to the cut site.

    The cut site is taken to be index ``len(sequence) - cut_site_offset``.
    Each k-mer's location is that index minus the index of the k-mer's last
    character, so windows ending before the cut site get positive values and
    windows ending after it get negative values.

    Parameters
    ----------
    sequence : str
        Sequence to scan.
    kmer_length : int
        Window size k.
    cut_site_offset : int, optional
        Distance of the cut site from the end of ``sequence``. Defaults to 7,
        the value that was previously hard-coded.

    Returns
    -------
    list of str
        One ``"KMER(location)"`` label per window, in left-to-right order.
        Empty when ``sequence`` is shorter than ``kmer_length``.
    """
    sequence_length = len(sequence)
    number_of_kmers = sequence_length - kmer_length + 1
    cut_site_position = sequence_length - cut_site_offset

    # Named differently from the function so the function is not shadowed.
    labelled_kmers = []

    for start in range(number_of_kmers):
        last_index = start + kmer_length - 1
        kmer = sequence[start:start + kmer_length]
        kmer_loc = cut_site_position - last_index
        labelled_kmers.append(kmer + "(" + str(kmer_loc) + ")")
    return labelled_kmers
        
        
test_locations1 = kmer_location_list("ATCTCTACGTCTGTATCATCNGG", 4)
test_locations2 = kmer_location_list("TCTGTCAGTCTAGTGTCATCNGG", 4)

# Count the labels of the first guide. This previously read `test_locations`,
# a name that no visible cell defines, so it failed on a fresh kernel.
test_locations_counter = collections.Counter(test_locations1)
test_locations_counter2 = collections.Counter(test_locations2)

# NOTE: `&` on Counters is the multiset intersection (minimum of each count),
# so despite its name `union` holds only the labels found in BOTH guides.
# The name is kept in case later cells refer to it.
union = test_locations_counter & test_locations_counter2
union

Counter({'ATCN(-4)': 1,
         'CATC(-3)': 1,
         'CNGG(-6)': 1,
         'TCAT(-2)': 1,
         'TCNG(-5)': 1})

### Make a dictionary mapping each VO guide name to its sequence


In [131]:
# Load the raw guide table.
VO_2016_guides = pd.read_csv("VO_guides_2016.csv")

# Discard the rows at positions 96 through 1474 and the genomic-location
# column in a single pass, leaving the raw frame untouched.
VO_2016_guides_cleaned = VO_2016_guides.drop(
    index=VO_2016_guides.index[96:1475],
    columns="Genomic location of spacer (hg19)",
)
VO_2016_guides_cleaned.head()

Unnamed: 0,Spacer,Spacer sequence
0,1.0,CCCTCAAGAGTGACTACATCAGC
1,2.0,AGTGTGCATTGCCACCTCAGTGG
2,3.0,CCGGGCATCTGCAGCCTGCATCT
3,4.0,CCTGGGAGCCGCCGCCGAGGGCC
4,5.0,CCATCTATAACAACATGTTCTGT


In [138]:
# Pull the guide names and sequences out of the cleaned table as plain lists.
spacer_name = list(VO_2016_guides_cleaned["Spacer"])
spacer_seq = list(VO_2016_guides_cleaned["Spacer sequence"])

# Pair them up position by position: guide name -> guide sequence.
VO_Guide_dictionary = {
    name: seq for name, seq in zip(spacer_name, spacer_seq)
}


### Make a dictionary of VO guide kmers 
with key=(guide_name,seq) value=kmer count dictionary 

In [139]:
# 4-mer tallies for every guide, keyed by the (guide id, guide sequence) pair.
VO_2016_kmers_fulldict = dict()

for seq_id in VO_Guide_dictionary:
    sequence = VO_Guide_dictionary[seq_id]
    seq_kmer = kmer_counter(sequence, 4)
    VO_2016_kmers_fulldict[seq_id, sequence] = seq_kmer

print(VO_2016_kmers_fulldict)

{(1.0, 'CCCTCAAGAGTGACTACATCAGC'): {'CCCT': 1, 'CCTC': 1, 'CTCA': 1, 'TCAA': 1, 'CAAG': 1, 'AAGA': 1, 'AGAG': 1, 'GAGT': 1, 'AGTG': 1, 'GTGA': 1, 'TGAC': 1, 'GACT': 1, 'ACTA': 1, 'CTAC': 1, 'TACA': 1, 'ACAT': 1, 'CATC': 1, 'ATCA': 1, 'TCAG': 1, 'CAGC': 1}, (2.0, 'AGTGTGCATTGCCACCTCAGTGG'): {'AGTG': 2, 'GTGT': 1, 'TGTG': 1, 'GTGC': 1, 'TGCA': 1, 'GCAT': 1, 'CATT': 1, 'ATTG': 1, 'TTGC': 1, 'TGCC': 1, 'GCCA': 1, 'CCAC': 1, 'CACC': 1, 'ACCT': 1, 'CCTC': 1, 'CTCA': 1, 'TCAG': 1, 'CAGT': 1, 'GTGG': 1}, (3.0, 'CCGGGCATCTGCAGCCTGCATCT'): {'CCGG': 1, 'CGGG': 1, 'GGGC': 1, 'GGCA': 1, 'GCAT': 2, 'CATC': 2, 'ATCT': 2, 'TCTG': 1, 'CTGC': 2, 'TGCA': 2, 'GCAG': 1, 'CAGC': 1, 'AGCC': 1, 'GCCT': 1, 'CCTG': 1}, (4.0, 'CCTGGGAGCCGCCGCCGAGGGCC'): {'CCTG': 1, 'CTGG': 1, 'TGGG': 1, 'GGGA': 1, 'GGAG': 1, 'GAGC': 1, 'AGCC': 1, 'GCCG': 3, 'CCGC': 2, 'CGCC': 2, 'CCGA': 1, 'CGAG': 1, 'GAGG': 1, 'AGGG': 1, 'GGGC': 1, 'GGCC': 1}, (5.0, 'CCATCTATAACAACATGTTCTGT'): {'CCAT': 1, 'CATC': 1, 'ATCT': 1, 'TCTA': 1, 'CTAT'

### Make dictionary of just guide name and kmers 
So we can make a df where col=kmers and rows=guide id.

Trouble here: I first made a dictionary of dictionaries where the key is the seq id and the value is the dict of kmer counts, but pd.DataFrame would not work on a dictionary of dictionaries.

I next tried to make it one dictionary with the guide names as keys and a list of (kmer, count) tuples as each value. This won't turn into a df because the lists are different lengths.

In [169]:
# Guide id -> list of (kmer, count) tuples, one tuple per distinct 4-mer.
VO_2016_kmers_dict_for_df = dict()

for seq_id in VO_Guide_dictionary:
    sequence = VO_Guide_dictionary[seq_id]
    seq_kmer = kmer_counter(sequence, 4)
    seq_kmer_items = [(kmer, count) for kmer, count in seq_kmer.items()]
    VO_2016_kmers_dict_for_df[seq_id] = seq_kmer_items

print(VO_2016_kmers_dict_for_df)

{1.0: [('CCCT', 1), ('CCTC', 1), ('CTCA', 1), ('TCAA', 1), ('CAAG', 1), ('AAGA', 1), ('AGAG', 1), ('GAGT', 1), ('AGTG', 1), ('GTGA', 1), ('TGAC', 1), ('GACT', 1), ('ACTA', 1), ('CTAC', 1), ('TACA', 1), ('ACAT', 1), ('CATC', 1), ('ATCA', 1), ('TCAG', 1), ('CAGC', 1)], 2.0: [('AGTG', 2), ('GTGT', 1), ('TGTG', 1), ('GTGC', 1), ('TGCA', 1), ('GCAT', 1), ('CATT', 1), ('ATTG', 1), ('TTGC', 1), ('TGCC', 1), ('GCCA', 1), ('CCAC', 1), ('CACC', 1), ('ACCT', 1), ('CCTC', 1), ('CTCA', 1), ('TCAG', 1), ('CAGT', 1), ('GTGG', 1)], 3.0: [('CCGG', 1), ('CGGG', 1), ('GGGC', 1), ('GGCA', 1), ('GCAT', 2), ('CATC', 2), ('ATCT', 2), ('TCTG', 1), ('CTGC', 2), ('TGCA', 2), ('GCAG', 1), ('CAGC', 1), ('AGCC', 1), ('GCCT', 1), ('CCTG', 1)], 4.0: [('CCTG', 1), ('CTGG', 1), ('TGGG', 1), ('GGGA', 1), ('GGAG', 1), ('GAGC', 1), ('AGCC', 1), ('GCCG', 3), ('CCGC', 2), ('CGCC', 2), ('CCGA', 1), ('CGAG', 1), ('GAGG', 1), ('AGGG', 1), ('GGGC', 1), ('GGCC', 1)], 5.0: [('CCAT', 1), ('CATC', 1), ('ATCT', 1), ('TCTA', 1), ('C

In [173]:
# Build a frame with one row per guide id: orient="index" turns the dictionary
# keys into the row index. Each cell holds one (kmer, count) tuple from that
# guide's list, and the columns are just list positions (0, 1, 2, ...).
# Guides with fewer distinct kmers end up with missing values on the right.
x=pd.DataFrame.from_dict(VO_2016_kmers_dict_for_df, orient="index")
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
1.0,"(CCCT, 1)","(CCTC, 1)","(CTCA, 1)","(TCAA, 1)","(CAAG, 1)","(AAGA, 1)","(AGAG, 1)","(GAGT, 1)","(AGTG, 1)","(GTGA, 1)","(TGAC, 1)","(GACT, 1)","(ACTA, 1)","(CTAC, 1)","(TACA, 1)","(ACAT, 1)","(CATC, 1)","(ATCA, 1)","(TCAG, 1)","(CAGC, 1)"
2.0,"(AGTG, 2)","(GTGT, 1)","(TGTG, 1)","(GTGC, 1)","(TGCA, 1)","(GCAT, 1)","(CATT, 1)","(ATTG, 1)","(TTGC, 1)","(TGCC, 1)","(GCCA, 1)","(CCAC, 1)","(CACC, 1)","(ACCT, 1)","(CCTC, 1)","(CTCA, 1)","(TCAG, 1)","(CAGT, 1)","(GTGG, 1)",
3.0,"(CCGG, 1)","(CGGG, 1)","(GGGC, 1)","(GGCA, 1)","(GCAT, 2)","(CATC, 2)","(ATCT, 2)","(TCTG, 1)","(CTGC, 2)","(TGCA, 2)","(GCAG, 1)","(CAGC, 1)","(AGCC, 1)","(GCCT, 1)","(CCTG, 1)",,,,,
4.0,"(CCTG, 1)","(CTGG, 1)","(TGGG, 1)","(GGGA, 1)","(GGAG, 1)","(GAGC, 1)","(AGCC, 1)","(GCCG, 3)","(CCGC, 2)","(CGCC, 2)","(CCGA, 1)","(CGAG, 1)","(GAGG, 1)","(AGGG, 1)","(GGGC, 1)","(GGCC, 1)",,,,
5.0,"(CCAT, 1)","(CATC, 1)","(ATCT, 1)","(TCTA, 1)","(CTAT, 1)","(TATA, 1)","(ATAA, 1)","(TAAC, 1)","(AACA, 2)","(ACAA, 1)","(CAAC, 1)","(ACAT, 1)","(CATG, 1)","(ATGT, 1)","(TGTT, 1)","(GTTC, 1)","(TTCT, 1)","(TCTG, 1)","(CTGT, 1)",
6.0,"(CCGA, 1)","(CGAG, 1)","(GAGG, 1)","(AGGA, 1)","(GGAG, 1)","(GAGC, 1)","(AGCT, 1)","(GCTT, 1)","(CTTT, 1)","(TTTC, 1)","(TTCC, 1)","(TCCA, 1)","(CCAG, 1)","(CAGA, 1)","(AGAA, 1)","(GAAT, 1)","(AATC, 1)","(ATCT, 1)","(TCTG, 1)","(CTGT, 1)"
7.0,"(CCTT, 1)","(CTTA, 1)","(TTAG, 1)","(TAGA, 1)","(AGAG, 1)","(GAGT, 1)","(AGTT, 1)","(GTTC, 1)","(TTCC, 1)","(TCCA, 1)","(CCAC, 1)","(CACT, 1)","(ACTT, 1)","(CTTG, 1)","(TTGT, 1)","(TGTT, 1)","(GTTG, 1)","(TTGA, 1)","(TGAC, 1)","(GACC, 1)"
8.0,"(CCCA, 1)","(CCAA, 1)","(CAAG, 1)","(AAGA, 1)","(AGAG, 1)","(GAGC, 1)","(AGCC, 1)","(GCCC, 1)","(CCCC, 1)","(CCCT, 1)","(CCTG, 1)","(CTGA, 1)","(TGAA, 1)","(GAAA, 1)","(AAAT, 1)","(AATA, 1)","(ATAC, 1)","(TACT, 1)","(ACTC, 1)","(CTCC, 1)"
9.0,"(CCCA, 1)","(CCAC, 1)","(CACT, 1)","(ACTA, 1)","(CTAG, 1)","(TAGC, 1)","(AGCA, 2)","(GCAG, 1)","(CAGC, 2)","(GCAC, 1)","(CACA, 1)","(ACAG, 1)","(AGCC, 1)","(GCCC, 1)","(CCCT, 1)","(CCTC, 1)","(CTCC, 1)","(TCCC, 1)",,
10.0,"(CCAC, 1)","(CACA, 1)","(ACAG, 1)","(CAGA, 1)","(AGAT, 1)","(GATA, 1)","(ATAT, 1)","(TATA, 1)","(ATAA, 1)","(TAAT, 1)","(AATT, 1)","(ATTT, 1)","(TTTC, 1)","(TTCA, 1)","(TCAA, 1)","(CAAG, 1)","(AAGT, 1)","(AGTT, 1)","(GTTG, 1)","(TTGC, 1)"
