In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np


In [2]:
# Load the training sequences from PromoterTrain.csv. Passing header=0 together
# with names=[...] replaces the file's own header row with 'id' and 'sequence';
# 'id' is then used as the index.
X = pd.read_csv("./PromoterTrain.csv", header=0, names=['id', 'sequence'], index_col='id', sep=",")
# Preview the first rows.
X.head()


Unnamed: 0_level_0,sequence
id,Unnamed: 1_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...


In [3]:
# Load the label table from SigmaTrain.csv, indexed by its 'id' column.
# The preview shows 0/1 columns RPOS, RPOD, RPOH, RPON and RPOF, and a row can
# have several of them set (presumably sigma-factor labels -- TODO confirm).
y = pd.read_csv("./SigmaTrain.csv", header=0, index_col="id", sep=",")
# Preview the first rows.
y.head()

Unnamed: 0_level_0,RPOS,RPOD,RPOH,RPON,RPOF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0,0,0,0
1,0,0,0,0,0
2,1,1,1,0,0
3,0,0,0,0,0
4,1,0,0,0,0


In [4]:
def splitter(a: str, n=6):
    """Cut ``a`` into consecutive n-character tokens, once per start offset.

    For every offset ``0 .. len(a) - 1`` the string is chopped into
    non-overlapping pieces of ``n`` characters beginning at that offset. A
    trailing piece shorter than ``n`` is dropped, and offsets that yield fewer
    than two full pieces are skipped.

    Returns:
        A list of token lists, one per retained offset, in offset order.
    """
    total = len(a)
    frames = []
    for offset in range(total):
        # A piece starting at ``pos`` is complete only if it ends inside ``a``.
        tokens = [a[pos:pos + n] for pos in range(offset, total, n) if pos + n <= total]
        if len(tokens) >= 2:
            frames.append(tokens)
    return frames
    
# Demonstrate splitter on the visible prefix of the first training sequence.
splitter_example = splitter("CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG", n=6)
# Print every token list produced for the example (one per start offset).
print(splitter_example)

[['CAAACG', 'CATCAG', 'GATCAA', 'AGTGAA', 'CATCAC', 'GAAACT', 'TCTTAC'], ['AAACGC', 'ATCAGG', 'ATCAAA', 'GTGAAC', 'ATCACG', 'AAACTT', 'CTTACA'], ['AACGCA', 'TCAGGA', 'TCAAAG', 'TGAACA', 'TCACGA', 'AACTTC', 'TTACAA'], ['ACGCAT', 'CAGGAT', 'CAAAGT', 'GAACAT', 'CACGAA', 'ACTTCT', 'TACAAT'], ['CGCATC', 'AGGATC', 'AAAGTG', 'AACATC', 'ACGAAA', 'CTTCTT', 'ACAATG'], ['GCATCA', 'GGATCA', 'AAGTGA', 'ACATCA', 'CGAAAC', 'TTCTTA'], ['CATCAG', 'GATCAA', 'AGTGAA', 'CATCAC', 'GAAACT', 'TCTTAC'], ['ATCAGG', 'ATCAAA', 'GTGAAC', 'ATCACG', 'AAACTT', 'CTTACA'], ['TCAGGA', 'TCAAAG', 'TGAACA', 'TCACGA', 'AACTTC', 'TTACAA'], ['CAGGAT', 'CAAAGT', 'GAACAT', 'CACGAA', 'ACTTCT', 'TACAAT'], ['AGGATC', 'AAAGTG', 'AACATC', 'ACGAAA', 'CTTCTT', 'ACAATG'], ['GGATCA', 'AAGTGA', 'ACATCA', 'CGAAAC', 'TTCTTA'], ['GATCAA', 'AGTGAA', 'CATCAC', 'GAAACT', 'TCTTAC'], ['ATCAAA', 'GTGAAC', 'ATCACG', 'AAACTT', 'CTTACA'], ['TCAAAG', 'TGAACA', 'TCACGA', 'AACTTC', 'TTACAA'], ['CAAAGT', 'GAACAT', 'CACGAA', 'ACTTCT', 'TACAAT'], ['AAAGT

In [5]:
# Tokenise every sequence with splitter's default n=6: one list of 6-character
# tokens per start offset.
X['ngram_tokens'] = X['sequence'].map(splitter)
X.head()

Unnamed: 0_level_0,sequence,ngram_tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...,"[[CAAACG, CATCAG, GATCAA, AGTGAA, CATCAC, GAAA..."
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...,"[[CCGGTA, AACTCT, GTGGAA, AGAGCA, ATGTGA, AATC..."
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...,"[[GGAATT, TTCTCG, AGCATA, GCCAGA, GCCGCA, GAAT..."
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...,"[[TCACCA, ATACCG, CCTACG, TCTACG, CCCAGC, AGTT..."
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...,"[[GCACGG, TATCGT, GCTTGG, TAACCT, GGTAGG, ATTG..."


In [6]:
def word_embedding_feature_pairs(corpus, window_size=1):
    """Build (token, neighbour) pairs from every token list in ``corpus``.

    Each token is paired with the tokens at most ``window_size`` positions to
    its left and right within the same list. Neighbours are filtered by value,
    not by position, so a neighbour that is equal to the centre token is also
    left out.

    Args:
        corpus: iterable of token lists.
        window_size: how many positions on each side count as neighbours.

    Returns:
        A flat list of ``(token, neighbour)`` tuples, in corpus order.
    """
    collected = []
    for tokens in corpus:
        for centre_pos, centre in enumerate(tokens):
            lo = max(centre_pos - window_size, 0)
            hi = min(centre_pos + window_size, len(tokens)) + 1
            collected.extend(
                (centre, other) for other in tokens[lo:hi] if other != centre
            )
    return collected
# Demonstrate pair generation on the example token lists (default window_size=1).
features = word_embedding_feature_pairs(splitter_example)
# Print the resulting (token, neighbour) pairs.
print(features)

[('CAAACG', 'CATCAG'), ('CATCAG', 'CAAACG'), ('CATCAG', 'GATCAA'), ('GATCAA', 'CATCAG'), ('GATCAA', 'AGTGAA'), ('AGTGAA', 'GATCAA'), ('AGTGAA', 'CATCAC'), ('CATCAC', 'AGTGAA'), ('CATCAC', 'GAAACT'), ('GAAACT', 'CATCAC'), ('GAAACT', 'TCTTAC'), ('TCTTAC', 'GAAACT'), ('AAACGC', 'ATCAGG'), ('ATCAGG', 'AAACGC'), ('ATCAGG', 'ATCAAA'), ('ATCAAA', 'ATCAGG'), ('ATCAAA', 'GTGAAC'), ('GTGAAC', 'ATCAAA'), ('GTGAAC', 'ATCACG'), ('ATCACG', 'GTGAAC'), ('ATCACG', 'AAACTT'), ('AAACTT', 'ATCACG'), ('AAACTT', 'CTTACA'), ('CTTACA', 'AAACTT'), ('AACGCA', 'TCAGGA'), ('TCAGGA', 'AACGCA'), ('TCAGGA', 'TCAAAG'), ('TCAAAG', 'TCAGGA'), ('TCAAAG', 'TGAACA'), ('TGAACA', 'TCAAAG'), ('TGAACA', 'TCACGA'), ('TCACGA', 'TGAACA'), ('TCACGA', 'AACTTC'), ('AACTTC', 'TCACGA'), ('AACTTC', 'TTACAA'), ('TTACAA', 'AACTTC'), ('ACGCAT', 'CAGGAT'), ('CAGGAT', 'ACGCAT'), ('CAGGAT', 'CAAAGT'), ('CAAAGT', 'CAGGAT'), ('CAAAGT', 'GAACAT'), ('GAACAT', 'CAAAGT'), ('GAACAT', 'CACGAA'), ('CACGAA', 'GAACAT'), ('CACGAA', 'ACTTCT'), ('ACTTCT'

In [7]:
# Turn each sequence's token lists into (token, neighbour) training pairs,
# using the default window_size of word_embedding_feature_pairs.
X['word_embedding_features'] = X['ngram_tokens'].map(word_embedding_feature_pairs)
X.head()

Unnamed: 0_level_0,sequence,ngram_tokens,word_embedding_features
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...,"[[CAAACG, CATCAG, GATCAA, AGTGAA, CATCAC, GAAA...","[(CAAACG, CATCAG), (CATCAG, CAAACG), (CATCAG, ..."
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...,"[[CCGGTA, AACTCT, GTGGAA, AGAGCA, ATGTGA, AATC...","[(CCGGTA, AACTCT), (AACTCT, CCGGTA), (AACTCT, ..."
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...,"[[GGAATT, TTCTCG, AGCATA, GCCAGA, GCCGCA, GAAT...","[(GGAATT, TTCTCG), (TTCTCG, GGAATT), (TTCTCG, ..."
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...,"[[TCACCA, ATACCG, CCTACG, TCTACG, CCCAGC, AGTT...","[(TCACCA, ATACCG), (ATACCG, TCACCA), (ATACCG, ..."
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...,"[[GCACGG, TATCGT, GCTTGG, TAACCT, GGTAGG, ATTG...","[(GCACGG, TATCGT), (TATCGT, GCACGG), (TATCGT, ..."


In [8]:
# Number of (token, neighbour) pairs generated for each sequence.
X['feature_count'] = X['word_embedding_features'].map(len)
X.head()

Unnamed: 0_level_0,sequence,ngram_tokens,word_embedding_features,feature_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...,"[[CAAACG, CATCAG, GATCAA, AGTGAA, CATCAC, GAAA...","[(CAAACG, CATCAG), (CATCAG, CAAACG), (CATCAG, ...",308
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...,"[[CCGGTA, AACTCT, GTGGAA, AGAGCA, ATGTGA, AATC...","[(CCGGTA, AACTCT), (AACTCT, CCGGTA), (AACTCT, ...",308
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...,"[[GGAATT, TTCTCG, AGCATA, GCCAGA, GCCGCA, GAAT...","[(GGAATT, TTCTCG), (TTCTCG, GGAATT), (TTCTCG, ...",308
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...,"[[TCACCA, ATACCG, CCTACG, TCTACG, CCCAGC, AGTT...","[(TCACCA, ATACCG), (ATACCG, TCACCA), (ATACCG, ...",308
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...,"[[GCACGG, TATCGT, GCTTGG, TAACCT, GGTAGG, ATTG...","[(GCACGG, TATCGT), (TATCGT, GCACGG), (TATCGT, ...",308


In [9]:
# Total number of (token, neighbour) pairs across all sequences.
total_features = X.feature_count.sum()
total_features

1046240

In [10]:
def get_word_embedding_input_data(nested_lists):
    """Flatten per-sequence pair lists into a training frame and a vocabulary.

    Args:
        nested_lists: iterable whose items are iterables of
            ``(token, neighbour)`` pairs.

    Returns:
        ``(df, dictionary)`` where ``df`` has one row per pair, in input
        order, with column ``'X'`` (token) and ``'y'`` (neighbour), and
        ``dictionary`` maps every distinct token to a unique integer index in
        ``range(len(dictionary))``.
    """
    inputs = []
    targets = []
    dictionary = {}

    for pairs in nested_lists:
        for token, neighbour in pairs:
            inputs.append(token)
            targets.append(neighbour)
            # Number tokens in order of first appearance. Enumerating a set of
            # strings instead would follow hash order, which is randomised per
            # process, so the indices would change between kernel restarts.
            dictionary.setdefault(token, len(dictionary))
            dictionary.setdefault(neighbour, len(dictionary))

    df = pd.DataFrame({'X': inputs, 'y': targets})
    return df, dictionary


# Collect the pairs of all sequences into one training frame (columns 'X' and
# 'y') and build the token -> index vocabulary.
df, dictionary = get_word_embedding_input_data(X.word_embedding_features.values)

df.head()

Unnamed: 0,X,y
0,CAAACG,CATCAG
1,CATCAG,CAAACG
2,CATCAG,GATCAA
3,GATCAA,CATCAG
4,GATCAA,AGTGAA


In [11]:
# Vocabulary size (number of distinct tokens). It is the length of every
# one-hot vector and the vocabulary dimension of the weight matrices in train().
ONE_HOT_DIM = len(dictionary)
ONE_HOT_DIM

4090

In [12]:
def one_hot_encode_seq(data_point_index, dim=None):
    """Return a float vector of zeros with a 1 at ``data_point_index``.

    Args:
        data_point_index: position to set to 1.
        dim: length of the vector. When omitted, the notebook-level
            ``ONE_HOT_DIM`` is used (looked up at call time), which keeps the
            original one-argument call working.

    Returns:
        1-D numpy float array of length ``dim``.

    Raises:
        IndexError: if ``data_point_index`` is out of bounds for ``dim``.
    """
    if dim is None:
        dim = ONE_HOT_DIM
    res = np.zeros(dim)
    res[data_point_index] = 1
    return res

In [13]:
def train(X_strings, y_strings, embedding_dim=6, learning_rate=0.05,
          iterations=20000, log_every=3000, batch_size=None):
    """Learn token embeddings by predicting each token's neighbour.

    A linear hidden layer followed by a softmax over the vocabulary is fitted
    with plain gradient descent on cross-entropy loss. Relies on the
    notebook-level ``dictionary`` (token -> index) and ``ONE_HOT_DIM``
    (vocabulary size).

    Args:
        X_strings: iterable of input tokens.
        y_strings: iterable of target (neighbour) tokens, paired with
            ``X_strings`` position by position.
        embedding_dim: number of neurons in the hidden layer, i.e. the size of
            each embedding vector.
        learning_rate: gradient-descent step size.
        iterations: number of optimisation steps.
        log_every: print the loss every this many steps; 0 or None disables
            logging.
        batch_size: None (default) trains every step on all pairs. Otherwise
            each step uses this many pairs drawn at random with replacement,
            and the logged loss is that of the current batch. Use this when
            the full set of pairs does not fit in memory.

    Returns:
        numpy array of shape ``(ONE_HOT_DIM, embedding_dim)``; row ``i`` is
        ``W1[i] + b1``, the embedding of the token whose dictionary index is
        ``i``.

    Raises:
        KeyError: if a token is missing from ``dictionary``.
        ValueError: if no training pairs are supplied.
    """
    # Encode tokens as integer vocabulary indices. Feeding indices instead of
    # dense one-hot rows avoids building two (n_pairs, ONE_HOT_DIM) float
    # arrays on the host.
    pair_ids = np.asarray(
        [(dictionary[seq], dictionary[target])
         for seq, target in zip(X_strings, y_strings)],
        dtype=np.int32,
    ).reshape(-1, 2)
    if len(pair_ids) == 0:
        raise ValueError("train() needs at least one (input, target) pair")
    X_train = pair_ids[:, 0]
    Y_train = pair_ids[:, 1]

    # Build the model in its own graph so repeated calls do not keep adding
    # ops and variables to the global default graph.
    graph = tf.Graph()
    with graph.as_default():
        # placeholders for the input and target token indices
        x = tf.placeholder(tf.int32, shape=(None,))
        y_label = tf.placeholder(tf.int32, shape=(None,))

        # hidden layer: which represents the word vector eventually.
        # Looking up row x of W1 equals multiplying a one-hot row by W1.
        W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, embedding_dim]))
        b1 = tf.Variable(tf.random_normal([1]))  # bias
        hidden_layer = tf.add(tf.nn.embedding_lookup(W1, x), b1)

        # output layer (kept as logits; the softmax is folded into the loss)
        W2 = tf.Variable(tf.random_normal([embedding_dim, ONE_HOT_DIM]))
        b2 = tf.Variable(tf.random_normal([1]))
        logits = tf.add(tf.matmul(hidden_layer, W2), b2)

        # loss function: cross entropy, computed from the logits so that a
        # softmax probability underflowing to 0 cannot produce log(0).
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=y_label, logits=logits))

        # training operation
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
        init = tf.global_variables_initializer()

        # The hidden layer (W1 + b1) is the word look-up table.
        embedding_table = tf.add(W1, b1)

    # The context manager closes the session (and frees its resources) even
    # if training raises.
    with tf.Session(graph=graph) as sess:
        sess.run(init)

        feed = {x: X_train, y_label: Y_train}
        for i in range(iterations):
            if batch_size is not None:
                rows = np.random.randint(0, len(pair_ids), size=batch_size)
                feed = {x: X_train[rows], y_label: Y_train[rows]}
            # input is the token index, label is the neighbour's token index
            sess.run(train_op, feed_dict=feed)
            if log_every and i % log_every == 0:
                print('iteration '+str(i)+' loss is : ', sess.run(loss, feed_dict=feed))

        return sess.run(embedding_table)

In [None]:
# Learn the embedding table from the (token, neighbour) pairs; train() returns
# one row per vocabulary index.
# NOTE(review): this cell has no execution count and no loss lines in its
# captured output, so it does not appear to have finished -- confirm before
# relying on word_vectors.
word_vectors = train(df.X, df.y)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [None]:
# Map every token to its learned embedding row. `dictionary` maps
# token -> row index into `word_vectors`, so iterate over its items;
# enumerate(dictionary) would yield (position, token) and index the array
# with a string.
vector_dictionary = {seq: word_vectors[idx] for seq, idx in dictionary.items()}

# Show a handful of entries rather than dumping the whole vocabulary.
dict(list(vector_dictionary.items())[:5])

In [101]:
def flatten(arr):
    """Concatenate the items of every element of ``arr`` into one flat list.

    Only one level of nesting is removed; element order is preserved.
    """
    return [item for group in arr for item in group]
# Concatenate the token lists of all start offsets into one flat list per sequence.
X['tokens_flat'] = X.ngram_tokens.map(flatten)
# Replace every token with its embedding vector. The original lookup was left
# as `vector_dictionary[]`, which is a syntax error. This expects
# vector_dictionary to map token -> vector.
X['token_vectors'] = X.tokens_flat.map(
    lambda tokens: [vector_dictionary[token] for token in tokens]
)
# Concatenate the per-token vectors into a single flat feature list per sequence.
X['token_vectors_flat'] = X.token_vectors.map(flatten)
X.head()


Unnamed: 0_level_0,sequence,ngram_tokens,word_embedding_features,feature_count,tokens_flat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...,"[[CAAACG, CATCAG, GATCAA, AGTGAA, CATCAC, GAAA...","[(CAAACG, CATCAG), (CATCAG, CAAACG), (CATCAG, ...",308,"[CAAACG, CATCAG, GATCAA, AGTGAA, CATCAC, GAAAC..."
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...,"[[CCGGTA, AACTCT, GTGGAA, AGAGCA, ATGTGA, AATC...","[(CCGGTA, AACTCT), (AACTCT, CCGGTA), (AACTCT, ...",308,"[CCGGTA, AACTCT, GTGGAA, AGAGCA, ATGTGA, AATCA..."
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...,"[[GGAATT, TTCTCG, AGCATA, GCCAGA, GCCGCA, GAAT...","[(GGAATT, TTCTCG), (TTCTCG, GGAATT), (TTCTCG, ...",308,"[GGAATT, TTCTCG, AGCATA, GCCAGA, GCCGCA, GAATT..."
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...,"[[TCACCA, ATACCG, CCTACG, TCTACG, CCCAGC, AGTT...","[(TCACCA, ATACCG), (ATACCG, TCACCA), (ATACCG, ...",308,"[TCACCA, ATACCG, CCTACG, TCTACG, CCCAGC, AGTTT..."
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...,"[[GCACGG, TATCGT, GCTTGG, TAACCT, GGTAGG, ATTG...","[(GCACGG, TATCGT), (TATCGT, GCACGG), (TATCGT, ...",308,"[GCACGG, TATCGT, GCTTGG, TAACCT, GGTAGG, ATTGA..."


In [None]:
# use the sequence embeddings to predict the target class
