## Python Core Demo

How to use the python core from a Jupyter notebook. It also shows how to debug the JSON application configs which are used to connect to external database(s) and redis server(s).


In [104]:
import redis
import time 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import base64
import pickle
from collections import defaultdict
rDB = redis.Redis(host='localhost', port=6379, db=0)
def store_numpy(name,mat):
    """
      Creates a matrix from a numpy matrix
    """
    if len(mat.shape) != 2:
        raise BaseException('Shape of input matrix must be of size 2')
    rows,cols = mat.shape
    array_dtype = str(mat.dtype)
    m = mat.ravel().tostring()
    key = '{0}|{1}#{2}#{3}'.format(int(time.time()), array_dtype, rows, cols)
    rDB.set(name,m)
    return key

def get_numpy(name,key):
    d_mat = rDB.get(name)
    array_dtype, row, col = key.split('|')[1].split('#')
    return np.fromstring(d_mat, dtype=array_dtype).reshape(int(row), int(col))
def set_builder(lst):
    output = []
    for l in lst:
        output+=list(set(l))
    return list(set(output))
pipe = rDB.pipeline()
keys = ['doc_by_vocab_data']
[pipe.get(k) for k in keys]
result = pipe.execute()
data = {k:result[i] for i,k in enumerate(keys)}
key = data['doc_by_vocab_data']
doc_by_vocab = get_numpy('doc_by_vocab',key)
print(doc_by_vocab.shape)

(5210, 4676)


In [105]:
doc_by_vocab.dtype

dtype('float64')

In [106]:
vocab_to_index = pickle.loads(rDB.get('vocab_to_index'))
index_to_vocab = pickle.loads(rDB.get('index_to_vocab'))
episode_to_index = pickle.loads(rDB.get('e_t_to_i'))
index_to_episode = pickle.loads(rDB.get('e_i_to_t'))
series_to_index = pickle.loads(rDB.get('s_t_to_i'))
index_to_series = pickle.loads(rDB.get('s_i_to_t'))
series_episodes = pickle.loads(rDB.get('series_episodes'))
episode_tags = pickle.loads(rDB.get('e_tags'))

In [107]:
n_users = 1

In [108]:
index_to_tag = set_builder(episode_tags.values())
n_tags = len(index_to_tag)
tag_to_index = {t:i for i,t in enumerate(index_to_tag)}
episode_tag_matrix = np.zeros([len(index_to_episode),n_tags])
for i in xrange(n_tags):
    for tag in episode_tags[index_to_episode[i]]:
        episode_tag_matrix[i][tag_to_index[tag]] = 1

In [136]:
from keras.layers import Dense, Input, Embedding, Flatten, SimpleRNN
from keras.models import Model
from keras.layers.merge import dot
from keras.losses import mean_squared_error
user_in = Input(shape=(1,), dtype = 'int32')
true_rating_in = Input((1,), dtype = 'float32')
meta_features_in = Input(shape=(doc_by_vocab.shape[1],), dtype = 'float32')
user_embeddings = Embedding(n_users, 8)
user_encoded = Flatten()(user_embeddings(user_in))
episode_encoded = Dense(8)(meta_features_in)
prediction = dot([user_encoded, episode_encoded], axes = 1)
model = Model(inputs = [user_in, meta_features_in,true_rating_in],
                     outputs = prediction)
model.compile(loss = 'mse', optimizer = 'adam')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_148 (InputLayer)           (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_88 (Embedding)         (None, 1, 8)          8                                            
____________________________________________________________________________________________________
input_150 (InputLayer)           (None, 4676)          0                                            
____________________________________________________________________________________________________
flatten_83 (Flatten)             (None, 8)             0                                            
___________________________________________________________________________________________

In [137]:
ratings = np.zeros((doc_by_vocab.shape[0],n_users))
series_name = index_to_series[437]
series_name2 = index_to_series[139]
for i in [episode_to_index[e] for e in series_episodes[series_name]]:
    ratings[i] = 1.0
for i in [episode_to_index[e] for e in series_episodes[series_name2]]:
    ratings[i] = 1.0
# for i in [k for k,v in episode_tags.iteritems() if ('comedy' in v) or ('humor' in v)]:
#     ratings[i] = 5.0
users = np.zeros((doc_by_vocab.shape[0],1))
labels = np.zeros((doc_by_vocab.shape[0],1))

In [140]:
model.fit([users,doc_by_vocab,ratings],labels,epochs=10,verbose=None)

<keras.callbacks.History at 0x11b8f3c50>

In [141]:
[index_to_episode[e_i] for e_i in np.array([model.predict([np.array([0]),doc_by_vocab[v].reshape(1,4676),np.array([1.])], batch_size=32, verbose=2)[0][0] for v in range(doc_by_vocab.shape[0])]).argsort()][:10]

['6 - Stay Sixy',
 'MFM Minisode 12',
 "Gay Pimpin' with Jonny McGovern, Bonus Video - 7/21/2006",
 'Harriet Tubman is our Auntie',
 '27 - Your Hometown Murder Email Round-Up',
 'EP13: Horrible Bosses',
 '35: Maneuver Warfare, Being a Rebel, Disarming w/ Words, Sliding into Old Habits',
 'Italian Fashion Brand Doing $12m/Year In Transaction Volume EP 227',
 'Episode 22: Youve Got Questions, Weve Got Answers',
 '7 Reasons Your Website Isnt Working']

In [113]:
# from keras.layers import Dense, Input, Embedding, Flatten, Dropout, Bidirectional, GRU, LSTM
# from keras.models import Model
# from keras.layers.merge import dot
# from keras.layers.pooling import AveragePooling1D
# from keras.losses import mean_squared_error
# from keras.regularizers import l2 as l2reg

# user_in = Input(shape=(1,), dtype = 'int32')
# true_rating_in = Input((1,), dtype = 'float32')
# meta_features_in = Input(shape=(doc_by_vocab.shape[1],), dtype = 'float32')
##########
# user_embeddings = Embedding(n_users, 5)
# user_encoded = Flatten()(user_embeddings(user_in))
##########
# episode_embeddings = Embedding(doc_by_vocab.shape[1],200,embeddings_regularizer=l2reg(0.01))
# episode_encoded = Flatten()(episode_embeddings(meta_features_in))
# print(episode_encoded.get_shape)
# episode_first_dropout = Dropout(0.1)(episode_encoded)
# print(episode_first_dropout.get_shape)
# episode_post_first_rnn = GRU(400, input_shape=(doc_by_vocab.shape[1],200))(episode_first_dropout)
# episode_post_first_rnn = Bidirectional(GRU(400))(episode_first_dropout)
# episode_second_dropout = Dropout(0.5)(episode_post_first_rnn)
# episode_post_second_rnn = GRU(units=200)(episode_second_dropout)
# episode_third_dropout = Dropout(0.3)(episode_post_second_rnn)
# episode_encoded_final = AveragePooling1D()(episode_third_dropout)
# episode_final = Dense(5)(episode_first_dropout)
# prediction = dot([user_encoded, episode_final], axes = 1)
# model = Model(inputs = [user_in, meta_features_in,true_rating_in],
#                      outputs = prediction)
# model.compile(loss = 'mse', optimizer = 'adam')
# model.summary()