In [2]:
import pandas as pd
import tensorflow as tf

# Calculate Genre one hot encoding

In [10]:
# helper functions for Genre one hot encoding

def preprocess_genre_data(df, keep_name_col=False, name_col=None, genre_col=None):
  """
  Selects the name and genre columns, drops rows with a missing value in
  either of them and turns the pipe-separated genre string into a
  space-separated one (e.g. "drama|family" -> "drama family").

  The input frame is left untouched: a new frame is built instead of
  adding, dropping and renaming columns in place.

  :param df: pandas.DataFrame containing at least the name and genre columns
  :param keep_name_col: if True, return a DataFrame with both columns,
    otherwise return only the formatted genre values
  :param name_col: name of the title column; falls back to the module-level
    NAME_COL when None
  :param genre_col: name of the genre column; falls back to the module-level
    GENRE_COL when None

  :return pandas.DataFrame with columns [name_col, genre_col] if
    `keep_name_col` is True, else a numpy array of formatted genre strings
  """
  # Resolve the defaults at call time: the constants live in a later cell,
  # so they cannot be used as default argument values here.
  if name_col is None:
    name_col = NAME_COL
  if genre_col is None:
    genre_col = GENRE_COL

  cleaned = df[[name_col, genre_col]].dropna()
  formatted_genres = cleaned[genre_col].apply(lambda genres: genres.replace("|", " "))
  # assign() replaces the genre column in its original position on a new frame.
  cleaned = cleaned.assign(**{genre_col: formatted_genres})

  if keep_name_col:
    return cleaned
  return cleaned[genre_col].values

def load_genre_training_data(training_data_path, keep_name_col=False):
  """
  Reads the training file and passes it through `preprocess_genre_data`.

  The file is parsed as tab-separated values.

  :param training_data_path: path of the tab-separated training file
  :param keep_name_col: forwarded unchanged to `preprocess_genre_data`

  :return whatever `preprocess_genre_data` returns for the loaded frame
  """
  return preprocess_genre_data(pd.read_csv(training_data_path, sep='\t'), keep_name_col)

def fit_genre_vectorizer_layer(layer, training_data_path):
  """
  Adapts `layer` to the genre strings loaded from the training data.

  :param layer: vectorization layer exposing `adapt` and `get_vocabulary`
  :param training_data_path: path of the training file, as accepted by
    `load_genre_training_data`

  :return tuple (the adapted layer, the vocabulary reported by the layer)
  """
  genre_strings = load_genre_training_data(training_data_path)
  layer.adapt(genre_strings)
  fitted_vocab = layer.get_vocabulary()
  return layer, fitted_vocab

def load_genre_vectorizer_layer(layer, vocab):
  """
  Sets `vocab` (minus its first entry) as the vocabulary of `layer`.

  :param layer: tf.keras.layers.experimental.preprocessing.TextVectorization
  :param vocab: List of strings (vocabulary elements); the first element is
    treated as the OOV token and is not passed to the layer

  :return the same `layer` instance, with its vocabulary set
  """
  layer.set_vocabulary(vocab[1:])  # index 0 is OOV token
  # Return the layer as the docstring promises; existing callers that ignore
  # the return value keep working.
  return layer
  
def read_genre_vocab(vocab_path):
  """
  Reads vocabulary for genre one hot encoder from CSV file.
  CSV file must have no header.
  Each element of vocabulary must be on its own line.

  :param vocab_path: path of the vocabulary CSV file

  :return List of strings, one per line of the file

  :raises FileNotFoundError: if `vocab_path` does not exist
  :raises ValueError: if the file exists but cannot be parsed (e.g. it is empty)
  """
  retrain_hint = ("please retrain the genre model with "
                  "get_genre_one_hot_encoder_model(train=True, write_vocab=True)")
  try:
    # dtype=str / keep_default_na=False keep every token as the literal string
    # that was written, so tokens such as "null" or "1" are not coerced.
    vocab_frame = pd.read_csv(vocab_path, header=None, dtype=str, keep_default_na=False)
  except FileNotFoundError as err:
    # A missing file is not a ValueError, so it needs its own handler.
    raise FileNotFoundError(
        "Vocabulary file {!r} not found, {}".format(vocab_path, retrain_hint)) from err
  except ValueError as err:
    # Fail loudly instead of printing and then returning an unbound variable.
    raise ValueError(
        "Could not read vocabulary from {!r}, {}".format(vocab_path, retrain_hint)) from err
  return vocab_frame[0].values.tolist()

def get_genre_one_hot_encoder_model(train=False, write_vocab=False):
  """
  Builds the genre one hot encoder model: a string input followed by a
  binary-output text vectorization layer.

  :param train: if True, fit the vectorizer on GENRE_TRAINING_DATA_PATH;
    if False, load the vocabulary stored at VOCAB_PATH instead
  :param write_vocab: only used when `train` is True; if True, the fitted
    vocabulary is written to VOCAB_PATH (no header, no index)

  :return tf.keras.models.Sequential wrapping the vectorizer
  """
  vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(output_mode='binary')

  if not train:
    # Re-use the vocabulary persisted by an earlier training run.
    stored_vocab = read_genre_vocab(VOCAB_PATH)
    load_genre_vectorizer_layer(vectorizer, stored_vocab)
  else:
    vectorizer, fitted_vocab = fit_genre_vectorizer_layer(vectorizer, GENRE_TRAINING_DATA_PATH)
    if write_vocab:
      vocab_frame = pd.DataFrame(fitted_vocab)
      vocab_frame.to_csv(VOCAB_PATH, index=False, header=False)

  encoder = tf.keras.models.Sequential()
  encoder.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  encoder.add(vectorizer)
  return encoder


In [11]:
# ALL CONSTANTS
# File locations and column names shared by the helper functions in this
# notebook. The paths presumably point into a mounted Google Drive — confirm
# the mount point before running elsewhere.

# CSV with the fitted vocabulary, one token per line, no header.
VOCAB_PATH = '/content/drive/MyDrive/drama/resources/genre_one_hot_encoder_vocab.csv'
# CSV lookup table of one hot genre vectors, indexed by NAME_COL.
GENRE_ONE_HOT_LOOKUP_PATH = '/content/drive/MyDrive/drama/resources/genre_one_hot_lookup.csv'
# Training data; read with sep='\t' despite the .csv extension.
# NOTE(review): this path uses "My Drive" while the two above use "MyDrive" —
# confirm both spellings resolve in the target environment.
GENRE_TRAINING_DATA_PATH = "/content/drive/My Drive/drama/clean_fields.csv"
# Column holding the pipe-separated genre string.
GENRE_COL = 'genre'
# Column holding the drama name; used as the index of the lookup table.
NAME_COL = 'main_name'

In [13]:
# train one hot encoder
# Fits the vectorizer on GENRE_TRAINING_DATA_PATH and, because write_vocab=True,
# writes the fitted vocabulary to VOCAB_PATH.
# `genre_one_hot` is a notebook-level name that store_genre_one_hot() reads.

genre_one_hot = get_genre_one_hot_encoder_model(train=True, write_vocab=True)

In [14]:
# infer on test data
# Single-row example frame; `test_data` is reused by the next cell.

test_data = pd.DataFrame({'main_name': ['SoapOpera'],
                          'genre': ['drama']})

# Last expression of the cell: the encoded vector is displayed as cell output.
genre_one_hot.predict(test_data['genre'].values)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [15]:
# load encoder and infer
# With the default train=False the model is rebuilt from the vocabulary stored
# at VOCAB_PATH; this rebinds the notebook-level `genre_one_hot`.

genre_one_hot = get_genre_one_hot_encoder_model()

# Encodes the same `test_data` as the previous cell for comparison with the
# freshly trained encoder.
genre_one_hot.predict(test_data['genre'].values)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

# Store one hot encodings of all known dramas

In [16]:
def store_genre_one_hot():
  """
  Encodes the genres of every drama in the training data and writes the
  resulting one hot lookup table to GENRE_ONE_HOT_LOOKUP_PATH.

  Uses the notebook-level `genre_one_hot` model, so that model must already
  exist when this function is called.

  The rows of the table are indexed by NAME_COL. The columns are labelled
  with the stored vocabulary; an 'OOV' label is prepended only when the
  stored vocabulary is one entry shorter than the encoded width, i.e. when
  the file does not already contain the out-of-vocabulary token.

  :raises ValueError: if the stored vocabulary cannot be aligned with the
    width of the encoded vectors
  """
  # load all data
  all_data = load_genre_training_data(GENRE_TRAINING_DATA_PATH, keep_name_col=True)

  # load vocab
  vocab = read_genre_vocab(VOCAB_PATH)

  encodings = genre_one_hot.predict(all_data[GENRE_COL].values)
  encoded_width = encodings.shape[1]

  # load_genre_vectorizer_layer treats vocab[0] as the OOV token, so the file
  # may or may not already include it; pick the labelling that fits the width.
  if len(vocab) + 1 == encoded_width:
    column_labels = ['OOV'] + vocab
  elif len(vocab) == encoded_width:
    column_labels = vocab
  else:
    raise ValueError(
        "Vocabulary of length {} does not match encoded width {}.".format(
            len(vocab), encoded_width))

  # create one hot Pandas DataFrame
  all_data_genre_one_hot = pd.DataFrame(encodings, index=all_data[NAME_COL], columns=column_labels)

  # save to file
  all_data_genre_one_hot.to_csv(GENRE_ONE_HOT_LOOKUP_PATH)

# Calculate pairwise cosine similarity

In [54]:
def load_one_hot_vectors(lookup_path):
  """
  Reads the stored one hot lookup table from CSV.

  :param lookup_path: path of the lookup CSV file

  :return pandas.DataFrame whose index is the NAME_COL column of the file
  """
  lookup_table = pd.read_csv(lookup_path, index_col=NAME_COL)
  return lookup_table

def lookup_genres_from_one_hot_vector(genre_one_hot, vocab):
  """
  Translates a one hot encoded genre vector back into genre names.

  Entries of `vocab` and `genre_one_hot` are paired by position; pairing
  stops at the end of the shorter of the two.

  :param genre_one_hot: sequence of 0/1 flags
  :param vocab: sequence of labels, one per position of the vector

  :return List of the labels whose flag equals 1
  """
  active_genres = []
  for label, flag in zip(vocab, genre_one_hot):
    if flag == 1:
      active_genres.append(label)
  return active_genres


def _get_top_k(cosine_simil, k):
  return cosine_simil.argsort()[-k:]


def _print_info(request, selected_drama_names, vocab, one_hot_selected_dramas):
  """
  Prints the genres of the request followed by name, one hot vector and
  genres of every selected drama.

  :param request: batch of one hot vectors; only the first row is reported
  :param selected_drama_names: names of the selected dramas
  :param vocab: labels used to decode the one hot vectors
  :param one_hot_selected_dramas: 2-D array with one row per selected drama
  """
  request_genres = lookup_genres_from_one_hot_vector(request[0], vocab)
  print("Genre of request:", request_genres)

  for position, drama_name in enumerate(selected_drama_names):
    drama_vector = one_hot_selected_dramas[position, :]
    drama_genres = lookup_genres_from_one_hot_vector(drama_vector, vocab)

    print("Drama", position)
    print("Name:", drama_name)
    print("One hot encoding of genre: ", drama_vector)
    print("Genre:", drama_genres)


def calculate_cosine_similarity_and_retrieve_top_k(request, all_data, k=3, debug=False, vocab=None):
  """
  Ranks every row of `all_data` by cosine similarity to the request and
  returns the names of the `k` selected rows.

  :param request: 2-D array of one hot vectors; the similarities of its
    first row are used
  :param all_data: pandas.DataFrame of one hot vectors indexed by drama name
  :param k: number of names to return
  :param debug: if True, print details about the request and the selection
  :param vocab: labels used to decode vectors; required when `debug` is True

  :return List of drama names, in the order produced by `_get_top_k`

  :raises ValueError: if `debug` is True and `vocab` is None
  """
  from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

  names = all_data.index
  one_hot_matrix = all_data.values

  # Row 0 of the result holds the similarities of the first request vector.
  similarities = sklearn_cosine_similarity(request, one_hot_matrix)[0]
  selected = _get_top_k(similarities, k)

  if debug and vocab is None:
    raise ValueError("Must pass vocab in debug mode.")
  if debug:
    _print_info(request, names[selected], vocab, one_hot_matrix[selected, :])

  # return names
  return names[selected].tolist()
  
  

In [55]:
# Single-row request frame with a space-separated genre string.
request_data = pd.DataFrame({'main_name': ['SoapOpera'],
                          'genre': ['drama family mystery']})

# get one hot encoding of genre of request drama
# (default train=False: the encoder is rebuilt from the vocabulary at VOCAB_PATH)
genre_one_hot_model = get_genre_one_hot_encoder_model()
request_genre_one_hot = genre_one_hot_model.predict(request_data[GENRE_COL].values)

# get top dramas
# The lookup table is read from GENRE_ONE_HOT_LOOKUP_PATH, which
# store_genre_one_hot() writes; `vocab` is only needed for the debug printout.
all_data_genre_one_hot = load_one_hot_vectors(GENRE_ONE_HOT_LOOKUP_PATH)
vocab = read_genre_vocab(VOCAB_PATH)

# k is left at its default of 3.
top_dramas = calculate_cosine_similarity_and_retrieve_top_k(request_genre_one_hot, 
                                                            all_data_genre_one_hot, 
                                                            debug=True,
                                                            vocab=vocab)
# Last expression of the cell: the list of names is displayed as cell output.
top_dramas


Genre of request: ['romance', 'melodrama', 'crime']
Drama 0
Name: Chip In
One hot encoding of genre:  [0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Genre: ['romance', 'melodrama']
Drama 1
Name: Mother of Mine
One hot encoding of genre:  [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Genre: ['romance', 'crime']
Drama 2
Name: Once Again
One hot encoding of genre:  [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Genre: ['romance', 'crime']


['Chip In', 'Mother of Mine', 'Once Again']