In [None]:
from nltk.tokenize import RegexpTokenizer
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import gzip
from collections import defaultdict
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
# from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util, SentenceTransformer

In [None]:
# Input files read by this notebook.
ANNOTATIONS_FILE = './data/annotated_data.csv'  # read for its 'query' and 'qid' columns
MOVIE_DESC_PATH = './data/dataset.csv'  # read for its 'id' and 'data' (description text) columns
MOVIE_METADATA_PATH = './data/movies.csv'  # read for its 'movieId', 'title' and 'genres' columns
# Genre outputs: embedding matrix and the genre name for each row, one per line.
ENCODED_GENRE_FILE = './data/encoded_genres.npy'
GENRE_ORDER_FILE = './data/genre_order.txt'
# Movie outputs: description embedding matrix, the movie id for each row,
# and the movie id -> genre list mapping.
ENCODED_MOVIE_DESC_FILE = './data/encoded_movie_desc.npy'
MOVIE_ID_TO_GENRE_FILE = './data/movie_id_to_genre.json'
MOVIE_ID_ORDER_FILE = './data/movie_id_order.txt'

#### Movie Genres

In [None]:
# Parse the pipe-separated genre string of every movie into a normalised
# (lower-cased, stripped) list, keyed by the movie id as a string.
movie_metadata_df = pd.read_csv(MOVIE_METADATA_PATH)
movie_id_to_genres = defaultdict(list)
uniqure_genres = set()
id_genre_pairs = zip(movie_metadata_df['movieId'], movie_metadata_df['genres'])
for raw_movie_id, raw_genres in tqdm(id_genre_pairs, total=len(movie_metadata_df)):
    cleaned_genres = [name.lower().strip() for name in raw_genres.split('|')]
    movie_id_to_genres[str(raw_movie_id)] = cleaned_genres
    uniqure_genres.update(cleaned_genres)

print("Found {} unique genres".format(len(uniqure_genres)))

In [None]:
# Persist the movie id -> genre list mapping as JSON.
with open(MOVIE_ID_TO_GENRE_FILE, 'w') as genre_json:
    json.dump(obj=movie_id_to_genres, fp=genre_json)

**Vector Representation with Pre-trained Model**

In [None]:
class VectorFeatExtractor:
    """Encode texts with a SentenceTransformer bi-encoder and cache the result.

    The cache consists of the embedding matrix returned by the model and a
    ``docid -> row index`` mapping, so embeddings can be looked up by id.
    """

    def __init__(self, bi_encoder_name, device):
        """Load the bi-encoder, move it to ``device`` and start with an empty cache.

        Args:
            bi_encoder_name: Model name or path passed to ``SentenceTransformer``.
            device: Device the model is moved to.
        """
        self.device = device
        self.model = SentenceTransformer(bi_encoder_name).to(self.device)
        self.cache_doc_emb = None
        self.cache_docids = None

    def generate_doc_emb(self, doc, docids):
        """Encode ``doc`` and replace the cache with the new embeddings.

        Args:
            doc: Texts to encode, passed straight to ``model.encode``.
            docids: Ids for the texts, in the same order as ``doc``. They are
                used as cache keys exactly as given (no type conversion).

        Returns:
            The embeddings returned by ``model.encode`` (called with
            ``normalize_embeddings=True``).
        """
        self.cache_doc_emb = self.model.encode(doc, batch_size=128, show_progress_bar=True, normalize_embeddings=True)
        self.cache_docids = {docid: i for i, docid in enumerate(docids)}
        return self.cache_doc_emb

    def save_doc_emb(self, doc, docids, doc_emb_path, docid_order_path):
        """Encode ``doc``, cache it under string ids and write both to disk.

        Args:
            doc: Texts to encode.
            docids: Ids for the texts; every id is converted with ``str``.
            doc_emb_path: Destination passed to ``np.save`` for the embeddings.
            docid_order_path: Text file receiving one id per line, in row order.
        """
        # Convert unconditionally: inspecting only docids[0] crashed on an
        # empty list and left mixed-type id lists partly unconverted, which
        # then broke the '\n'.join below.
        docids = [str(docid) for docid in docids]
        doc_emb = self.generate_doc_emb(doc, docids)
        np.save(doc_emb_path, doc_emb)
        with open(docid_order_path, 'w') as f:
            f.write('\n'.join(docids))

    def fetch_all_embds(self):
        """Return the cached embedding matrix.

        Raises:
            ValueError: If nothing has been encoded yet.
        """
        # A plain attribute read never raises TypeError, so the empty cache
        # has to be detected explicitly.
        if self.cache_doc_emb is None:
            raise ValueError("No docids in cache")
        return self.cache_doc_emb

    def fetch_doc_with_id(self, docids):
        """Return the cached embeddings for one id or a list of ids.

        Args:
            docids: A single id or a ``list`` of ids; anything that is not a
                ``list`` is treated as a single id.

        Returns:
            A numpy array with one row per requested id, in request order.

        Raises:
            ValueError: If the cache is empty or an id is not in the cache.
        """
        doc_emb = self.fetch_all_embds()
        if not isinstance(docids, list):
            docids = [docids]
        try:
            # An unknown id fails in the dict lookup (KeyError); IndexError is
            # still handled in case the mapping and the matrix disagree.
            ids = [self.cache_docids[docid] for docid in docids]
            return np.array(doc_emb[ids])
        except (KeyError, IndexError) as err:
            raise ValueError("Invalid docid") from err

**Genre Vector Representations**

In [None]:
bi_encoder_name = 'sentence-transformers/msmarco-distilbert-dot-v5'
# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs end to end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
genre_vec_extractor = VectorFeatExtractor(bi_encoder_name, device)

In [None]:
# Materialise the genres once, in sorted order, so the texts and their ids are
# guaranteed to line up and the saved row order is the same on every run
# (iteration order of a set of strings can change between kernel restarts).
genre_names = sorted(uniqure_genres)
genre_vec_extractor.save_doc_emb(genre_names, genre_names, ENCODED_GENRE_FILE, GENRE_ORDER_FILE)

**Text Vector Representation**

In [None]:
# Load the movie descriptions: 'data' holds the text to embed, 'id' the movie id.
movie_desc_df = pd.read_csv(MOVIE_DESC_PATH)
movie_ids = movie_desc_df['id'].tolist()
movie_desc = movie_desc_df['data'].tolist()

# Embed every description and persist the matrix together with the id order.
text_vec_extractor = VectorFeatExtractor(bi_encoder_name, device)
text_vec_extractor.save_doc_emb(
    doc=movie_desc,
    docids=movie_ids,
    doc_emb_path=ENCODED_MOVIE_DESC_FILE,
    docid_order_path=MOVIE_ID_ORDER_FILE,
)

**Query Vector Representation**

In [None]:
annotated_data_df = pd.read_csv(ANNOTATIONS_FILE)
# NOTE: despite its name, query_to_qid maps qid -> query text.
unique_query_pairs = annotated_data_df[['query', 'qid']].drop_duplicates()
query_to_qid = unique_query_pairs.set_index('qid')['query'].to_dict()

In [None]:
# Split the qid -> query mapping into two aligned lists.
query_ids = list(query_to_qid)
query_texts = [query_to_qid[query_id] for query_id in query_ids]

# Encode the queries; embeddings stay in memory only, keyed by the raw qid.
query_vec_extractor = VectorFeatExtractor(bi_encoder_name, device)
query_embs = query_vec_extractor.generate_doc_emb(query_texts, query_ids)

**FeatureExtractor**

In [None]:
class FeatureExtractor:
    """Combine text embeddings, genre embeddings and per-movie genre ids.

    Genre ids are positions in the iteration order of ``all_doc_genres``;
    ``get_doc_feat`` requests the genre embeddings in that same order, so row
    ``i`` of the returned genre matrix belongs to genre id ``i``.
    """

    def __init__(self, text_vec_extractor, genre_vec_extractor, movie_id_to_genres):
        """Store the extractors and build the genre index.

        Args:
            text_vec_extractor: Object whose ``fetch_doc_with_id`` returns
                text embeddings for a list of movie ids.
            genre_vec_extractor: Object whose ``fetch_doc_with_id`` returns
                embeddings for a list of genre names.
            movie_id_to_genres: Mapping from movie id to a list of genre names.
        """
        self.text_vec_extractor = text_vec_extractor
        self.genre_vec_extractor = genre_vec_extractor
        self.movie_id_to_genres = movie_id_to_genres
        # create a genre mapper for indexing
        self.all_doc_genres = set()
        for genres in self.movie_id_to_genres.values():
            self.all_doc_genres.update(genres)
        # The set must not be modified after this point: both the ids below
        # and the embedding order in get_doc_feat rely on its iteration order.
        self.genre_to_id = {genre: i for i, genre in enumerate(self.all_doc_genres)}
        self.movieId_to_genreId = defaultdict(list)
        for movie_id, genres in self.movie_id_to_genres.items():
            self.movieId_to_genreId[movie_id] = [self.genre_to_id[genre] for genre in genres]

    def get_doc_feat(self, docids):
        """Return the features needed to score the given movies.

        Args:
            docids: A single movie id or a ``list`` of movie ids.

        Returns:
            A tuple ``(text_emb, genre_emb, doc_genres)``:
            ``text_emb`` is what the text extractor returns for ``docids``;
            ``genre_emb`` is what the genre extractor returns for all known
            genres, in genre-id order; ``doc_genres`` holds, per requested
            movie, its list of genre ids (empty for an unknown movie).
        """
        if not isinstance(docids, list):
            docids = [docids]
        text_emb = self.text_vec_extractor.fetch_doc_with_id(docids)
        genre_emb = self.genre_vec_extractor.fetch_doc_with_id(list(self.all_doc_genres))
        doc_genres = [self.movieId_to_genreId.get(docid, []) for docid in docids]
        return text_emb, genre_emb, doc_genres


In [None]:
# Wire the movie-description extractor, the genre extractor and the
# movie id -> genres mapping into a single feature provider.
fe = FeatureExtractor(text_vec_extractor, genre_vec_extractor, movie_id_to_genres)

**VectorRanker**

In [None]:
class VectorRanker:
    """Rank movies for a query by text similarity weighted by a genre score."""

    def __init__(self, fe, query_vec_extractor, verbose=True):
        """Store the feature provider and the query embedding lookup.

        Args:
            fe: Feature provider exposing ``get_doc_feat``,
                ``movie_id_to_genres`` and ``all_doc_genres``.
            query_vec_extractor: Object whose ``fetch_doc_with_id(qid)``
                returns the embedding of a query.
            verbose: When true (the default), ``score`` prints every genre
                with its query score.
        """
        self.query_vec_extractor = query_vec_extractor
        self.fe = fe
        self.verbose = verbose

    def doc_emb_score(self, query_emb, doc_embs):
        """Return the dot product of the query with every document embedding."""
        return np.dot(query_emb, doc_embs.T)

    def aux_emb_score(self, query_emb, aux_emb):
        """Return the dot product of the query with every auxiliary embedding."""
        return np.dot(query_emb, aux_emb.T)

    def agg_genre_score_func(self, genre_query_score):
        """Collapse a movie's per-genre scores into one value.

        Returns the largest score, floored at 0, or 0 when the movie has no
        genre scores.
        """
        agg_val = 0
        if len(genre_query_score) != 0:
            agg_val = np.max(genre_query_score).clip(min=0)
        return agg_val

    def score(self, query_emb, doc_emb, genre_emb, doc_genres):
        """Compute the text score and the aggregated genre score per document.

        Args:
            query_emb: Embedding of the query.
            doc_emb: Document embeddings, one row per document.
            genre_emb: Genre embeddings, one row per genre id.
            doc_genres: For each document, the list of its genre ids.

        Returns:
            A tuple ``(doc_query_score, genre_doc_score)`` of flat arrays
            with one entry per document.
        """
        doc_query_score = self.doc_emb_score(query_emb, doc_emb).reshape(-1)
        genre_query_score = self.aux_emb_score(query_emb, genre_emb).reshape(-1)
        if self.verbose:
            print("Genre order:", *sorted(zip( genre_query_score, self.fe.all_doc_genres), reverse=True), sep="\n")
        genre_doc_score = np.zeros(len(doc_emb))
        for i, doc_genre in enumerate(doc_genres):
            genre_score = [genre_query_score[genre_id] for genre_id in doc_genre]
            genre_doc_score[i] = self.agg_genre_score_func(genre_score)
        return doc_query_score, genre_doc_score

    def query(self, qid, docids=None, genre_threshold=0.6):
        """Rank candidate movies for the query ``qid``.

        Args:
            qid: Id of the query, as known to ``query_vec_extractor``.
            docids: Iterable of candidate movie ids. Defaults to every key of
                ``fe.movie_id_to_genres``.
            genre_threshold: Movies whose genre score is not strictly greater
                than this value are dropped.

        Returns:
            A list of ``(docid, doc_score, genre_score)`` tuples sorted by
            ``doc_score * genre_score`` descending, ties broken by docid
            ascending.
        """
        if docids is None:
            docids = self.fe.movie_id_to_genres.keys()
        # Materialise once so the same ids are used for the feature lookup and
        # for pairing with the scores; a one-shot iterable would otherwise be
        # exhausted by the first use and the result silently empty.
        docids = list(docids)
        query_emb = self.query_vec_extractor.fetch_doc_with_id(qid)
        doc_emb, genre_emb, doc_genres = self.fe.get_doc_feat(docids)
        doc_query_score, genre_doc_score = self.score(query_emb, doc_emb, genre_emb, doc_genres)
        doc_score = list(zip(docids, doc_query_score, genre_doc_score))
        filtered_doc_score = [score for score in doc_score if score[2] > genre_threshold]
        sorted_doc_score = sorted(filtered_doc_score, key=lambda x: (-x[1] * x[2], x[0]))
        return sorted_doc_score

#### Bi-Encoder

In [None]:
# Ranker that scores movies from `fe` against the queries cached in `query_vec_extractor`.
vec_ranker = VectorRanker(fe, query_vec_extractor)

In [None]:
qid = 26
TARGET_DOCID = "91500"  # movie whose rank is inspected for this query
# docids = vec_ranker.query(qid, ["91500", "108190", "106002", "7022", "113378", "217903"])[:20]
docids = vec_ranker.query(qid)  # list of (docid, text score, genre score) tuples
ranked_ids = [docid for docid, _, _ in docids]
print("Query:", query_to_qid[int(qid)])
# The target may have been removed by the genre filter; report that instead of
# letting list.index raise ValueError and abort the cell.
if TARGET_DOCID in ranked_ids:
    print(TARGET_DOCID, ranked_ids.index(TARGET_DOCID))
else:
    print(TARGET_DOCID, "not in ranked results")
for docid, score1, score2 in docids[:50]:
    print(docid, score1, score2, movie_metadata_df[movie_metadata_df['movieId'] == int(docid)][['title', 'genres']].values[0])

#### Doc2Query

In [None]:
# TODO: Doc2Query might not be useful here, but for movies the genre information might be.

#### Movie Genres