In [6]:
import spacy
# Load the large English spaCy model with the parser, tagger and NER pipes disabled.
nlp = spacy.load('en_core_web_lg', disable=["parser", "tagger", "ner"])
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
# Show full cell text in DataFrame output. `None` disables truncation; the old
# sentinel -1 was deprecated in pandas 1.0 and is no longer accepted by newer releases.
pd.set_option('display.max_colwidth', None)
# Small constant added to vector norms so normalisation never divides by zero.
EPS = 1e-6

In [9]:
#from . import FUN_FACT_CSV, REQUIRED_COLUMNS
# Location of the title CSV, relative to the notebook's working directory.
TIL_TITLE_CSV = '../data/til_title.csv'
# Rows with a missing value in any of these columns are dropped right after loading.
REQUIRED_COLUMNS = [
    "title",
    "score",
    "permalink",
]

In [37]:
class KMeans:
    """TF-IDF-weighted word-embedding index over the titles in TIL_TITLE_CSV.

    Each title is represented by the TF-IDF-weighted sum of the spaCy vectors
    of its vocabulary terms; rows are L2-normalised so that a dot product with
    a normalised query vector is a cosine similarity.

    NOTE(review): despite the name, no k-means clustering is performed in this
    class; the name is kept so existing references keep working.
    """

    def __init__(self):
        """Load the CSV, filter it, fit TF-IDF and precompute title embeddings."""
        self.til_title_data = pd.read_csv(TIL_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)

        self.data = self.dataframe()
        self.vectorizer = TfidfVectorizer(stop_words='english', max_df=.8, ngram_range=(1, 1), dtype=np.float32)
        self.titles = self.data['title']
        self.scores = self.data['score']

        # Populated by weighted_embeddings(), which must fit the vectorizer
        # before the vocabulary (and hence the per-term vectors) exists.
        self.features = []
        self.f_vectors = None
        self.embeddings = self.weighted_embeddings()
        row_norms = np.linalg.norm(self.embeddings, axis=1)[:, np.newaxis]
        # EPS keeps all-zero rows (titles with no known vectors) from dividing by zero.
        self.n_weighted_embedding = self.embeddings / (row_norms + EPS)

    def dataframe(self):
        """Return the loaded rows minus banned subreddits, with a fresh 0..n-1 index.

        Rows missing `title`, `subreddit` or `permalink` are dropped first.
        """
        required_columns = ['title', 'subreddit', 'permalink']
        data = self.til_title_data.dropna(axis='rows', subset=required_columns)
        banned_subreddits = ['circlejerk', 'ShittyTodayILearned', 'TheOnion']
        data = data[~data['subreddit'].isin(banned_subreddits)]
        data = data.reset_index(drop=True)
        return data

    def weighted_embeddings(self):
        """Fit TF-IDF on the titles and return the (n_titles, dim) weighted embeddings.

        Side effects: fits `self.vectorizer` and sets `self.features` and
        `self.f_vectors` (one spaCy vector per vocabulary term, in vocabulary order).
        """
        title_tfidf = self.vectorizer.fit_transform(self.titles)
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement when present but keep working on older releases.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None) or self.vectorizer.get_feature_names
        self.features = list(get_names())
        self.f_vectors = np.array([nlp.vocab[f].vector for f in self.features])
        weighted_embedding = title_tfidf.dot(self.f_vectors)
        return weighted_embedding

    def search(self, query, top=10):
        """Return the `top` rows most similar to `query`, best match first.

        Parameters:
            query: free-text query string.
            top: maximum number of rows to return.

        Returns:
            A DataFrame with columns `subreddit`, `title`, `score`. It is empty
            when the query has no vocabulary term with a non-zero vector.
        """
        columns = ["subreddit", "title", "score"]
        query_tfidf = self.vectorizer.transform([query])
        query_weighted = query_tfidf.dot(self.f_vectors).flatten()
        if np.count_nonzero(query_weighted) == 0:
            # Nothing to compare against: every similarity would be zero.
            return self.data[columns].iloc[0:0]

        n_query_weighted = query_weighted / (np.linalg.norm(query_weighted) + EPS)
        rankings = self.n_weighted_embedding.dot(n_query_weighted)
        rankings_index = np.argsort(-rankings)[:top]
        # self.data was re-indexed 0..n-1, so positional lookup matches the embedding rows.
        return self.data[columns].iloc[rankings_index]
        
        
        

In [40]:
class WeightedEmbeddingSearch:
    """Rank post titles against a query by cosine similarity of weighted embeddings.

    Each title is the TF-IDF-weighted sum of the spaCy vectors of its terms,
    L2-normalised once at construction time. Construction loads the CSV at
    TIL_TITLE_CSV and a spaCy model, printing progress as it goes.
    """

    def __init__(self):
        """Load titles, fit TF-IDF, load spaCy and precompute normalised embeddings."""
        print("Loading data csv")
        til_title_data = pd.read_csv(TIL_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)

        # Only one source today; concat is kept so further title frames can be appended.
        title_data = pd.concat([
            til_title_data,
        ], join='inner').reset_index(drop=True)

        print("Computing tf-idf matrix")
        self.vectorizer = TfidfVectorizer(stop_words='english', dtype=np.float32)
        tfidf_matrix = self.vectorizer.fit_transform(title_data["title"])

        print("Loading spacy")
        self.nlp = spacy.load('en_core_web_lg')

        print("Computing weighted embeddings")
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement when present but keep working on older releases.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None) or self.vectorizer.get_feature_names
        features = get_names()
        self.f_vectors = np.array([self.nlp.vocab[f].vector for f in features])
        weighted_embeddings = tfidf_matrix.dot(self.f_vectors)
        # One row per title, one column per embedding dimension of the loaded model.
        assert weighted_embeddings.shape == (len(title_data.index), self.f_vectors.shape[1])
        row_norms = np.linalg.norm(weighted_embeddings, axis=1)[:, np.newaxis]
        # EPS keeps all-zero rows (titles with no known vectors) from dividing by zero.
        self.n_weighted_embeddings = weighted_embeddings / (row_norms + EPS)

        print("Compressing pandas dataframe into index")
        self.index = list(title_data.itertuples())

        print("Done loading {} rows".format(len(title_data.index)))

    def search(self, query, top=10):
        """Return up to `top` best-matching submissions for `query`, best first.

        Parameters:
            query: free-text query string.
            top: maximum number of results.

        Returns:
            A list of dicts with keys `type`, `title`, `subreddit`, `permalink`
            and `score`. The list is empty when the query has no tokens or none
            of its tokens has a non-zero embedding.
        """
        query_tfidf = self.vectorizer.transform([query])
        if query_tfidf.count_nonzero() > 0:
            query_weighted = query_tfidf.dot(self.f_vectors).flatten()
        # average word embeddings if query words don't exist in our corpus (tfidf matrix)
        else:
            tokens = self.vectorizer.build_analyzer()(query)
            # query was all stopwords, so we'll have to manually tokenize
            if not tokens:
                tokens = query.lower().split()
            # Empty / whitespace-only query: averaging nothing yields NaN, which
            # would slip past the zero check below and crash in the dot product.
            if not tokens:
                return []
            query_weighted = np.average([self.nlp.vocab[t].vector for t in tokens], axis=0).flatten()

        # if we have no embeddings for the given query, we're out of luck
        if np.count_nonzero(query_weighted) == 0:
            return []

        n_query_weighted = query_weighted / (np.linalg.norm(query_weighted) + EPS)
        rankings = self.n_weighted_embeddings.dot(n_query_weighted)
        rel = np.argsort(-rankings)[:top]
        results = [
            {
                "type": "submission",
                "title": self.index[d].title,
                "subreddit": self.index[d].subreddit,
                "permalink": self.index[d].permalink,
                "score": self.index[d].score,
            }
            for d in rel
        ]
        return results


In [41]:
# Build the search index: reads the CSV, fits TF-IDF and loads spaCy, printing progress.
w = WeightedEmbeddingSearch()

Loading data csv
Computing tf-idf matrix
Loading spacy
Computing weighted embeddings
Compressing pandas dataframe into index
Done loading 324996 rows
