In [166]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=["parser", "tagger", "ner"])
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import collections
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from scipy import sparse
pd.set_option('display.max_colwidth', -1)
EPS = 1e-6

In [167]:
#from . import FUN_FACT_CSV, REQUIRED_COLUMNS
TIL_TITLE_CSV = 'data/til_title.csv'
REQUIRED_COLUMNS= ["title", "score", "permalink"]

In [174]:
print("Loading data csv")
#fun_fact_title_data = pd.read_csv(FUN_FACT_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)
til_title_data = pd.read_csv(TIL_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)
#ysk_title_data = pd.read_csv(YSK_TITLE_CSV).dropna(subset=REQUIRED_COLUMNS)

title_data = pd.concat([
    #fun_fact_title_data,
    til_title_data,
    #ysk_title_data,
], join='inner').reset_index(drop=True)

print("Computing tf-idf matrix")
vectorizer = TfidfVectorizer(stop_words='english', dtype=np.float32)
tfidf_matrix = vectorizer.fit_transform(title_data["title"])

print("Loading spacy")
nlp = spacy.load('en_core_web_lg')

print("Computing weighted embeddings")
features = vectorizer.get_feature_names()
f_vectors = np.array([nlp.vocab[f].vector for f in features])
weighted_embeddings = tfidf_matrix.dot(f_vectors)
assert weighted_embeddings.shape == (len(title_data.index), 300)
n_weighted_embeddings = weighted_embeddings / (np.linalg.norm(weighted_embeddings, axis=1)[:, np.newaxis] + EPS)

#print("Compressing pandas dataframe into index")
#self.index = list(title_data.itertuples())

print("Done loading {} rows".format(len(title_data.index)))

Loading data csv


  interactivity=interactivity, compiler=compiler, result=result)


Computing tf-idf matrix
Loading spacy
Computing weighted embeddings
Done loading 324996 rows


In [216]:
def search(query, method = 'similarity', top=10, rocchio = False):
    if rocchio:
        query_weighted = query
    else:
        query_tfidf = vectorizer.transform([query])
        if query_tfidf.count_nonzero() > 0:
            query_weighted = query_tfidf.dot(f_vectors).flatten()
        # average word embeddings if query words don't exist in our corpus (tfidf matrix)
        else:
            tokens = vectorizer.build_analyzer()(query)
            # query was all stopwords, so we'll have to manually tokenize
            if not tokens:
                tokens = query.lower().split()
            query_weighted = np.average([nlp.vocab[t].vector for t in tokens], axis=0).flatten()
#     print('query_weighted: ', repr(query_weighted))

    # if we have no embeddings for the given query, we're out of luck
    if np.count_nonzero(query_weighted) == 0:
        return []

    n_query_weighted = query_weighted / (np.linalg.norm(query_weighted) + EPS)
    rankings = n_weighted_embeddings.dot(n_query_weighted)
    rankings_index = np.argsort(-rankings)
    ranked_df = title_data.loc[rankings_index]
    ranked_titles = list(ranked_df['title'])
    ranked_scores = list(ranked_df['score'])
    top_ranked_em = n_weighted_embeddings[rankings_index]
    ranked_rankings = rankings[rankings_index]
    print('about to call kmeans')
    results = kMeans(ranked_titles, ranked_scores, ranked_rankings, top_ranked_em, method)

#         index = list(ranked_df.itertuples())
    print('done with itertuple')
    print(results)
    indices = [i[1][0] for i in results]
    print(indices)
    results = [
        {
            "type": "submission",
            "title": ranked_df.iloc[d]["title"],
            "subreddit": ranked_df.iloc[d]['subreddit'],
            "permalink": ranked_df.iloc[d]['permalink'],
            "score": ranked_df.iloc[d]['score']
        }
        for d in [i[1][0] for i in results]
    ]
    return results, indices, query_weighted, ranked_df, weighted_embeddings[rankings_index]



def kMeans(titles, scores, rankings, embeddings, method):
    TOP_HITS_KMEANS = max(40,np.sum(scipy.stats.zscore(rankings) > 3.5))
    if TOP_HITS_KMEANS > 200:
        TOP_HITS_KMEANS = 200
    kmeans = KMeans(n_clusters=20, random_state=0).fit(embeddings[:TOP_HITS_KMEANS])

    counter = collections.Counter(kmeans.labels_)
    most_common = counter.most_common(10)
    most_common = set([i[0] for i in most_common])
    results = topSimOfEachCluster(kmeans.labels_, 10, most_common)
    topScoreOfEachCluster(results, 4, scores)
    results = topResultsSorted(results, rankings, scores, method)
    return results


# cluster number to top num based on similarity
def topSimOfEachCluster(cluster_labels, num, most_common):
    print('topsimofeachcluster')
    res = {}
    clusters_included = set(most_common)
    for i, el in enumerate(cluster_labels):
        if el not in clusters_included:
            continue
        if el not in res:
            res[el] = [i]
        elif len(res[el]) < num:
            res[el].append(i) 
    return res 

#takes topOfEachCluster and gets the top num by score
def topScoreOfEachCluster(sim_results, num, scores):
    print('topscoreofeachcluster')
    for key in sim_results:
        sim_results[key].sort(key=lambda x: scores[x], reverse = True)
        sim_results[key] = sim_results[key][:num]

#sort results by method        
def topResultsSorted(results, rankings, scores, method = 'similarity'):
    print('topresultssorted')
    if method == 'similarity':
        for key in results:
            results[key].sort(key=lambda x: rankings[x], reverse = True) #sorts within a cluster
            sorted_results = sorted(results.items(), key=lambda x: rankings[x[1][0]], reverse = True) #sorts all clusters
    elif method == 'score':
        for key in results:
            results[key].sort(key=lambda x: scores[x], reverse = True)
            sorted_results = sorted(results.items(), key=lambda x: scores[x[1][0]], reverse = True)
    return sorted_results

In [283]:
def rocchio(result_indices, select_index, query_weighted, ranked_df, weighted_embeddings):
    alpha = 0.2
    beta = 0.7
    gamma = 0.1
    print(weighted_embeddings[select_index].shape)
    print(query_weighted.shape)
    new_em = alpha * query_weighted + beta*weighted_embeddings[result_indices[select_index]]
    irrel_ems = np.zeros(len(query_weighted))
    for i in result_indices:
        if i == result_indices[select_index]:
            continue
        irrel_ems += weighted_embeddings[i]
    new_em -+ gamma/(len(result_indices) - 1) * irrel_ems
    return new_em

In [397]:
results, indices, query_weighted, ranked_df, ranked_weighted_embeddings = search("information")

about to call kmeans
topsimofeachcluster
topscoreofeachcluster
topresultssorted
done with itertuple
[(8, [0, 3]), (1, [4, 9, 34, 39]), (11, [5, 18]), (5, [6, 8, 30]), (3, [11, 23]), (17, [12, 13, 20]), (9, [15, 24, 35, 44]), (4, [16, 19, 28, 42]), (2, [21, 27, 31, 37]), (15, [22, 46])]
[0, 4, 5, 6, 11, 12, 15, 16, 21, 22]


In [401]:
results

[{'type': 'submission',
  'title': 'TIL if you put a UPS tracking number in the search, it will give you the link to your tracking information.',
  'subreddit': 'til',
  'permalink': '/r/til/comments/f8rax/til_if_you_put_a_ups_tracking_number_in_the/',
  'score': 1.0},
 {'type': 'submission',
  'title': 'TIL Google recognizes USPS routing numbers + gives you a direct link to tracking info',
  'subreddit': 'todayilearned',
  'permalink': '/r/todayilearned/comments/c5xsi/til_google_recognizes_usps_routing_numbers_gives/',
  'score': 31.0},
 {'type': 'submission',
  'title': 'TIL that there are over 300 companies tracking my every click online and saving my information. I also just found out how to Opt-out.',
  'subreddit': 'technology',
  'permalink': '/r/technology/comments/p0vge/til_that_there_are_over_300_companies_tracking_my/',
  'score': 0.0},
 {'type': 'submission',
  'title': 'TIL it is possible to search for information on Google in a specific date range',
  'subreddit': 'todayi

In [399]:
shifted_query_emb = rocchio(indices, 9, query_weighted, ranked_df, ranked_weighted_embeddings)

(300,)
(300,)


In [400]:
results, indices, query_weighted, ranked_df, ranked_weighted_embeddings = search(shifted_query_emb, rocchio = True)

about to call kmeans
topsimofeachcluster
topscoreofeachcluster
topresultssorted
done with itertuple
[(0, [0, 1, 3]), (6, [2, 4, 5, 28]), (13, [6, 16]), (4, [7, 26]), (11, [8]), (8, [11, 13, 15]), (16, [12, 19]), (12, [17, 23, 30, 33]), (1, [20, 21, 22, 31]), (10, [25, 27])]
[0, 2, 6, 7, 8, 11, 12, 17, 20, 25]
