In [1]:
import pandas as pd
import json
import numpy as np

In [42]:
# Load the tab-separated mapping: wikiHow link id, wikiHow URL, YouTube URL.
# Column names are supplied here, so the first line of the file is read as data.
df = pd.read_csv(
    "data/www2017/wikihowURLID-wikihowURL-youtubeURL",
    sep="\t",
    names=["fid", "wikihow", "youtube"],
)
# (Disabled earlier variant: derive a lower-cased `query` from the wikiHow URL slug.)
# The ground-truth label is the last path segment of the YouTube URL.
df['label'] = [url.rsplit("/", 1)[-1] for url in df['youtube']]


In [3]:

def read_youtube_video_json():
    """Load the YouTube video metadata file into a DataFrame.

    Reads ``data/www2017/uniq-youtube-video.json`` as JSON Lines (one object
    per line). Every object must provide ``id``, ``title``, ``description``,
    ``tags`` (a list of strings) and ``comment`` (a list of objects that each
    carry a ``comment`` string).

    Returns:
        pandas.DataFrame with the columns ``vid``, ``title``, ``desc``,
        ``comment`` and ``tag``. Tags and comments are each flattened into a
        single space-separated string per video.
    """
    with open("data/www2017/uniq-youtube-video.json", encoding="utf-8") as handle:
        records = [json.loads(line) for line in handle]

    return pd.DataFrame({
        'vid': [rec['id'] for rec in records],
        'title': [rec['title'] for rec in records],
        'desc': [rec['description'] for rec in records],
        'comment': [' '.join(entry['comment'] for entry in rec['comment']) for rec in records],
        'tag': [' '.join(rec['tags']) for rec in records],
    })

# Load the video metadata once; later cells read `df_video` as the document collection.
df_video = read_youtube_video_json()

In [67]:
import collections
def get_wikihow_with_video():
    """Read the task frames stored in ``task-frame-have-video.json``.

    Parses ``data/www2017/task-frame-have-video.json`` as JSON Lines. For each
    record, ``activity.linkid`` is converted to ``int`` and a query string
    ``"<verb> <object>"`` is built from the same ``activity`` object.

    Returns:
        tuple: ``(linkids, queries)`` - two parallel lists, in file order.
    """
    linkids, queries = [], []
    with open("data/www2017/task-frame-have-video.json", encoding="utf-8") as handle:
        for line in handle:
            activity = json.loads(line)['activity']
            linkids.append(int(activity['linkid']))
            queries.append(" ".join((activity['verb'], activity['object'])))
    return linkids, queries
# Parallel lists: wikiHow link ids and their "<verb> <object>" queries.
linkids, queries = get_wikihow_with_video()

In [69]:
# One row per task frame: link id (`fid`) and its query text.
_df = pd.DataFrame({"fid": linkids, "query": queries})

In [74]:
# Attach the task-frame query to every URL row that shares its `fid`
# (inner join: rows without a match on either side are dropped).
# Guarded so that re-running this cell does not merge the already-merged frame
# again, which would produce `query_x`/`query_y` columns and break the later
# `df['query']` lookups. To rebuild from scratch, re-run the cell that loads `df`.
if 'query' not in df.columns:
    df = _df.merge(df, how="inner", on="fid")

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Document text = video title + description, one string per video.
# Disabled variant that also appends the tag and comment fields:
# corpus = [i+" "+j+" "+k+" "+l for i, j, k, l in df_video[['title', 'desc', 'tag', 'comment']].values]
corpus = [title + " " + desc for title, desc in zip(df_video['title'], df_video['desc'])]
# The queries are appended so that their terms are part of the fitted vocabulary.
# NOTE(review): this also makes every query a row of `doc_vec`; `evaluate` only
# reads the first len(df_video) scores per query, so those extra rows look unused - confirm.
corpus.extend(df['query'].tolist())
# use_idf=False: term-frequency weighting only, IDF re-weighting is disabled.
vectorizer = TfidfVectorizer(use_idf=False)
doc_vec = vectorizer.fit_transform(corpus)

In [96]:
# Project the plain queries into the term space fitted above.
query_vec = vectorizer.transform(df['query'])

In [97]:
from sklearn.metrics.pairwise import cosine_similarity
# Row i holds the cosine similarity of query i to every row of `doc_vec`.
scores = cosine_similarity(query_vec, doc_vec)

In [98]:
# Baseline run with the unexpanded queries.
# Prints the mean HR@1, HR@3, HR@5, HR@10 and MRR, in that order.
# NOTE(review): `evaluate` is defined in a cell further down; that cell must be run first.
evaluate(scores)

0.12366702937976061 0.217519042437432 0.2634929270946681 0.33340587595212184 0.19499216852637427


In [8]:
%%time

def getHitRatio(ranklist, gtDoc, K):
    """Return 1 if ``gtDoc`` is among the first ``K`` entries of ``ranklist``, else 0.

    Args:
        ranklist: ranked sequence of document ids, best first (must support slicing).
        gtDoc: the ground-truth document id to look for.
        K: cut-off rank. A rank list shorter than ``K`` is searched in full;
            ``K <= 0`` always yields 0.

    Returns:
        int: 1 on a hit, 0 otherwise.
    """
    # Slice instead of indexing 0..K-1: indexing raised IndexError on a miss
    # whenever the rank list had fewer than K entries.
    for item in ranklist[:max(K, 0)]:
        if item == gtDoc:
            return 1
    return 0

def getMRR(ranklist, gtDoc):
    """Reciprocal rank of ``gtDoc`` within ``ranklist``.

    Args:
        ranklist: ranked sequence of document ids, best first.
        gtDoc: the ground-truth document id.

    Returns:
        ``1.0 / rank`` (rank counted from 1) of the first entry equal to
        ``gtDoc``, or ``0`` when it does not occur.
    """
    for position, candidate in enumerate(ranklist, start=1):
        if candidate == gtDoc:
            return 1.0 / position
    return 0

def evaluate(scores):
    """Print mean HR@1, HR@3, HR@5, HR@10 and MRR for a score matrix.

    Reads two notebook-level frames: ``df`` (its ``label`` column gives the
    ground-truth video id of each query) and ``df_video`` (its ``vid`` column
    gives the video id of each score position).

    Args:
        scores: iterable of per-query score rows, aligned with ``df.label``.
            Within a row, position j is paired with ``df_video.vid[j]``; any
            scores beyond ``len(df_video)`` are ignored because ``zip`` stops
            at the shorter input.

    Returns:
        None. The five means are printed on one line, space separated.
    """
    cutoffs = (1, 3, 5, 10)
    hits_at = {k: [] for k in cutoffs}
    reciprocal_ranks = []
    for row, target in zip(scores, df.label):
        # Going through a dict keeps one score per video id; ranking is by score, descending.
        score_by_vid = dict(zip(df_video.vid, row))
        ranking = sorted(score_by_vid, key=score_by_vid.get, reverse=True)
        for k in cutoffs:
            hits_at[k].append(getHitRatio(ranking, target, k))
        reciprocal_ranks.append(getMRR(ranking, target))
    print(*(np.mean(hits_at[k]) for k in cutoffs), np.mean(reciprocal_ranks))
# evaluate(scores)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [9]:
from gensim.models import KeyedVectors

In [13]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the GloVe text vectors into word2vec text format, then load the converted file.
glove_file = 'data/w2v/glove.42B.300d.txt'
tmp_file = 'data/w2v/glove_word2vec.txt'
# The conversion runs on every execution of this cell; its return value is discarded.
_ = glove2word2vec(glove_file, tmp_file)
# NOTE(review): the name `model` is rebound to a Word2Vec model in a later cell,
# so which embedding it refers to depends on execution order.
model = KeyedVectors.load_word2vec_format(tmp_file)

In [18]:
# Term -> nearest-neighbour lookup built from whatever `model` currently holds.
# NOTE(review): relies on `generateTerm2SimTerm` and `terms`, both defined in
# cells that appear further down; those cells must be run first.
term2simterm_glove = generateTerm2SimTerm(model)

HBox(children=(IntProgress(value=0, max=11525), HTML(value='')))




In [10]:
# Load the binary GoogleNews word2vec vectors.
# NOTE(review): no other top-level cell visible here reads `wv_from_text`
# (the name is only reused as a function parameter) - confirm it is still needed.
wv_from_text = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [14]:
# Collect the distinct tokens that occur in any query.
# `str.split()` without an argument is used so the tokens are exactly the ones
# the query-expansion cells look up (`q.split()`), and so repeated spaces do not
# produce an empty-string "term".
# Sorted so the order is reproducible from run to run.
terms = sorted({token for q in df['query'] for token in q.split()})

In [15]:
from tqdm.autonotebook import tqdm



In [16]:
import collections

def generateTerm2SimTerm(wv_from_text):
    """Map every query term to its two nearest neighbours in an embedding.

    Iterates over the notebook-level ``terms`` list and asks ``wv_from_text``
    for the two most similar words of each term.

    Args:
        wv_from_text: object exposing
            ``most_similar(positive=[term], topn=2)`` that returns
            ``(word, score)`` pairs.

    Returns:
        collections.defaultdict(str): term -> its neighbours joined by a
        single space. Terms for which the lookup raises ``KeyError`` are left
        out, so indexing them later yields ``""``.
    """
    term2simterm = collections.defaultdict(str)
    for term in tqdm(terms, total=len(terms)):
        try:
            sim_terms = wv_from_text.most_similar(positive=[term], topn=2)
        except KeyError:
            # Term unknown to the embedding: leave it unexpanded.
            # (Only KeyError is caught so real failures and Ctrl-C still surface.)
            continue
        # No `break` here: a leftover one used to stop the loop after the first
        # successful lookup, so at most one term was ever expanded.
        term2simterm[term] = " ".join([hit[0] for hit in sim_terms])
    return term2simterm


In [83]:
# Expand each query: original text followed by the GloVe neighbours of each of its tokens.
# NOTE(review): the same loop is repeated for the wikiHow-trained embeddings further
# down; the commented-out stub below suggests a shared helper was intended.
# def w2v_expansion():
res = []
for q in tqdm(df['query'], total=len(df)):
    # Indexing the lookup with a token it does not hold yields "" (it is a defaultdict(str)).
    res.append(q + " " +" ".join([term2simterm_glove[term] for term in q.split()]))
df['glove_query'] = res

HBox(children=(IntProgress(value=0, max=18380), HTML(value='')))




In [101]:
# Inspect the merged frame, including the expanded `glove_query` column.
df

Unnamed: 0,fid,query,wikihow,youtube,label,glove_query
0,2,patrol change,http://www.wikihow.com/Patrol-Recent-Changes-o...,https://www.youtube.com/embed/QRtw3rD78AU,QRtw3rD78AU,patrol change patrols police changes changing
1,7,compost horse manure,http://www.wikihow.com/Compost-Horse-Manure,https://www.youtube.com/embed/sFU3Y4rTYJQ,sFU3Y4rTYJQ,compost horse manure composting manure horses ...
2,33,make tribute,http://www.wikihow.com/Make-an-Online-Memorial...,https://www.youtube.com/embed/7vsSf8FM37Y,7vsSf8FM37Y,make tribute making want homage tributes
3,48,become member,http://www.wikihow.com/Become-a-Member-of-the-...,https://www.youtube.com/embed/5naY7np8_hU,5naY7np8_hU,become member becoming became members membership
4,100,reduce emission,http://www.wikihow.com/Reduce-Your-Greenhouse-...,https://www.youtube.com/embed/ztgAs72f40g,ztgAs72f40g,reduce emission reducing minimize
5,103,plant tree,http://www.wikihow.com/Plant-a-Tree,https://www.youtube.com/embed/OZXmgh_cRFY,OZXmgh_cRFY,plant tree plants soil trees branches
6,105,take action,http://www.wikihow.com/Take-Action-to-Reduce-G...,https://www.youtube.com/embed/2Bp27pW2WTI,2Bp27pW2WTI,take action taking give actions qc_alsoviewed_...
7,127,brush hair,http://www.wikihow.com/Condition-and-Brush-You...,https://www.youtube.com/embed/-qFyDMEj1Sk,-qFyDMEj1Sk,brush hair brushes bristles curly wig
8,130,become model,http://www.wikihow.com/Become-a-Promotional-Model,https://www.youtube.com/embed/hYWK8_WDHjw,hYWK8_WDHjw,become model becoming became models modeling
9,175,patrol change,http://www.wikihow.com/Patrol-Recent-Changes,https://www.youtube.com/embed/QRtw3rD78AU,QRtw3rD78AU,patrol change patrols police changes changing


In [99]:
# Re-score with the GloVe-expanded queries; `vectorizer` and `doc_vec` are reused as fitted earlier.
# NOTE(review): expansion words that are not in the fitted vocabulary are presumably
# dropped by `transform` - confirm this is the intended behaviour.
query_vec = vectorizer.transform(df['glove_query'])
scores = cosine_similarity(query_vec, doc_vec)

In [100]:
# Metrics for the GloVe-expanded queries (mean HR@1, HR@3, HR@5, HR@10, MRR).
evaluate(scores)

0.12878128400435257 0.2221436343852013 0.27437431991294886 0.3523939064200218 0.20356564106612637


In [91]:
# Load data/wikihowSep.csv and clean two of its text columns.
df_wiki = pd.read_csv("data/wikihowSep.csv")
# Both patterns are literal text. regex=False pins that behaviour, because the
# default of `regex` differs between pandas versions.
df_wiki['headline'] = df_wiki['headline'].str.replace("\n", "", regex=False)
# Removes every occurrence of "How to" from the title, not only a leading one.
df_wiki['title'] = df_wiki['title'].str.replace("How to", "", regex=False)

In [113]:
# Pool the distinct values of the title, overview, headline and text columns into one list.
# NOTE(review): this rebinds `corpus`, which earlier held the TF-IDF documents.
corpus = df_wiki.title.unique().tolist() + df_wiki.overview.unique().tolist() + df_wiki.headline.unique().tolist() + df_wiki.text.unique().tolist()

In [48]:
from gensim.models import Word2Vec

In [116]:
# Whitespace-tokenise every entry; `str()` lets non-string values through
# (presumably missing values, which would become the token "nan" - confirm).
# Not idempotent: running this cell twice would tokenise the string form of each token list.
corpus = [str(i).split() for i in corpus]

In [117]:
# Train word2vec on the tokenised wikiHow text; min_count=1 keeps every token.
# NOTE(review): rebinds `model`, which earlier held the GloVe vectors.
# NOTE(review): `size=` appears to be the gensim 3.x keyword - confirm against the
# installed version. No seed is passed and workers=4, so results presumably vary between runs.
model = Word2Vec(corpus, size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")

In [129]:
# NOTE(review): interactive help lookup left over from development; it produces no
# result the notebook uses and can be removed before sharing.
Word2Vec?

In [125]:
# Sanity check of the trained embedding. The lookup goes through `model.wv`,
# because calling `most_similar` directly on the Word2Vec model is deprecated
# (the captured output of this cell shows the warning).
model.wv.most_similar(positive=['kitchen'], topn=2)

  """Entry point for launching an IPython kernel.


[('kitchen,', 0.7349006533622742), ('pantry', 0.7168283462524414)]

In [126]:
# Build the term -> neighbour lookup from the wikiHow-trained embedding.
# `model.wv` (the trained keyed vectors) is passed so the helper calls the
# non-deprecated `most_similar` implementation.
term2simterm_wiki = generateTerm2SimTerm(model.wv)

HBox(children=(IntProgress(value=0, max=11525), HTML(value='')))

  import sys


In [127]:
# Expand each query: original text followed by the wikiHow-embedding neighbours of each token.
# NOTE(review): same logic as the GloVe expansion cell above, differing only in the
# lookup table and the target column; the stub below suggests a shared helper was intended.
# def w2v_expansion():
res = []
for q in tqdm(df['query'], total=len(df)):
    # Indexing the lookup with a token it does not hold yields "" (it is a defaultdict(str)).
    res.append(q + " " +" ".join([term2simterm_wiki[term] for term in q.split()]))
df['wiki_w2v_query'] = res

HBox(children=(IntProgress(value=0, max=18054), HTML(value='')))

In [128]:
# Re-score with the wikiHow-expanded queries, reusing the previously fitted
# `vectorizer` and `doc_vec`, then print mean HR@1, HR@3, HR@5, HR@10 and MRR.
query_vec = vectorizer.transform(df['wiki_w2v_query'])
scores = cosine_similarity(query_vec, doc_vec)
evaluate(scores)

0.18045862412761715 0.29843801927550684 0.35831394704774566 0.4429489309848233 0.2676783339623393
