In [86]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz 

Queries

In [87]:
thresh = "0.7"
data_query_path = f"..\\data_output\\queries_thresh_{thresh}_pos.parquet"
cw_query_repr_path = f"..\\data_output\\cw_thresh_{thresh}_pos.npz"

In [88]:
queries_df = pd.read_parquet(data_query_path)
queries_df["expaded_words"] = queries_df.apply(lambda row: set(row.cluwords_textual_repr).difference(set(row.text)), axis=1)
queries_df["n_expanded_words"] = queries_df["expaded_words"].apply(len)

queries_df.head(10)

Unnamed: 0,query_id,text,cluwords_textual_repr,expaded_words,n_expanded_words
0,156493,"[do, goldfish, grow]","[do, grow, goldfish]",{},0
1,1110199,"[what, is, wifi, vs, bluetooth]","[wifi, is, what, bluetooth, vs]",{},0
2,1063750,"[why, did, the, us, volunterilay, enter, ww1]","[enter, ww1, wwi, why, did, the, us]",{wwi},1
3,130510,"[definition, declaratory, judgment]","[definition, definitions, declaratory, judgmen...","{definitions, judgments, judgement}",3
4,489204,"[right, pelvic, pain, causes]","[pelvic, discomfort, anguish, pain, causes, ri...","{anguish, discomfort}",2
5,573724,"[what, are, the, social, determinants, of, hea...","[determinants, determinant, of, what, societal...","{economic, political, determinant, societal, c...",5
6,168216,"[does, legionella, pneumophila, cause, pneumonia]","[pneumophila, clostridium, sporogenes, catarrh...","{meningitidis, pyogenes, catarrhalis, sporogen...",9
7,1133167,"[how, is, the, weather, in, jamaica]","[how, in, is, weather, the, jamaican, jamaica]",{jamaican},1
8,527433,"[types, of, dysarthria, from, cerebral, palsy]","[palsy, types, kinds, type, of, from, cerebral...","{type, kinds, apraxia, dysprosody}",4
9,1037798,"[who, is, robert, gray]","[robert, richard, leonard, who, is, brown, gre...","{brown, richard, leonard, grey, blue}",5


Corpus

In [89]:
thresh_corpus = "0.7"
data_corpus_path = f"..\\data_output\\corpus_thresh_{thresh_corpus}_pos.parquet"
cw_corpus_repr_path = f"..\\data_output\\cw_corpus_thresh_{thresh_corpus}_pos.npz"

In [90]:
corpus_df = pd.read_parquet(data_corpus_path)
corpus_df["expaded_words"] = corpus_df.apply(lambda row: set(row.cluwords_textual_repr).difference(set(row.text)), axis=1)
corpus_df["n_expanded_words"] = corpus_df["expaded_words"].apply(len)

corpus_df.head(10)

Unnamed: 0.1,Unnamed: 0,query_id,doc_id,rank,score,text,cluwords_textual_repr,expaded_words,n_expanded_words
0,0,156493,2928707,0,109.77907,"[Goldfish, Only, Grow, to, the, Size, of, Thei...","[but, indeterminate, the, as, not, for, only, ...","{tanks, fully, fish, component, their, element...",16
1,1,156493,8182162,1,109.76814,"[Depending, on, his, type, and, his, environme...","[but, depending, bowls, bowl, his, the, for, a...","{aquarium, types, depending, slightly, cm, mod...",35
2,2,156493,1960257,2,108.87568,"[Goldfish, Only, Grow, to, the, Size, of, Thei...","[but, the, as, not, for, only, size, sizes, wh...","{tanks, fully, component, their, elements, asp...",13
3,3,156493,8182159,3,107.93494,"[', Goldfish, usually, grow, with, their, envi...","[but, the, not, with, like, are, keep, heard, ...","{tanks, substantial, traditionally, big, certa...",34
4,4,156493,8182160,4,107.91983,"[The, rate, at, which, your, goldfish, grows, ...","[grows, factors, factor, his, the, not, keep, ...","{tanks, big, factor, poorer, cm, pretty, quick...",26
5,5,156493,8820526,5,107.55801,"[Their, size, is, limited, in, the, tank, ,, b...","[but, basically, actually, obviously, essentia...","{tanks, posts, actually, their, resource, temp...",19
6,6,156493,2612493,6,107.416565,"[In, clean, ,, uncrowded, conditions, in, tank...","[year, week, decade, month, day, the, children...","{decade, black, cm, feet, children, day, it, g...",26
7,7,156493,3288600,7,107.24935,"[A, goldfish, will, grow, to, the, depth, of, ...","[the, as, not, feeders, feeder, water, unhealt...","{larger, tanks, smaller, substantial, big, if,...",17
8,8,156493,2259182,8,106.89768,"[Common, goldfish, ,, comet, goldfish, ,, and,...","[up, transition, transitional, laying, them, b...","{decade, lengths, do, substantial, big, consid...",32
9,9,156493,1960260,9,106.89032,"[Report, Abuse, ., Fish, do, not, grow, to, th...","[report, reports, until, but, them, the, not, ...","{tanks, ideas, big, smaller, fish, reports, re...",19


CluWords Repr

In [91]:
cw_queries = load_npz(cw_query_repr_path)
cw_corpus = load_npz(cw_corpus_repr_path)

cw_queries.shape, cw_corpus.shape

((43, 9390), (430, 9390))

In [92]:
data_df = queries_df.merge(corpus_df, on="query_id")
data_df.shape

(430, 13)

In [93]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
svd.fit(cw_corpus)
Sigma = svd.singular_values_
V_T = svd.components_.T

In [94]:
lsa_queries = np.dot(cw_queries.toarray(),  V_T)
lsa_docs = svd.transform(cw_corpus)

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(X=lsa_queries,  Y=lsa_docs)

In [96]:
list_cos = []
for query_id in range(cw_queries.shape[0]):
    list_cos.append(cos_sim[query_id, query_id*10 : (query_id+1)*10])


In [97]:
list_cos[0]

array([ 0.01442825,  0.09931907, -0.09875438,  0.08377668, -0.03320344,
        0.13185717, -0.10504439,  0.00285874, -0.0771562 ,  0.03415829],
      dtype=float32)

In [64]:
corpus = pd.read_csv("..\\data\\corpus_top10.csv")
relevance = pd.read_csv("..\\data\\qrel_top10.csv")
queries = pd.read_csv("..\\data\\queries_samples.csv")

corpus = corpus.merge(queries.merge(relevance, on="query_id"), on="doc_id")
corpus.head(5)

Unnamed: 0,query_id_x,doc_id,rank,score,text_x,query_id_y,text_y,relevance
0,156493,2928707,0,109.77907,Goldfish Only Grow to the Size of Their Enclos...,156493,do goldfish grow,2
1,156493,8182162,1,109.76814,"Depending on his type and his environment, gol...",156493,do goldfish grow,2
2,156493,1960257,2,108.87568,Goldfish Only Grow to the Size of Their Enclos...,156493,do goldfish grow,2
3,156493,8182159,3,107.93494,'Goldfish usually grow with their environment ...,156493,do goldfish grow,2
4,156493,8182160,4,107.91983,The rate at which your goldfish grows will dep...,156493,do goldfish grow,2


In [68]:
corpus[corpus["query_id_y"] == 527433]

Unnamed: 0,query_id_x,doc_id,rank,score,text_x,query_id_y,text_y,relevance
77,527433,8617271,0,110.13462,There are three major types of dysarthria in c...,527433,types of dysarthria from cerebral palsy,3
78,527433,5466810,1,105.96605,Dysarthria is a feature of many neurological d...,527433,types of dysarthria from cerebral palsy,2
79,527433,1379245,2,104.932495,Dysarthria is caused by damage to the brain. T...,527433,types of dysarthria from cerebral palsy,1
80,527433,5466807,3,104.57021,Dysarthria is a motor speech disorder. It resu...,527433,types of dysarthria from cerebral palsy,2
81,527433,7607669,4,104.23117,1 Developmental verbal dyspraxia also known as...,527433,types of dysarthria from cerebral palsy,3
82,527433,1379240,6,103.0704,"In dysarthria, you may have difficulty moving ...",527433,types of dysarthria from cerebral palsy,2
83,527433,3224400,7,102.64935,Traumatic brain injury and brain tumors are al...,527433,types of dysarthria from cerebral palsy,0
84,527433,1664518,8,102.539856,"The key point is that dysarthria is a symptom,...",527433,types of dysarthria from cerebral palsy,1
85,527433,1664523,9,102.46382,Slurred speech is the result of weakened or un...,527433,types of dysarthria from cerebral palsy,1


In [60]:
relevance.head(4)

Unnamed: 0,query_id,doc_id,relevance
0,19335,1017759,0
1,19335,1082489,0
2,19335,109063,0
3,19335,1160863,0
