In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import nltk
from nltk.corpus import stopwords
import torch
import random
from tqdm import tqdm
import re
from rank_bm25 import BM25Okapi
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# scikit-learn's built-in English stop words plus the extra term "book".
# `union` returns a new set, so ENGLISH_STOP_WORDS itself is left unchanged.
# The result is what the TF-IDF vectorizer below receives as `stop_words`.
my_stop_words = text.ENGLISH_STOP_WORDS.union(["book"])

# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
def same_seeds(seed):
    """Seed the PyTorch, NumPy and Python random number generators with ``seed``."""
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)


same_seeds(42)

In [3]:
# Load the inputs: relevance labels for the training queries, the document
# collection, and the training / test query sets.
# NOTE(review): only train_ans.csv is read from dataset/ while the other three
# files are read from the working directory (and the final output is written to
# dataset/) — confirm this matches the actual file layout.
train_ans = pd.read_csv('dataset/train_ans.csv')
document = pd.read_csv('document.csv')
train_query = pd.read_csv('train_query.csv')
test_query = pd.read_csv('test_query.csv')

In [4]:
def mean_average_precision(df, answers=None, k=50):
    """Compute MAP@k of ranked retrieval results against relevance labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per query. The "doc" column holds the retrieved document ids
        as a whitespace-separated string, best match first.
    answers : pandas.DataFrame, optional
        Frame whose "doc" column holds the relevant ids (whitespace-separated)
        of each query; rows are looked up by the index label of the matching
        row in ``df``. Defaults to the notebook-level ``train_ans``.
    k : int, default 50
        Only the first ``k`` retrieved documents of each query are scored, and
        the per-query normaliser is ``min(number of relevant docs, k)``.

    Returns
    -------
    float
        Mean of the per-query average precision. 0.0 when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if answers is None:
        answers = train_ans
    if len(df) == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    total_ap = 0.0
    for query_id, ranked_str in df["doc"].items():
        relevant = set(answers.loc[query_id, "doc"].split())
        if not relevant:
            # A query without any labelled relevant document contributes an
            # average precision of 0 instead of raising ZeroDivisionError.
            continue
        hits = 0
        precision_sum = 0.0
        for rank, doc in enumerate(ranked_str.split()[:k], start=1):
            if doc in relevant:
                hits += 1
                precision_sum += hits / rank
        total_ap += precision_sum / min(len(relevant), k)
    return total_ap / len(df)

In [5]:
def chunking(max_len, sent):
    """Lower-case ``sent`` and cut it into pieces of at most ``max_len`` tokens.

    Tokens are produced by splitting on single space characters. Each piece is
    returned as a space-joined string, in the original order.
    """
    words = sent.lower().split(" ")
    # number of pieces needed to cover every token (ceiling division)
    n_chunks = (len(words) + max_len - 1) // max_len

    pieces = []
    for k in range(n_chunks):
        start = k * max_len
        pieces.append(' '.join(words[start:start + max_len]))
    return pieces

In [6]:
# tfidf
# Fit a TF-IDF model on the whole document collection: lower-cased uni- to
# tri-grams, dropping stop words and terms seen in fewer than 4 documents.
# `my_stop_words` is a frozenset, but scikit-learn documents `stop_words` as
# 'english', a list, or None, and releases with parameter validation reject
# other containers — so pass it as a (sorted, hence deterministic) list.
vectorizer = TfidfVectorizer(lowercase=True, stop_words=sorted(my_stop_words), min_df=4, ngram_range=(1,3))
X = vectorizer.fit_transform(tqdm(document['document']))

100%|██████████| 100000/100000 [13:55<00:00, 119.62it/s]


In [8]:
# For every training query, rank all documents by TF-IDF cosine similarity and
# keep the ids of the 50 best matches as one space-separated string.
tfidf_result = []
n_train_queries = len(train_query['train_query'])
for q_idx in tqdm(range(n_train_queries)):
    q_vec = vectorizer.transform([train_query['train_query'][q_idx]])
    # one similarity per document, returned as a single column
    sims = torch.tensor(cosine_similarity(X, q_vec)).squeeze(-1)
    top_rows = sims.topk(50).indices.tolist()
    top_docs = [document['doc'][row] for row in top_rows]
    tfidf_result.append(' '.join(str(doc_id) for doc_id in top_docs))

res_df = pd.DataFrame({'topic': train_ans['topic'], 'doc': tfidf_result})
print('MAP:', mean_average_precision(res_df)) # 0.100

100%|██████████| 15/15 [01:32<00:00,  6.15s/it]

MAP: 0.10079090144345436





In [9]:
# bm25
# Split every document on single spaces and build the BM25 index from the
# resulting token lists (tqdm only reports progress while the index is built).
tokenized_corpus = []
for doc_text in document['document']:
    tokenized_corpus.append(doc_text.split(" "))
bm25 = BM25Okapi(tqdm(tokenized_corpus))

100%|██████████| 100000/100000 [03:10<00:00, 525.29it/s]


In [10]:
# For every training query, score all documents with BM25 and keep the ids of
# the 50 best matches as one space-separated string.
bm_result = []
n_train_queries = len(train_query['train_query'])
for q_idx in tqdm(range(n_train_queries)):
    q_tokens = train_query['train_query'][q_idx].split(" ")
    bm_scores = torch.tensor(bm25.get_scores(q_tokens)).squeeze(-1)
    top_rows = bm_scores.topk(50).indices.tolist()
    top_docs = [document['doc'][row] for row in top_rows]
    bm_result.append(' '.join(str(doc_id) for doc_id in top_docs))

bm_res_df = pd.DataFrame({'topic': train_ans['topic'], 'doc': bm_result})
print('MAP:', mean_average_precision(bm_res_df)) # 0.1109

100%|██████████| 15/15 [00:33<00:00,  2.27s/it]

MAP: 0.11097825998590742





In [12]:
# Candidate pool per training query: the union of the TF-IDF and BM25 top-50
# document ids (still strings at this point).
# dict.fromkeys removes duplicates while keeping first-seen order. A plain set
# would order the id strings by their hashes, which Python randomises per
# process, so the candidate order would change from one kernel run to the next.
combine_output = [
    list(dict.fromkeys(tfidf_ids.split() + bm_ids.split()))
    for tfidf_ids, bm_ids in zip(tfidf_result, bm_result)
]

In [13]:
# Sentence-embedding model used by the cells below to re-rank the candidates
# produced by TF-IDF and BM25.
# NOTE(review): documents are fed to it in 300-space-token chunks; confirm that
# this stays within the model's maximum input length after sub-word tokenisation.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
# Re-rank each training query's candidate pool with sentence embeddings and
# keep the 50 candidates whose embedding is closest to the query embedding.
combine_result = []
for i in range(len(combine_output)):
    r = []
    print(f'Train Query {i+1}')
    # The query embedding is the same for every candidate, so encode it once
    # per query rather than once per candidate document.
    query_inputs = model.encode(train_query['train_query'][i]).tolist()
    for j in tqdm(range(len(combine_output[i]))):
        # candidate ids are strings here; the document table stores them as ints
        doc_text = document[document['doc'] == int(combine_output[i][j])]['document'].values[0]
        # document vector = mean of the embeddings of its 300-token chunks
        doc_inputs = torch.mean(torch.tensor(model.encode(chunking(300, doc_text))), 0).tolist()
        # cosine_similarity returns a 1x1 matrix; pick the scalar explicitly,
        # since float() on a 1-element array is deprecated in recent NumPy.
        r.append(float(cosine_similarity([doc_inputs], [query_inputs])[0, 0]))
    # topk raises when asked for more items than exist, so cap at the pool size
    values, indices = torch.topk(torch.tensor(r), min(50, len(r)))
    res = [combine_output[i][int(indx)] for indx in indices]
    combine_result.append(' '.join(map(str, res)))

res_df2 = pd.DataFrame({'topic':train_query['topic'], 'doc':combine_result})
print()
print('MAP:', mean_average_precision(res_df2)) # # 0.17091147408326807

Train Query 1


  0%|          | 0/86 [00:00<?, ?it/s]

In [None]:
# For every test query, collect the ids of the 50 documents with the highest
# TF-IDF cosine similarity (kept as a list of ids, not a joined string).
tfidf_testing_result = []
n_test_queries = len(test_query['test_query'])
for q_idx in tqdm(range(n_test_queries)):
    q_vec = vectorizer.transform([test_query['test_query'][q_idx]])
    sims = torch.tensor(cosine_similarity(X, q_vec)).squeeze(-1)
    top_rows = sims.topk(50).indices.tolist()
    tfidf_testing_result.append([document['doc'][row] for row in top_rows])

# For every test query, collect the ids of the 50 documents with the highest
# BM25 score (kept as a list of ids, not a joined string).
bm_testing_result = []
n_test_queries = len(test_query['test_query'])
for q_idx in tqdm(range(n_test_queries)):
    q_tokens = test_query['test_query'][q_idx].split(" ")
    bm_scores = torch.tensor(bm25.get_scores(q_tokens)).squeeze(-1)
    top_rows = bm_scores.topk(50).indices.tolist()
    bm_testing_result.append([document['doc'][row] for row in top_rows])
    
# Candidate pool per test query: de-duplicated union of the TF-IDF and BM25 ids.
testing_esemble = [
    list(set(tfidf_ids + bm_ids))
    for tfidf_ids, bm_ids in zip(tfidf_testing_result, bm_testing_result)
]

In [None]:
# Re-rank each test query's candidate pool with sentence embeddings and keep
# the 50 candidates whose embedding is closest to the query embedding.
testing_result2 = []
for i in range(len(testing_esemble)):
    r = []
    print(f'Test Query {i+1}')
    # The query embedding is the same for every candidate, so encode it once
    # per query rather than once per candidate document.
    query_inputs = model.encode(test_query['test_query'][i]).tolist()
    for j in tqdm(range(len(testing_esemble[i]))):
        doc_text = document[document['doc'] == testing_esemble[i][j]]['document'].values[0]
        # document vector = mean of the embeddings of its 300-token chunks
        doc_inputs = torch.mean(torch.tensor(model.encode(chunking(300, doc_text))), 0).tolist()
        # cosine_similarity returns a 1x1 matrix; pick the scalar explicitly,
        # since float() on a 1-element array is deprecated in recent NumPy.
        r.append(float(cosine_similarity([doc_inputs], [query_inputs])[0, 0]))
    # topk raises when asked for more items than exist, so cap at the pool size
    values, indices = torch.topk(torch.tensor(r), min(50, len(r)))
    res = [testing_esemble[i][int(indx)] for indx in indices]
    testing_result2.append(' '.join(map(str, res)))

testing_df = pd.DataFrame({'topic':test_query['topic'], 'doc':testing_result2})

In [None]:
# Write the re-ranked test results (columns: topic, doc) without the index column.
# NOTE(review): the trailing 0.06358 is presumably the score this submission
# received — confirm and record where it came from.
testing_df.to_csv('dataset/output.csv', index=False) # 0.06358