In [None]:
# Deliberate guard cell: raising here aborts execution at the top of the
# notebook, because (per the message) the cells below take a long time to run.
# Skip or remove this cell to run the rest of the notebook.
raise Exception("RUNNING THE CELLS BELOW WILL TAKE A LONG TIME")

# Analysis Question 3

(5 points) For each of the LSI models you built over AP, and for LDA, select the
five top significant topics from your model. Check the top terms in each topic. Which
topics actually represent a particular subject? Analyse the results. Do you observe a
difference?

In [None]:
import os
import json
import pickle as pkl
from collections import defaultdict, Counter

import numpy as np
from tqdm import tqdm

import read_ap
import download_ap

from gensim_corpus import GensimCorpus

from trec import TrecAPI
%load_ext autoreload
%autoreload 2
from lda import LatentDirichletAllocation
from lsi import LatentSemanticIndexing

from trec import TrecAPI

# Dataset download is left disabled; enable it if the AP data is not present:
# download_ap.download_dataset()

# Load the pre-processed documents. The name is reset to None first --
# presumably to drop a reference left over from an earlier run before the
# reload (TODO confirm intent).
docs_by_id = None
docs_by_id = read_ap.get_processed_docs()

# Create every output directory used by this notebook (no-op when present).
for output_dir in ("results", "saved_models/sim_temps", "raw_output"):
    os.makedirs(output_dir, exist_ok=True)

# Bag-of-words corpus and the LDA wrapper built on top of it.
gensim_corpus = GensimCorpus(docs_by_id, embedding="bow")
lda = LatentDirichletAllocation(gensim_corpus)

In [None]:
# Fetch the topic listing and display its first five entries.
# NOTE(review): num_topics=-1 presumably requests every topic, and the slice
# keeps whichever five come first -- confirm that this ordering reflects
# significance for LDA.
lda_topics = lda.model.print_topics(num_topics=-1)
lda_topics[:5]

In [None]:
# Rebuild the corpus with the TF-IDF embedding and create the LSI wrapper on it.
gensim_corpus = GensimCorpus(docs_by_id, embedding="tfidf")
lsi = LatentSemanticIndexing(gensim_corpus, embedding="tfidf")

# Display the first five entries of the topic listing
# (num_topics=-1 presumably requests every topic -- confirm).
lsi_tfidf_topics = lsi.model.print_topics(num_topics=-1)
lsi_tfidf_topics[:5]

In [None]:
# Rebuild the corpus with the bag-of-words embedding and create the LSI wrapper on it.
gensim_corpus = GensimCorpus(docs_by_id, embedding="bow")
lsi = LatentSemanticIndexing(gensim_corpus, embedding="bow")

# Display the first five entries of the topic listing
# (num_topics=-1 presumably requests every topic -- confirm).
lsi_bow_topics = lsi.model.print_topics(num_topics=-1)
lsi_bow_topics[:5]

# Analysis Question 4

In [None]:
import os
import json
import pickle as pkl
import operator
from pprint import pprint
import argparse

# Names of the model families / tuning sweeps handled in this notebook.
# NOTE(review): "lsa_bow" has no counterpart among the result dictionaries
# below (they use "lda_bow") -- confirm whether this entry is a typo.
model_names = [
    "lsi_bow",
    "lsi_tfidf",
    "doc2vec",
    "doc2vec_vocab_size",
    "doc2vec_window_size",
    "doc2vec_vec_dim",
    "word2vec",
    "lsa_bow"
]

# Result file of the best-performing run per retrieval method.
best_run_results = {
    "tfidf":"./results/tfidf.json",
    "word2vec": "./results/skip_gram.json",
    "doc2vec":"./results/doc2vec_vocab_size_50000_results_trec.json",
    "lsi_tfidf":"./results/lsi-tfidf-embedding-2000-topics.json",
    "lsi_bow":"./results/lsi-bow-embedding-2000-topics.json",
    "lda_bow":"./results/lda-500-topics.json"
}

# Result file of the default-parameter run per retrieval method.
default_run_results = {
    "tfidf":"./results/tfidf.json",
    "word2vec": "./results/skip_gram.json",
    "doc2vec":"./results/doc2vec_vocab_size_10000_results_trec.json",
    "lsi_tfidf":"./results/lsi-tfidf-embedding-500-topics.json",
    "lsi_bow":"./results/lsi-bow-embedding-500-topics.json",
    "lda_bow":"./results/lda-500-topics.json"
}


def _sweep_results(label_template, path_template, values):
    """Build the ``{label: result file}`` mapping of one parameter sweep.

    Args:
        label_template: format string for the human-readable setup label.
        path_template: format string for the path of the run's result file.
        values: swept parameter values, in the order they should be listed.

    Returns:
        dict mapping each formatted label to its formatted path. Both
        templates receive the same value, so a label cannot disagree with the
        file it points to (the hand-written table labelled the 10000-word
        vocabulary run as "vocab size 1000").
    """
    return {
        label_template.format(value): path_template.format(value)
        for value in values
    }


# One dictionary per tuned parameter. Cells further down resolve these by
# name via `<model> + "_results"`, so the variable names must stay as they are.
lsi_bow_results = _sweep_results(
    "{} topics",
    "./results/lsi-bow-embedding-{}-topics.json",
    (10, 50, 100, 500, 1000, 2000),
)

lsi_tfidf_results = _sweep_results(
    "{} topics",
    "./results/lsi-tfidf-embedding-{}-topics.json",
    (10, 50, 100, 500, 1000, 2000),
)

doc2vec_window_size_results = _sweep_results(
    "window size {}",
    "./results/doc2vec_window_size_{}_results_trec.json",
    (5, 10, 15, 20),
)

doc2vec_vec_dim_results = _sweep_results(
    "vec dim {}",
    "./results/doc2vec_vec_dim_{}_results_trec.json",
    (200, 300, 400, 500),
)

doc2vec_vocab_size_results = _sweep_results(
    "vocab size {}",
    "./results/doc2vec_vocab_size_{}_results_trec.json",
    (10000, 25000, 50000, 100000, 200000),
)

# All doc2vec sweeps merged into one mapping (labels are unique across sweeps).
doc2vec_results = dict(
    list(doc2vec_window_size_results.items()) +
    list(doc2vec_vec_dim_results.items()) +
    list(doc2vec_vocab_size_results.items()))

# AQ 4.1

(5 points) Report the retrieval performance in terms of MAP and nDCG for all
of the methods, on a) all queries, and b) queries 76-100. To be precise, you need to report
24 numbers in a table.

In [None]:
print("\n\n############# AQ4.1 #############\n\n")

# Per-query results, overall means and evaluation-subset means, keyed by model.
default_all_results_per_setup = {}
default_results_per_setup = {}
default_eval_results_per_setup = {}

for model, fn in default_run_results.items():
    with open(fn, "r") as f:
        res = json.load(f)
    # Drop the "all" entry (if any) so only per-query records are averaged.
    res.pop("all", None)
    default_all_results_per_setup[model] = res

    # Mean of each metric over every query in the run.
    default_results_per_setup[model] = {
        metric: sum(q[metric] for q in res.values()) / len(res)
        for metric in ("map", "ndcg")
    }

    # Query ids run from 51 to 200; ids 76-100 form the evaluation set, which
    # is the only part that should be used for parameter tuning.
    query_id_range = [str(qid) for qid in range(76, 101)]
    default_eval_results_per_setup[model] = {
        metric: sum(res[qid][metric] for qid in query_id_range) / len(query_id_range)
        for metric in ("map", "ndcg")
    }

print("RETRIEVAL PERFORMANCE ON ALL QUERIES:")
pprint(default_results_per_setup)

print("\nRETRIEVAL PERFORMANCE ON EVAL QUERIES 76-100")
pprint(default_eval_results_per_setup)

# AQ 4.2

(5 points) Signicance testing is a way of showing that if the dierence observed
between the performance of two models is due to chance or not. Use pytrec eval to
perform a t-test between every pair of models, on the complete query set. You need to
report the results of six tests.a

In [None]:
from itertools import combinations
from scipy.stats import ttest_rel

# Paired t-test on per-query MAP for every unordered pair of models; only the
# p-value (index 1 of the test result) is printed.
model_pairs = combinations(list(default_all_results_per_setup.keys()), 2)

for model_1, model_2 in model_pairs:
    first_results = default_all_results_per_setup[model_1]
    second_results = default_all_results_per_setup[model_2]

    # Restrict the comparison to queries present in both runs so the two
    # score lists stay aligned element by element.
    query_ids = list(set(first_results.keys()) & set(second_results.keys()))

    first_scores = []
    second_scores = []
    for query_id in query_ids:
        first_scores.append(first_results[query_id]["map"])
        second_scores.append(second_results[query_id]["map"])

    p_value = ttest_rel(first_scores, second_scores)[1]
    print("For {} and {}: {}".format(model_1, model_2, p_value))


# AQ 4.3

(5 points) Report the retrieval performance in terms of MAP and nDCG for all
of the methods, on a) all queries, and b) test queries. Use the parameters leading to the
best performance on validation set. Attach the results corresponding to each run to
your report.

In [None]:
# Parameter sweep to inspect; must match the prefix of one of the
# `<name>_results` dictionaries defined in the configuration cell.
model = "lsi_bow"

# Query ids run from 51 to 200; ids 76-100 form the evaluation set, which is
# the only part that should be used for parameter tuning.
query_id_range = [str(qid) for qid in range(76, 101)]

results_per_setup = {}
param_results = {}
# eval() resolves the sweep dictionary from its variable name.
for setup, fn in eval(model + "_results").items():
    with open(fn, "r") as f:
        all_results = json.load(f)
    param_results[setup] = all_results

    # Mean MAP of this setup over the evaluation queries.
    eval_maps = [all_results[qid]["map"] for qid in query_id_range]
    results_per_setup[setup] = sum(eval_maps) / len(query_id_range)

# Setup with the highest mean MAP (the first one wins on ties).
setup, mean_eval_map = max(results_per_setup.items(), key=lambda item: item[1])

print("######### TO DETERMINE THE BEST SETUP ###########\n\n")
print("ALL RESULTS:")
pprint(results_per_setup)
print("\nBEST RESULT:\nWith MAP of: {0:.4f} best setup: {1}".format(mean_eval_map, setup))

In [None]:
# Query ids run from 51 to 200; ids 76-100 form the evaluation set, which is
# the only part that should be used for parameter tuning.
query_id_range = list(str(qid) for qid in range(76, 101))

# Per-query results, overall means and evaluation-subset means for the
# best-parameter run of every model.
all_results_per_setup = {}
results_per_setup = {}
eval_results_per_setup = {}
for model, fn in best_run_results.items():
    with open(fn, "r") as f:
        res = json.load(f)
    # Drop the "all" entry (if any) so only per-query records are averaged.
    res.pop("all", None)
    all_results_per_setup[model] = res

    # Mean of each metric over every query in the run.
    results_per_setup[model] = {}
    results_per_setup[model]["map"] = sum(q["map"] for q in res.values()) / len(res)
    results_per_setup[model]["ndcg"] = sum(q["ndcg"] for q in res.values()) / len(res)

    # Mean of each metric over the evaluation queries only.
    eval_results_per_setup[model] = {}
    eval_results_per_setup[model]["map"] = sum(res[qid]["map"] for qid in query_id_range) / len(query_id_range)
    eval_results_per_setup[model]["ndcg"] = sum(res[qid]["ndcg"] for qid in query_id_range) / len(query_id_range)

# Report the best-run numbers computed above. The previous version printed
# the default_* dictionaries here, which merely repeated the AQ4.1 tables.
print("RETRIEVAL PERFORMANCE ON ALL QUERIES:")
pprint(results_per_setup)

print("\nRETRIEVAL PERFORMANCE ON EVAL QUERIES 76-100")
pprint(eval_results_per_setup)

# AQ 4.4

AQ4.4: (5 points) Perform a t-test between the result of each method using the best
parameters and the default ones used in AQ4.1. Describe your observations.

In [None]:
from itertools import combinations
from scipy.stats import ttest_rel

# Paired t-test on per-query MAP between the default run and the best run of
# each model; only the p-value (index 1 of the test result) is printed.
for model in default_all_results_per_setup:

    first_results = default_all_results_per_setup[model]
    second_results = all_results_per_setup[model]

    # Restrict the comparison to queries present in both runs so the two
    # score lists stay aligned element by element.
    query_ids = list(
        set(first_results.keys()) & set(second_results.keys()))

    first_scores = [
        first_results[query_id]["map"] for query_id in query_ids]
    second_scores = [
        second_results[query_id]["map"] for query_id in query_ids]

    # Several models use the same result file for the default and the best
    # run. All paired differences are then zero, the t statistic is 0/0 and
    # the p-value comes out as NaN, so report that case explicitly instead.
    if first_scores == second_scores:
        print("For {}: default and best runs have identical per-query MAP "
              "(t-test undefined)".format(model))
        continue

    print("For {}: {}".format(model, ttest_rel(first_scores, second_scores)[1]))


# AQ 4.5

AQ4.5: (5 points) For each parameter, plot the retrieval performance in terms of MAP
with respect to the value of the parameter, for a) all queries, and b) test queries. Describe
your findings.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Parameter sweeps to plot; each name is the prefix of a `<name>_results`
# dictionary defined in the configuration cell.
tune_models = [
    "lsi_bow", "lsi_tfidf", "doc2vec_vocab_size", "doc2vec_window_size", "doc2vec_vec_dim"
]

# Query ids of the evaluation subset (76-100).
query_id_range = list(str(qid) for qid in range(76, 101))

for model in tune_models:
    results_per_setup = {}
    eval_results_per_setup = {}
    # eval() resolves the sweep dictionary from its variable name.
    for setup, fn in eval(model + "_results").items():
        with open(fn, "r") as f:
            res = json.load(f)
        # Drop the "all" entry (if any), as the other cells do, so that it is
        # not averaged in as if it were one more query.
        res.pop("all", None)

        results_per_setup[setup] = sum(q["map"] for q in res.values()) / len(res)
        eval_results_per_setup[setup] = sum(res[qid]["map"] for qid in query_id_range) / len(query_id_range)

    fig, ax = plt.subplots(1, 2, figsize=(20, 6))

    # Left panel: mean MAP over all queries.
    setups = list(results_per_setup.keys())
    maps = list(results_per_setup.values())
    ax[0].bar(setups, maps, color="orange", label="MAP on full results")
    ax[0].set_title(model)
    ax[0].set_ylabel("MAP")
    ax[0].legend()

    # Right panel: mean MAP over the evaluation queries. The values must come
    # from eval_results_per_setup; the previous version re-plotted the
    # full-result values here.
    setups = list(eval_results_per_setup.keys())
    maps = list(eval_results_per_setup.values())
    ax[1].bar(setups, maps, label="MAP on eval results")
    ax[1].set_title(model)
    ax[1].set_ylabel("MAP")
    # A legend per axes: plt.legend() would only reach the current (last) one.
    ax[1].legend()

    plt.savefig("./results/param-tuning-plots-{}.png".format(model))
    plt.show()

# AQ 4.6

AQ4.6: (5 points) Doing a query-level analysis provides insights on the weaknesses and
strengths of the retrieval models. For each of the four retrieval methods implemented
above, find success and failure cases: queries for which the MAP was highest or lowest.
Analyse the results, possibly with checking the qrels file. Discuss why you think each case
happened.

In [None]:
# Load the relevance judgements and the queries. Later cells index both by
# query-id string: `queries[qid]` is printed as the query, and `qrels[qid]` is
# a mapping whose keys are printed as the relevant documents.
qrels, queries = read_ap.read_qrels()

In [None]:
# For every model's best run, record the (query id, MAP) pair with the highest
# and with the lowest MAP.
res_per_model = {}

for model, fn in best_run_results.items():
    with open(fn, "r") as f:
        res = json.load(f)
    # Drop the "all" entry (if any) and keep only the MAP of each query.
    res.pop("all", None)
    res = {qid: scores["map"] for qid, scores in res.items()}

    map_of = lambda item: item[1]
    res_per_model[model] = {
        "best": max(res.items(), key=map_of),
        "worst": min(res.items(), key=map_of),
    }

# Print both extreme queries per model with their text and judged documents.
for model, stats in res_per_model.items():
    best, best_map = stats["best"]
    worst, worst_map = stats["worst"]

    print("\n#######{}########".format(model))
    print("With MAP of {}, BEST query {}:\n".format(best_map, best))
    print(queries[best])
    print("relevant docs:\n", qrels[best].keys(), len(qrels[best].keys()))
    print("\nWith MAP of {}, WORST query {}:\n".format(worst_map, worst))
    print(queries[worst])
    print("relevant docs:\n", qrels[worst].keys(), len(qrels[worst].keys()))

# AQ 4.7

AQ4.7: (5 points) Find the top-5 queries that have the highest variance in terms of MAP
between different retrieval models. Provide an analysis why the performance of the models
differs a lot on these queries compared to the rest.

In [None]:
from collections import defaultdict

# query id -> {model name -> MAP of that model's best run on the query}
map_per_qid = defaultdict(dict)

for model, fn in best_run_results.items():
    with open(fn, "r") as f:
        res = json.load(f)
    # Drop the "all" entry (if any) and keep only the MAP of each query.
    res.pop("all", None)
    res = {qid: scores["map"] for qid, scores in res.items()}

    for qid, val in res.items():
        map_per_qid[qid][model] = val

# Variance of the MAP across the models that have a score for each query.
var_per_qid = {
    qid: np.var(list(vals.values()))
    for qid, vals in map_per_qid.items()
}

# Rank queries by descending variance and keep the five most contested ones.
sorted_vars = sorted(var_per_qid.items(), key=lambda item: -item[1])
highest_var = sorted_vars[:5]

In [None]:
# For each high-variance query, print its variance and text, then the MAP
# every model achieved on it, followed by a blank separator line.
for q, var in highest_var:
    query_text = queries[q]
    print("Variance: {} Query - {}: {}".format(var, q, query_text))

    per_model_map = map_per_qid[q]
    for model in per_model_map:
        print("MAP for model: {} - {}".format(model, per_model_map[model]))
    print()