# Initialization

In [1]:
# Prints a fixed greeting; presumably a quick check that the kernel executes cells.
print("Hello there.")

Hello there.


In [2]:
# Colab specific data upload
!unzip cran.zip
!unzip cisi.zip

Archive:  cran.zip
replace cran/cran.all.1400? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Archive:  cisi.zip
replace cisi/CISI.ALL? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [2]:
# Auto re-import .py files
%load_ext autoreload
%autoreload 2

# Plotting with plt
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import pprint

from preprocessing import *
from synonym_enrich import *
from validate import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
!pip install --upgrade gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)




# Validation Core




## CISI dataset

In [5]:
# Preprocessing pipelines shared by the configurations below (applied in order).
_CLEAN_STEPS = (tokenize_and_clean,)
_LEMMA_STEPS = (tokenize_and_clean, lemmatize)
_SYNEN_STEPS = (tokenize_and_clean, lemmatize, synonym_enrich)


def _cisi_config(preprocessings, use_tfidf, use_lsi=False, d=50,
                 embedding=None, use_position_vector=None):
    """Build the keyword arguments for one validate() run on the CISI files.

    Every call returns freshly created containers, so no two configurations
    share a list or dict.

    Args:
        preprocessings: iterable of preprocessing callables, in application order.
        use_tfidf: value stored under "use_tfidf".
        use_lsi: value stored under "use_lsi".
        d: value stored under "d".
        embedding: value stored under "embedding" (None or a loaded gensim model).
        use_position_vector: stored under "use_position_vector" only when it is
            not None; when None the key is omitted entirely so validate() sees
            no such argument.

    Returns:
        dict of keyword arguments, ready for validate(**config).
    """
    config = {
        "data_parse_fnt": parseDocs,
        "retrieval_fnt": read_cisi_relevancy,
        "doc_args": ["cisi/CISI.ALL"],
        "query_args": ["cisi/CISI.QRY"],
        "retrieval_args": {"path": "cisi/CISI.REL"},
        "preprocessings": list(preprocessings),
        "use_tfidf": use_tfidf,
        "use_lsi": use_lsi,
        "d": d,
        "embedding": embedding,
    }
    if use_position_vector is not None:
        config["use_position_vector"] = use_position_vector
    config["k"] = 10
    config["test_size"] = 0.3
    return config


# One entry per experiment; the key doubles as the label used in `results`.
# Each embedding-based entry loads its own model instance from disk.
params = {
    # TF-IDF runs, with and without LSI
    "clean_baseline": _cisi_config(_CLEAN_STEPS, use_tfidf=True),
    "lemma_baseline": _cisi_config(_LEMMA_STEPS, use_tfidf=True),
    "lemma_baseline_dimred": _cisi_config(
        _LEMMA_STEPS, use_tfidf=True, use_lsi=True),
    "synEn_baseline": _cisi_config(_SYNEN_STEPS, use_tfidf=True),
    "synEn_baseline_dimred": _cisi_config(
        _SYNEN_STEPS, use_tfidf=True, use_lsi=True),
    # Word2Vec embedding with use_position_vector=True
    "clean_posVec": _cisi_config(
        _CLEAN_STEPS, use_tfidf=False,
        embedding=Word2Vec.load("w2v.model"), use_position_vector=True),
    "lemma_posVec": _cisi_config(
        _LEMMA_STEPS, use_tfidf=False,
        embedding=Word2Vec.load("w2v.model"), use_position_vector=True),
    "synEn_posVec": _cisi_config(
        _SYNEN_STEPS, use_tfidf=False,
        embedding=Word2Vec.load("w2v.model"), use_position_vector=True),
    # Doc2Vec embedding with use_position_vector=False
    "clean_d2v": _cisi_config(
        _CLEAN_STEPS, use_tfidf=False,
        embedding=Doc2Vec.load("d2v.model"), use_position_vector=False),
    "lemma_d2v": _cisi_config(
        _LEMMA_STEPS, use_tfidf=False,
        embedding=Doc2Vec.load("d2v.model"), use_position_vector=False),
    # NOTE(review): d is 5 here but 50 in every other configuration; kept
    # exactly as it was — confirm whether the difference is intentional.
    "synEn_d2v": _cisi_config(
        _SYNEN_STEPS, use_tfidf=False, d=5,
        embedding=Doc2Vec.load("d2v.model"), use_position_vector=False),
}

# Metrics per configuration name, filled in by the run cells that follow.
results = {}
# One-shot alternative that runs everything in a single cell:
# pprint.pprint({k: validate(**v) for k, v in params.items()})

In [6]:
# Evaluate "clean_baseline" (TF-IDF, tokenize_and_clean only) and store its metrics under the same key.
run_name = "clean_baseline"
results[run_name] = validate(**params[run_name])

In [7]:
# Evaluate "lemma_baseline" (TF-IDF, cleaning + lemmatize) and store its metrics under the same key.
run_name = "lemma_baseline"
results[run_name] = validate(**params[run_name])

In [8]:
# Evaluate "lemma_baseline_dimred" (TF-IDF with use_lsi=True, cleaning + lemmatize) and store its metrics under the same key.
run_name = "lemma_baseline_dimred"
results[run_name] = validate(**params[run_name])

In [9]:
# Evaluate both synonym-enriched TF-IDF configurations.
# "synEn_baseline" is defined in params but no cell ever ran it, which left the
# non-LSI synonym variant out of the `results` comparison; run it here together
# with its use_lsi=True counterpart.
for run_name in ("synEn_baseline", "synEn_baseline_dimred"):
    results[run_name] = validate(**params[run_name])

In [10]:
# Evaluate "clean_posVec" (Word2Vec embedding, use_position_vector=True, cleaning only) and store its metrics under the same key.
# NOTE(review): the recorded output shows a warning pointing at the cosine-similarity
# division; presumably a zero-norm vector reached it — confirm in the validate module.
run_name = "clean_posVec"
results[run_name] = validate(**params[run_name])

  cos_sim = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))


In [11]:
# Evaluate "lemma_posVec" (Word2Vec embedding, use_position_vector=True, cleaning + lemmatize) and store its metrics under the same key.
# NOTE(review): the recorded output shows a warning pointing at the cosine-similarity
# division; presumably a zero-norm vector reached it — confirm in the validate module.
run_name = "lemma_posVec"
results[run_name] = validate(**params[run_name])

  cos_sim = np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))


In [12]:
# Evaluate "synEn_posVec" (Word2Vec embedding, use_position_vector=True, cleaning + lemmatize + synonym_enrich) and store its metrics under the same key.
run_name = "synEn_posVec"
results[run_name] = validate(**params[run_name])

In [13]:
# Evaluate "clean_d2v" (Doc2Vec embedding, use_position_vector=False, cleaning only) and store its metrics under the same key.
run_name = "clean_d2v"
results[run_name] = validate(**params[run_name])

In [14]:
# Evaluate "lemma_d2v" (Doc2Vec embedding, use_position_vector=False, cleaning + lemmatize) and store its metrics under the same key.
run_name = "lemma_d2v"
results[run_name] = validate(**params[run_name])

In [15]:
# Evaluate "synEn_d2v" (Doc2Vec embedding, use_position_vector=False, cleaning + lemmatize + synonym_enrich) and store its metrics under the same key.
run_name = "synEn_d2v"
results[run_name] = validate(**params[run_name])

In [17]:
# Dump every collected metric dict, pretty-printed with pprint's default settings.
print(pprint.pformat(results))

{'clean_baseline': {'F1': 5.508094176247431,
                    'MRR': 62.2463768115942,
                    'mAP': 55.13041561229966,
                    'precision': 2.8320428826682553,
                    'precision@10': 33.043478260869556,
                    'recall': 100.0,
                    'recall@10': 33.6231884057971},
 'clean_d2v': {'F1': 4.829850919764029,
               'MRR': 10.91614906832298,
               'mAP': 11.17494824016563,
               'precision': 2.474687313877308,
               'precision@10': 4.347826086956523,
               'recall': 100.0,
               'recall@10': 5.6521739130434785},
 'clean_posVec': {'F1': 4.181123713444324,
                  'MRR': 5.041407867494825,
                  'mAP': 5.041407867494825,
                  'precision': 2.135199523525908,
                  'precision@10': 2.1739130434782608,
                  'recall': 100.0,
                  'recall@10': 2.1739130434782608},
 'lemma_baseline': {'F1': 4.931888814662058,

In [30]:
# Ad-hoc single configuration for quick experiments.
# It gets its own name on purpose: rebinding `params` here would replace the
# dict of named configurations that the earlier run cells index into
# (params["clean_baseline"], ...), so re-running any of them after this cell
# would fail with a KeyError.
adhoc_params = {
    "data_parse_fnt": parseDocs,
    "retrieval_fnt": read_cisi_relevancy,
    "doc_args": ["cisi/CISI.ALL",],
    "query_args": ["cisi/CISI.QRY",],
    "retrieval_args": {
        "path": "cisi/CISI.REL",},
    # synonym_enrich is deliberately left out of this run.
    "preprocessings": [
        tokenize_and_clean,
        lemmatize,
    ],
    "use_tfidf": False,
    "use_lsi": False,
    # NOTE(review): the named Doc2Vec runs use use_position_vector=False;
    # confirm that True is intended together with a Doc2Vec embedding.
    "use_position_vector": True,
    "d": 5,
    # Other embeddings tried here: None, Word2Vec.load("w2v.model").
    "embedding": Doc2Vec.load("d2v.model"),
    # NOTE(review): the key is spelled "simimarity_threshold"; it is left
    # untouched because it has to match whatever keyword validate() accepts —
    # confirm against its signature.
    "simimarity_threshold": 0.8,
    "k": 10,
    "test_size": 0.3,
}

pprint.pprint(validate(**adhoc_params))

{'F1': 5.952036983530771,
 'MRR': 8.899240855762594,
 'mAP': 8.816425120772946,
 'precision': 3.067301965455629,
 'precision@10': 5.6521739130434785,
 'recall': 100.0,
 'recall@10': 5.6521739130434785}


In [31]:
# Spot check of recall() on a tiny hand-made input; the recorded output is 0.2.
# NOTE(review): presumably the list holds per-result relevance flags and 10 is the
# total number of relevant documents (2 / 10 = 0.2) — confirm against recall's definition.
recall([0,1,1], 10)

0.2