In [2]:
import argparse
import json
import logging
import os
import random
import sys
from glob import glob
from os.path import join, dirname, basename, exists

import pandas as pd

sys.path.append('.')

from scorer.main import evaluate
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import sent_tokenize

random.seed(0)
ROOT_DIR = os.getcwd()

sbert = SentenceTransformer('paraphrase-distilroberta-base-v1')

INFO : Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v1
INFO : Did not find folder paraphrase-distilroberta-base-v1
INFO : Try to download model from server: https://sbert.net/models/paraphrase-distilroberta-base-v1.zip
INFO : Load SentenceTransformer from folder: /home/borisovai/.cache/torch/sentence_transformers/sbert.net_models_paraphrase-distilroberta-base-v1
INFO : Use pytorch device: cpu


In [3]:
def load_vclaims(dir):
    vclaims_fp = glob(f'{dir}/*.json')
    vclaims_fp.sort()
    vclaims = {}
    vclaims_list = []
    for vclaim_fp in vclaims_fp:
        with open(vclaim_fp) as f:
            vclaim = json.load(f)
        vclaims[vclaim['vclaim_id']] = vclaim
        vclaims_list.append(vclaim)
    return vclaims, vclaims_list

In [11]:
vclaims, vclaims_list = load_vclaims("baselines/politifact-vclaims")
all_iclaims = pd.read_csv("baselines/v1/iclaims.queries", sep='\t', names=['iclaim_id', 'iclaim'])
wanted_iclaim_ids = pd.read_csv("baselines/v1/train.tsv", sep='\t', names=['iclaim_id', '0', 'vclaim_id', 'relevance'])
wanted_iclaim_ids = wanted_iclaim_ids.iclaim_id.tolist()

iclaims = []
for iclaim_id in wanted_iclaim_ids:
    iclaim = all_iclaims.iclaim[all_iclaims.iclaim_id == iclaim_id].iloc[0]
    iclaims.append((iclaim_id, iclaim))

# index = "2b-english"

# # options are title, vclaim, text
# scores = get_scores(iclaims, vclaims_list, index, search_keys=args.keys, size=args.size)
# ngram_baseline_fpath = join(ROOT_DIR,
#                                 'baselines/data/subtask_2b_bm25_english_baselines/v1/train.tsv')
# formatted_scores = format_scores(scores)
# with open(ngram_baseline_fpath, 'w') as f:
#     f.write(formatted_scores)
#     maps, mrr, precisions = evaluate(args.dev_file_path, ngram_baseline_fpath)
# logging.info(f"S-BERT Baseline for Subtask-{args.subtask}--{args.lang}")
# logging.info(f'All MAP scores on threshold from [1, 3, 5, 10, 20, 50, 1000]. {maps}')
# logging.info(f'MRR score {mrr}')
# logging.info(f'All P scores on threshold from [1, 3, 5, 10, 20, 50, 1000]. {precisions}')

In [132]:
query = sbert.encode(iclaim)
texts = [vclaim['text'] for vclaim in vclaims_list]
encoddings_list = []

encoddings_list.append(sbert.encode(sent_tokenize(texts[0])))
encoddings_list.append(sbert.encode(sent_tokenize(texts[1])))
                           



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [143]:
print(encoddings_list[0])
# print(sentences[33:34].shape)
# test_results = util.semantic_search(query, sentences[6:7])
# print(test_results)


[[ 0.13679294  0.34706733 -0.18449005 ...  0.5020159   0.06291248
  -0.18408431]
 [ 0.02565829  0.25984177  0.21223696 ... -0.24807763 -0.13124526
   0.03622891]
 [ 0.09517999  0.21162288  0.00453614 ...  0.12868911 -0.14914143
   0.38807896]
 ...
 [ 0.36941418  0.45837346  0.06119155 ...  0.29594317 -0.01883352
   0.01633249]
 [ 0.07529091  0.3617548   0.0852778  ...  0.2632263  -0.03113406
  -0.16800287]
 [-0.01616891  0.43360144 -0.04188025 ...  0.30927122 -0.00233839
   0.14829366]]


In [144]:
# Compute scores by finding cosine similarity between all claims and the query

score = []

for encodding in encoddings_list:
    print(encodding.shape)
    results = util.semantic_search(query, encodding, top_k=5)
    result_sum = 0
    for result in results:
        print(result)
        result_sum+=result[0]['score']
    average = result_sum / len(results)
    score.append((vclaims_list[encoddings_list.index(encodding)]['vclaim_id'], average))




print("SCORE: :",score)



# score = []

# for result in results:
#     score.append(())

(43, 768)
[{'corpus_id': 6, 'score': 0.16315478}, {'corpus_id': 13, 'score': 0.15716958}, {'corpus_id': 38, 'score': 0.15340334}, {'corpus_id': 18, 'score': 0.15297776}, {'corpus_id': 23, 'score': 0.15165034}]
(32, 768)
[{'corpus_id': 29, 'score': 0.07074496}, {'corpus_id': 5, 'score': 0.061870046}, {'corpus_id': 11, 'score': 0.054680146}, {'corpus_id': 19, 'score': 0.05001956}, {'corpus_id': 27, 'score': 0.049646538}]
SCORE: : [('vclaim-pol-00000', 0.16315478086471558), ('vclaim-pol-00001', 0.07074496150016785)]


  score.append((vclaims_list[encoddings_list.index(encodding)]['vclaim_id'], average))


In [None]:
def get_scores(iclaims, vclaims_list, index, search_keys, size):
    iclaims_count, vclaims_count = len(iclaims), len(vclaims_list)
    scores = {}

    logging.info(f"Geting RM5 scores for {iclaims_count} iclaims and {vclaims_count} vclaims")

    for iclaim_id, iclaim in iclaims:
        score = get_score(iclaim, vclaims_list, index, search_keys=search_keys, size=size)
        scores[iclaim_id] = score
    return scores


def format_scores(scores):
    output_string = ''
    for iclaim_id in scores:
        for i, (vclaim_id, score) in enumerate(scores[iclaim_id]):
            output_string += f"{iclaim_id}\tQ0\t{vclaim_id}\t{i + 1}\t{score}\telasic\n"
    return output_string

In [None]:

def run_baselines(args):
    if not exists('baselines/data'):
        os.mkdir('baselines/data')
    vclaims, vclaims_list = load_vclaims(args.vclaims_dir_path)
    all_iclaims = pd.read_csv(args.iclaims_file_path, sep='\t', names=['iclaim_id', 'iclaim'])
    wanted_iclaim_ids = pd.read_csv(args.dev_file_path, sep='\t', names=['iclaim_id', '0', 'vclaim_id', 'relevance'])
    wanted_iclaim_ids = wanted_iclaim_ids.iclaim_id.tolist()

    iclaims = []
    for iclaim_id in wanted_iclaim_ids:
        iclaim = all_iclaims.iclaim[all_iclaims.iclaim_id == iclaim_id].iloc[0]
        iclaims.append((iclaim_id, iclaim))

    index = f"{args.subtask}-{args.lang}"

    # options are title, vclaim, text
    scores = get_scores(iclaims, vclaims_list, index, search_keys=args.keys, size=args.size)
    ngram_baseline_fpath = join(ROOT_DIR,
                                f'baselines/data/subtask_{args.subtask}_bm25_{args.lang}_{basename(args.dev_file_path)}')
    formatted_scores = format_scores(scores)
    with open(ngram_baseline_fpath, 'w') as f:
        f.write(formatted_scores)
    maps, mrr, precisions = evaluate(args.dev_file_path, ngram_baseline_fpath)
    logging.info(f"S-BERT Baseline for Subtask-{args.subtask}--{args.lang}")
    logging.info(f'All MAP scores on threshold from [1, 3, 5, 10, 20, 50, 1000]. {maps}')
    logging.info(f'MRR score {mrr}')
    logging.info(f'All P scores on threshold from [1, 3, 5, 10, 20, 50, 1000]. {precisions}')


# python baselines/bm25.py --train-file-path=baselines/v1/train.tsv --dev-file-path=baselines/v1/train.tsv --vclaims-dir-path=baselines/politifact-vclaims --iclaims-file-path=baselines/v1/iclaims.queries --subtask=2b --lang=english
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-file-path", "-t", required=True, type=str,
                        help="The absolute path to the training data")
    parser.add_argument("--dev-file-path", "-d", required=True, type=str,
                        help="The absolute path to the dev data")
    parser.add_argument("--vclaims-dir-path", "-v", required=True, type=str,
                        help="The absolute path to the directory with the verified claim documents")
    parser.add_argument("--iclaims-file-path", "-i", required=True,
                        help="TSV file with iclaims. Format: iclaim_id iclaim_content")
    parser.add_argument("--keys", "-k", default=['vclaim', 'title'],
                        help="Keys to search in the document")
    parser.add_argument("--size", "-s", default=19250,
                        help="Maximum results extracted for a query")
    parser.add_argument("--subtask", "-m", required=True,
                        choices=['2a', '2b'],
                        help="The subtask you want to check the format of.")
    parser.add_argument("--lang", "-l", required=True, type=str,
                        choices=['arabic', 'english'],
                        help="The language of the subtask")

    args = parser.parse_args()
    run_baselines(args)