## Info

This file trains credibility classifier based on the NIST assessors credibility judgments to be used to improve BM25 results in credibility and correctness. To this end, 

- we split the 50 topics into 10 validation sets of 5 topics and trained classifiers on the remaining 45 topics.
- this ensured the classifier is alien to the documents discussing validation topics
- and it predicts the documents' credibility. 
- ClueWeb12-B13 documents

The inputs

- BM25 run: Our BM25 run computed using Anserini
- 10fold_groups.txt: the 10 validation sets of 5 topics
- qrels_correctness.txt shared by NIST
- infected.txt to filter out malicious documents.

Outputs:

- 10 logistic regression classifiers

In [1]:

import pandas as pd
import os,sys,re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from joblib import dump, load


In [2]:
infecteds = pd.read_csv('/home/ludwig/Documents/DecisionRUN/trec/infected.txt', header=None, sep = ' ')
infecteds.columns = ['DOCID']

In [3]:
bm25run = pd.read_csv('treceval/UWatMDS_BM25.txt', header=None, sep = ' ')
bm25run.columns = ['TID','QID','DOCID','REL','COR','CRE']
bm25run = bm25run[~bm25run.DOCID.isin(infecteds.DOCID)]

qrels = pd.read_csv('qrels_correctness.txt', header=None, sep = ' ')
qrels.columns = ['TID','QID','DOCID','REL','COR','CRE']
qrels = qrels[~qrels.DOCID.isin(infecteds.DOCID)]
qrels = qrels[qrels.CRE.isin([0,1])]
qrels.head()

Unnamed: 0,TID,QID,DOCID,REL,COR,CRE
0,1,0,clueweb12-0000wb-03-01030,1,0,0
1,1,0,clueweb12-0000wb-47-24784,1,0,1
6,1,0,clueweb12-0002wb-08-02435,2,0,1
8,1,0,clueweb12-0002wb-42-30714,2,0,1
13,1,0,clueweb12-0004wb-25-32570,1,0,1


## Classification

In [4]:
qrels.shape

(4159, 6)

In [6]:
# for docname in doclist['DOCID']:
DOCS_DIR = '/media/ludwig/story/DecisionRunDocs/trec_decision_parts/trec_decision_docs/'
SAVE_DIR = 'model/'
counter = 1
docs = []
for docname in qrels['DOCID'].drop_duplicates():
    try:
        with open(DOCS_DIR + docname) as fh:
            docs.append(fh.read())
    except:
        docs.append('!DOCTYPE')
        print('there is a problem with %s' % counter)
    if counter % 1000 == 0:
        print(counter)
    counter += 1

there is a problem with 546
there is a problem with 567
there is a problem with 841
there is a problem with 895
there is a problem with 998
1000
there is a problem with 1334
there is a problem with 1898
there is a problem with 1996
2000
there is a problem with 2488
there is a problem with 2593
3000
there is a problem with 3040
there is a problem with 3455
there is a problem with 3591
there is a problem with 3617
there is a problem with 3619
there is a problem with 3626
4000


In [7]:
mapper = pd.DataFrame(qrels.DOCID.drop_duplicates())
mapper['DOCS'] = docs

In [8]:
mapper.head()

Unnamed: 0,DOCID,DOCS
0,clueweb12-0000wb-03-01030,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,clueweb12-0000wb-47-24784,"<html>\n<head>\n<META http-equiv=""Content-Type..."
6,clueweb12-0002wb-08-02435,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
8,clueweb12-0002wb-42-30714,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
13,clueweb12-0004wb-25-32570,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01 T..."


In [9]:
qrels   = qrels.merge(mapper, on = 'DOCID', how = 'left')

In [10]:
qrels.head()

Unnamed: 0,TID,QID,DOCID,REL,COR,CRE,DOCS
0,1,0,clueweb12-0000wb-03-01030,1,0,0,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,1,0,clueweb12-0000wb-47-24784,1,0,1,"<html>\n<head>\n<META http-equiv=""Content-Type..."
2,1,0,clueweb12-0002wb-08-02435,2,0,1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
3,1,0,clueweb12-0002wb-42-30714,2,0,1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
4,1,0,clueweb12-0004wb-25-32570,1,0,1,"<!DOCTYPE html PUBLIC ""-//W3C//DTD HTML 4.01 T..."


In [11]:
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import Pipeline
from joblib import dump, load

## Kfold

In [12]:
topics = sorted(qrels['TID'].unique())

In [13]:
splts = pd.read_csv('10fold_groups.txt', header=None)
splts.shape

(50, 2)

In [14]:
splts[splts[0] == 1][1].values

array([34, 23, 31, 24, 48])

In [27]:
counter = 7

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [29]:
splts[splts[0] == 7]

Unnamed: 0,0,1
30,7,19
31,7,46
32,7,4
33,7,47
34,7,41


In [31]:
train_index = splts[splts[0] != 7][1].values
test_index  = splts[splts[0] == 7][1].values

In [33]:
test_index

array([19, 46,  4, 47, 41])

In [34]:
vect = CountVectorizer(ngram_range=(4,4), analyzer='char', binary=True)
logreg = LogisticRegression()
pline  = Pipeline([('vectorizer', vect), ('logreg', logreg)])
test_topics  = test_index
train_topics = train_index
train, test  = qrels[~qrels['TID'].isin(test_topics)], qrels[qrels['TID'].isin(test_topics)]

In [37]:
train, test  = qrels[~qrels['TID'].isin(test_topics)], qrels[qrels['TID'].isin(test_topics)]

In [41]:
train        = train[~train['DOCID'].isin(test['DOCID'])]
X_train      = train['DOCS'].values.tolist()
X_test       = test['DOCS'].values.tolist()
y_train, y_test = train['CRE'], test['CRE']

In [43]:
vect = CountVectorizer(ngram_range=(4,4), analyzer='char', binary=True)
logreg = LogisticRegression()
pline  = Pipeline([('vectorizer', vect), ('logreg', logreg)])
pline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='char', binary=True,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(4, 4), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                  

In [29]:
# accs = []
counter = 1
for gr in splts[0].unique():
    train_index = splts[splts[0] != gr][1].values
    test_index  = splts[splts[0] == gr][1].values
    vect = CountVectorizer(ngram_range=(4,4), analyzer='char', binary=True)
    logreg = LogisticRegression()
    pline  = Pipeline([('vectorizer', vect), ('logreg', logreg)])
    test_topics  = test_index
    train_topics = train_index
    train, test  = qrels[~qrels['TID'].isin(test_topics)], qrels[qrels['TID'].isin(test_topics)]
    print(train.TID.unique())
    print(test.TID.unique())
    train        = train[~train['DOCID'].isin(test['DOCID'])]
    X_train      = train['DOCS'].values.tolist()
    X_test       = test['DOCS'].values.tolist()
    y_train, y_test = train['CRE'], test['CRE']
    print('# of datapoint is %s' % len(X_train))
    vect = CountVectorizer(ngram_range=(4,4), analyzer='char', binary=True)
    logreg = LogisticRegression()
    pline  = Pipeline([('vectorizer', vect), ('logreg', logreg)])
    pline.fit(X_train, y_train)
    dump(pline, 'LOGREG_10fold_v2_%s.joblib' % counter)
    counter += 1
#     y_pred_class = pline.predict(X_test)
#     print(metrics.accuracy_score(y_test, y_pred_class))
#     accs.append(metrics.accuracy_score(y_test, y_pred_class))


[ 1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 17 18 19 20 21 22 25 26 27
 28 29 30 32 33 35 36 37 38 39 40 41 42 43 44 45 46 47 49 50 51]
[23 24 31 34 48]
# of datapoint is 3731
[ 2  4  5  6  8  9 10 11 12 13 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30 31 32 33 34 35 36 37 38 39 40 41 42 43 45 46 47 48 49 50 51]
[ 1  3  7 15 44]
# of datapoint is 3922
[ 1  2  3  4  5  6  7  8 10 12 13 15 16 17 18 19 20 22 23 24 27 28 29 30
 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51]
[ 9 11 21 25 26]
# of datapoint is 3720
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 15 17 19 20 21 22 23 24 25 26 27
 28 29 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50]
[16 18 30 32 51]
# of datapoint is 3799
[ 1  2  3  4  6  7  8  9 11 12 13 15 16 17 18 19 21 22 23 24 25 26 27 28
 29 30 31 32 33 34 36 37 39 40 41 42 43 44 45 46 47 48 49 50 51]
[ 5 10 20 35 38]
# of datapoint is 3817
[ 1  2  3  4  5  6  7  8  9 10 11 15 16 17 18 19 20 21 22 23 24 25 26 27
 29 30 31 32 33 34 35 36 37 38 39 40 