# Challenges

## Setup

Our collection is in the same directory as the code files. The folder is called rcv1, which has 2 folders inside: D_train and D_test. Each one of those contains folders corresponding to a certain day and each one of those folders contains documents.

In [13]:
# Collect the documents referenced by each relevance-judgment (qrels) file.
D_train = getEvaledDocs('qrels.train')
D_test = getEvaledDocs('qrels.test')
# One index per collection, stored under "trainIndex" / "testIndex".
# NOTE(review): the first positional argument is presumably `load`
# (False = build the index instead of loading an existing one) -- confirm
# against WooshDocumentIndex.
trainIndex = WooshDocumentIndex(False, "trainIndex", D_train)
testIndex = WooshDocumentIndex(False, "testIndex", D_test)

100%|██████████| 4557/4557 [01:27<00:00, 52.37it/s]
100%|██████████| 33920/33920 [10:28<00:00, 53.97it/s]  


In [30]:
# Topic index built from topics.txt.
# NOTE(review): "topicsindexdir" is presumably the on-disk index directory -- confirm against TopicIndex.
topicIndex=TopicIndex("topics.txt","topicsindexdir")

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from Preprocess import Preprocess
from xml_documents_parser import DocumentParser
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import * 
from util import *
import matplotlib.pyplot as plt
import math
import functools
import sys
from yellowbrick.cluster import KElbowVisualizer

def file_to_string(file):
    """Return the whole content of the text file at path ``file``.

    The file is opened in text mode with the default encoding and is closed
    before returning, even if reading raises.
    """
    # A context manager guarantees the handle is released; the previous
    # version left the file open until garbage collection.
    with open(file) as ptr:
        return ptr.read()

def calcOptimNclusters(vectorspace, krange=(2,30)):
    """Show a silhouette-based elbow plot for choosing the number of clusters.

    A ``KElbowVisualizer`` wrapping a default ``KMeans`` model is fitted on
    ``vectorspace`` for the cluster counts given by ``krange`` and the
    resulting figure is displayed.

    NOTE(review): the function ends with ``exit()``, so it never returns and
    terminates the running interpreter once the plot has been shown.
    """
    elbow = KElbowVisualizer(KMeans(), k=krange, metric='silhouette', timings=True)
    elbow.fit(vectorspace)  # evaluates the model over the requested k range
    elbow.show()
    exit()
    
def cohesion(labels, distances):
    """Return a per-cluster cohesion score.

    For every distinct value in ``labels`` the score is the sum, over all
    ordered pairs of documents carrying that label (self-pairs included), of
    ``(pointDistance(a, b) / 2) ** 2`` where ``a`` and ``b`` are the rows of
    ``distances`` for the two documents.

    @input labels: numpy array with one cluster label per document
    @input distances: per-document coordinates, indexable by document position
    @output dict mapping each label to its accumulated score
    """
    # pointDistance comes from util; presumably a Euclidean distance -- confirm.
    per_cluster = {}
    for cluster_id in np.unique(labels):
        members = np.where(labels == cluster_id)[0]
        accumulated = 0
        for first in members:
            anchor = distances[first]
            for second in members:
                half_gap = pointDistance(anchor, distances[second]) / 2
                accumulated += half_gap ** 2
        per_cluster[cluster_id] = accumulated
    return per_cluster

def separation(labels, distances):
    """Return a per-cluster separation score.

    For every distinct value in ``labels`` the score is the sum, over every
    pair (document in the cluster, document outside the cluster), of
    ``(pointDistance(a, b) / 2) ** 2`` where ``a`` and ``b`` are the rows of
    ``distances`` for the two documents.

    @input labels: numpy array with one cluster label per document
    @input distances: per-document coordinates, indexable by document position
    @output dict mapping each label to its accumulated score
    """
    ssbDict = {}
    for label in np.unique(labels):
        clusterDocs = np.where(labels == label)[0]
        # The set of outside documents depends only on the label, so it is
        # computed once per cluster instead of once per cluster document.
        nonClusterDocs = np.where(labels != label)[0]
        ssb = 0
        for doc1 in clusterDocs:
            for doc2 in nonClusterDocs:
                ssb += (pointDistance(distances[doc1], distances[doc2])/2) ** 2
        ssbDict[label] = ssb
    return ssbDict

## Clustering
__@input:__

__D__ has the structure of a list of filenames or a list of topic names

__col__ 'docs' for documents, anything else for topics

__@output__

__clustersDescription__ list of pairs: (cluster centroid coordinates, list of the documents assigned to that cluster)

__clusters__ array of arrays of document id's

__vectorspace__ Tfidfvectorizer's fit transform output

__labels__ labels

__distances__ 2-D PCA projection of each document's distances to the cluster centroids

In [32]:
def clustering(D, col, number_clusters, minDocFreq = 2, maxDocFreq = 0.9):
    """Cluster documents or topics with k-means over a term-frequency space.

    @input D: list of file names (``col == 'docs'``) or list of topic texts
    @input col: 'docs' to run ``D`` through ``noPreprocessDocs`` first,
        anything else to vectorize ``D`` as given
    @input number_clusters: number of k-means clusters
    @input minDocFreq, maxDocFreq: ``min_df`` / ``max_df`` of the vectorizer
    @output clustersDescription: one ``((x, y), members)`` pair per cluster,
        where ``(x, y)`` are the first two coordinates of the centroid and
        ``members`` are the entries of ``D`` assigned to the cluster
    @output clusters: one array of document positions per cluster
    @output vectorspace: sparse matrix returned by the vectorizer
    @output labels: cluster label of every document
    @output distances: 2-D PCA projection of the document-to-centroid distances

    NOTE(review): for ``col == 'docs'`` the entries in ``members`` are the
    items returned by ``noPreprocessDocs`` (presumably document texts, not
    file names) -- confirm this is intended.
    """
    if col == 'docs':
        D = noPreprocessDocs(D)

    vectorizer = TfidfVectorizer(use_idf = False, min_df= minDocFreq, max_df=maxDocFreq, stop_words='english')
    vectorspace = vectorizer.fit_transform(D)

    kmeans = KMeans(n_clusters=number_clusters)
    labels = kmeans.fit_predict(vectorspace)
    centroids = kmeans.cluster_centers_
    # transform(), not fit_transform(): refitting would start a new k-means
    # run, so the distances would refer to different clusters than `labels`
    # and `centroids`. transform() also accepts the sparse matrix directly.
    distances = kmeans.transform(vectorspace)

    # Reduce the (n_documents, n_clusters) distance matrix to two dimensions.
    distances = PCA(2).fit_transform(distances)

    clustersDescription = []
    clusters = []
    for label in np.unique(labels):
        members = np.where(labels == label)[0]
        # First two centroid coordinates (the second one used to repeat
        # column 0 by mistake).
        coordinates = (centroids[label, 0], centroids[label, 1])
        clustersDescription.append((coordinates, [D[position] for position in members]))
        clusters.append(members)
    return clustersDescription, clusters, vectorspace, labels, distances

## Interpret
__@input:__

__cluster__ array of document id's

__D__ has the structure of a list of filenames or a list of topic names

__col__ 'docs' for document analysis, 'topics' for topic analysis

__@output__

__medoid__ document Id or topic name

__mean__ float

In [None]:
def interpret(cluster, D, distances, col):
    """Describe one cluster by its medoid and its mean pairwise distance.

    @input cluster: collection of document positions belonging to the cluster
    @input D: list of file names or topic names, indexed by position
    @input distances: per-document coordinates, indexed by position
    @input col: 'docs' to report the medoid as ``D[position]``, 'topics' to
        report it as ``'R' + str(101 + position)``; any other value returns
        the raw position
    @output medoid: the member with the smallest summed distance to all
        members of the cluster (first one wins on ties)
    @output mean: mean of all pairwise distances inside the cluster
        (self-distances included)

    For an empty cluster the medoid position stays at 0 and the mean is NaN.
    """
    # Resolve membership once, in ascending position order, instead of
    # re-testing `in cluster` inside the nested loops.
    members = [i for i in range(len(distances)) if i in cluster]

    # pointDistance comes from util; presumably a Euclidean distance -- confirm.
    distanceMatrix = {
        i: [pointDistance(distances[i], distances[j]) for j in members]
        for i in members
    }

    smallest = sys.maxsize  # local name chosen so the builtin min() is not shadowed
    medoid = 0
    for i, row in distanceMatrix.items():
        total = float(sum(row))
        if total < smallest:
            smallest = total
            medoid = i

    mean_distance = np.mean(list(distanceMatrix.values()))
    if col == 'docs':
        medoid = D[medoid]
    elif col == 'topics':
        # Topic identifiers are assumed to start at R101.
        medoid = 'R' + str(101 + medoid)
    return medoid, mean_distance

## Evaluation
__@input:__

__D__ has the structure of a list of filenames or a list of topic name

__vectorspace__ Tfidfvectorizer's fit transform output

__labels__ labels

__distances__ distances vector of distances between documents

__col__ 'docs' for document analysis, 'topics' for topic analysis (note: the `evaluation` function below does not take this argument)

__@output__

__sil_score__ float

__avgCohesion__ float

__avgSeparation__ float

In [1]:
    def evaluation(D, vectorspace, labels, distances):
    sil_score = silhouette_score(vectorspace, labels, metric = 'cosine')
    #print('silhouette score = {}'.format(sil_score))
    avgCoheision = np.average(list(cohesion(labels, distances).values()))
    avgSeparation = np.average(list(separation(labels, distances).values()))
    return sil_score, avgCoheision, avgSeparation

In [33]:
# Gather the judged test documents of five sample topics into one list.
D = []
for i in ['R101' , 'R121', 'R150', 'R170', 'R180']:
    D += getEvaledDocsForTopic('qrels.test', i, 'test')
description, clusters, vectorspace, labels, distances = clustering(D, 'docs', number_clusters = 11)
# NOTE(review): `description` holds the member list of every cluster; the
# recorded run of this cell hit the notebook's IOPub data rate limit.
print(description)
# Medoid and mean intra-cluster distance of the first cluster only.
medoid, mean = interpret(clusters[0], D, distances, 'docs')
print("Medoid: ", medoid)
print("Mean: ", mean)
# Global quality measures of the clustering.
sil_score, avgCoheision, avgSeparation = evaluation(D, vectorspace, labels, distances)
print("Silhouette: ", sil_score)
print("AvgCoheision: ", avgCoheision)
print("AvgSeparation: ", avgSeparation)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



NameError: name 'interpret' is not defined

## Classification

## Training

__@input:__

__q__ string with topic name

__Dtrain__ list of document names

__Rtrain__ relevance feedback list

__distances__ distances vector of distances between documents

__classifier_type__ 'logistic' or 'XGBOOST'

__@output__

__classifier__ classifier object

In [10]:
from logistic import LogisticClassifier
from xGBoost import XGBOOSTClassifier
from util import getCollection,setupEvalManyFeatures,setupEvalOneFeature
from util import getRelevantNonRelevant,compute_metrics,RRF
from util import show_graphics_metrics_IRmodels,show_metrics_classifiers
from statistics import mean
from woosh_index import WooshDocumentIndex,TopicIndex

def training(q,DTrain,RTrain,**args):
    '''
    @input topic document q ∈ Q, training collection Dtrain, judgments Rtrain, and
    optional arguments on the classification process; the keyword
    ``classifier_type`` selects the model: 'logistic' or 'XGBOOST'
    @behavior instantiates the selected classifier and trains it to predict the
    relevance of documents on the topic q using Dtrain and Rtrain
    @output q-conditional classification model
    @raises NotImplementedError if ``classifier_type`` is missing or unsupported

    NOTE(review): only ``classifier_type`` is read from the keyword arguments;
    any other option (e.g. hyper-parameters) is currently ignored.
    '''
    classifier_type = args.get('classifier_type')
    if classifier_type == 'logistic':
        classifier = LogisticClassifier()
    elif classifier_type == 'XGBOOST':
        classifier = XGBOOSTClassifier()
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        # ValueError is deliberately avoided: evaluate() in this notebook
        # catches ValueError around training and would report the mistake as
        # a single-class data problem.
        raise NotImplementedError(
            "unsupported classifier_type: {!r} (expected 'logistic' or 'XGBOOST')".format(classifier_type))
    classifier.train(q,DTrain,RTrain)
    return classifier

## Classify

__@input:__

__d__ document features (document belongs to Dtest)

__q__ topic name

__M__ classifier model


__@output__

__probability__ probability of document d being classified as relevant for topic q

In [22]:
def classify(d,q,M,**args):
    '''
    @input feature representation d of a document, topic name q, and M, a
    mapping from topic name to the classifier trained for that topic
    @behavior looks up the classifier stored for q and asks it to classify d;
    additional keyword arguments are accepted and ignored
    @output whatever the topic classifier's ``classify(d, q)`` returns (the
    probabilistic relevance output of document d for topic q)
    '''
    return M[q].classify(d,q)

## Evaluate

__@input:__

__Qtest__ list of topic names

__Dtest__ list of document names

__RTrain__ list of relevance feedback for train collection

__RTest__ list of relevance feedback for test collection

__trainX__ list of document features for train collection

__testX__ list of document features for test collection

__@output__

__aidedRanked__ statistics for ranked retrieval aided by relevance feedback

__aidedNonRanked__ statistics for simple retrieval aided by relevance feedback

__nonAided__ statistics for simple retrieval

__classifier_metrics__ general statistics

In [23]:
def evaluate(Qtest,DTest,**args):
    '''
    @input subset of topics Qtest ⊆ Q, testing document collection Dtest (indexed
    by topic), and keyword arguments RTrain, RTest, trainX, testX,
    classifier_type, k and ranking_type
    @behavior for every topic, first scores the unaided ranking (test documents
    sorted by the RRF of their feature vector), then trains a topic classifier
    and lets it produce the aided statistics. When training or the classifier's
    evaluation raises ValueError, the unaided figures are reused for that topic.
    Finally the averages are stored under 'Avg MAP' and 'Avg BPREF'
    ('Avg MAP' only for the classifier metrics)
    @output aidedRanked, aidedNonRanked, nonAided, classifier_metrics: dictionaries
    keyed by topic name plus the average keys

    NOTE(review): a ``hyper_parameters`` keyword supplied by callers is accepted
    through **args but is never read or forwarded here.
    '''
    train_judgments = args.get('RTrain')
    test_judgments = args.get('RTest')
    train_features = args.get('trainX')
    test_features = args.get('testX')
    chosen_classifier = args.get('classifier_type')
    cutoff = args.get('k')  # k used for retrieval
    ranking_mode = args.get('ranking_type')

    models = {}
    aidedRanked, aidedNonRanked, nonAided, classifier_metrics = {}, {}, {}, {}

    for topic in Qtest:
        relevant, nonRelevant = getRelevantNonRelevant(topic)

        # Non-aided IR: order the test documents by the RRF of their features.
        scored = sorted(zip(test_features[topic], DTest[topic]),
                        key=lambda pair: RRF(pair[0]), reverse=True)
        baseline_ranking = [entry[1] for entry in scored]
        (precision, recall, fscoreVal,
         precision_recall_curve, bpref, avg_prec) = compute_metrics(
            baseline_ranking, relevant, nonRelevant, cutoff)
        nonAided[topic] = [precision, recall, fscoreVal,
                           precision_recall_curve, bpref, avg_prec]

        # Aided IR: train the topic model and let it evaluate itself.
        try:
            models[topic] = training(topic, train_features, train_judgments,
                                     classifier_type=chosen_classifier)
            outcome = models[topic].evaluate(
                topic, DTest, test_judgments,
                k=cutoff, testX=test_features, ranking_type=ranking_mode,
                relevant=relevant, nonRelevant=nonRelevant)
            aidedNonRanked[topic], aidedRanked[topic], classifier_metrics[topic] = outcome
        except ValueError:
            print("For topic ", topic
                  ,"the classifier needs samples of at least 2 classes in the data"
                  , "but the data contains only one class: 1")
            # Fall back to the values of the non-aided run.
            fallback = nonAided[topic]
            aidedNonRanked[topic] = fallback
            aidedRanked[topic] = fallback
            classifier_metrics[topic] = [precision, recall, fscoreVal, avg_prec]

    # Averages: position 5 holds the average precision, position 4 the BPREF.
    for stats in (nonAided, aidedRanked, aidedNonRanked):
        stats['Avg MAP'] = mean([stats[name][5] for name in stats])
        stats['Avg BPREF'] = mean([stats[name][4] for name in stats if name != 'Avg MAP'])

    classifier_metrics['Avg MAP'] = mean(
        [classifier_metrics[name][3] for name in classifier_metrics])
    return aidedRanked, aidedNonRanked, nonAided, classifier_metrics

### Setup

In [16]:
# All topics R101 .. R200.
Qtest=['R'+str(i) for i in range(101,201,1)]
# Smaller topic subset kept for quick experiments:
#Qtest=['R103','R102','R142','R145','R131','R145','R198']#,'R145']#,'R106','R107','R195','R175']

# Collections and relevance judgments for the selected topics, then the
# document indexes opened with load=True and the topic index.
trainfiles,testfiles,DTrain,RTrain,DTest,RTest=getCollection(Qtest)
trainDocsIndex=WooshDocumentIndex(load=True,dir_name="trainIndex",files=trainfiles)
testDocsIndex=WooshDocumentIndex(load=True,dir_name="testIndex",files=testfiles)
topicIndex=TopicIndex("topics.txt","topicsindexdir")

# Feature representation: 'many' for several features per document,
# 'one' for a single feature in the vector (bm25).
features='many'#features='one' for only one feature in the vector (bm25)
if features=='many':
    trainX, testX = setupEvalManyFeatures(Qtest, DTrain, DTest,
                              trainDocsIndex,testDocsIndex,topicIndex)
elif features=='one':
    trainX, testX = setupEvalOneFeature(Qtest, DTrain, DTest,
                                  trainDocsIndex,testDocsIndex,topicIndex)

### Training

In [21]:
# One logistic model per topic, keyed by topic name as classify() expects.
classifierModel=dict()
classifierModel['R103']=training('R103',trainX,RTrain,classifier_type='logistic')
classifierModel['R102']=training('R102',trainX,RTrain,classifier_type='logistic')

### Classify

In [24]:
# Classify the first two documents listed for topic R103.
# NOTE(review): both documents are taken from the training collection
# (DTrain / trainX), not from the test collection.
document1_name=DTrain['R103'][0]# first document listed for the topic
document1_features=trainX['R103'][0]# its feature representation
probabilities1=classify(document1_features,'R103',classifierModel)
print('DOCUMENT WITH ID ',document1_name, ' with probabilities ', probabilities1,'for topic R103')
document2_name=DTrain['R103'][1]
document2_features=trainX['R103'][1]
probabilities2=classify(document2_features,'R103',classifierModel)
print('DOCUMENT WITH ID ',document2_name, ' with probabilities ', probabilities2,'for topic R103')

DOCUMENT WITH ID  2738  with probabilities  [0.77665883 0.22334117] for topic R103
DOCUMENT WITH ID  6062  with probabilities  [0.71780783 0.28219217] for topic R103


### Evaluate

In [29]:
# Runs the three evaluation scenarios and collects their statistics, with the
# display names kept in parallel lists.
k=10
IRmodels=[]
IRmodels_names=[]
classifiers=[]
classifiers_names=[]

# --- Logistic regression, ranking by probability, no hyper-parameter tuning ---
hyper_parameters=None

l_aidedRanked,l_aidedNonRanked,l_nonAided,l_classifier_metrics=evaluate(Qtest,DTest,
    trainX=trainX,testX=testX,RTrain=RTrain,RTest=RTest,k=k,classifier_type='logistic',ranking_type='proba',
    hyper_parameters=hyper_parameters)

IRmodels+=l_nonAided,l_aidedNonRanked,l_aidedRanked
IRmodels_names+='Not Aided',' Logistic Aided Not Ranked without Hyper Paremeter Tuning','Logistic Aided Ranked By Probability without Hyper Parameter Tuning'
classifiers.append(l_classifier_metrics)
classifiers_names.append('Logistic Classifier without Hyper Parametrization')

# --- Logistic regression, ranking by probability, with hyper-parameter tuning ---
# NOTE(review): evaluate() as defined in this notebook never reads the
# `hyper_parameters` keyword, so this run currently reproduces the untuned
# one -- confirm and wire the parameters through to the classifier.
hyper_parameters=hyper_parameter_search('logistic',trainX,RTrain)

l_aidedRanked,l_aidedNonRanked,l_nonAided,l_classifier_metrics=evaluate(Qtest,DTest,
    trainX=trainX,testX=testX,RTrain=RTrain,RTest=RTest,k=k,classifier_type='logistic',ranking_type='proba',
    hyper_parameters=hyper_parameters)

IRmodels+=l_aidedNonRanked,l_aidedRanked
IRmodels_names+='Logistic Aided Not Ranked with Hyper Paremeter Tuning','Logistic Aided Ranked By Probability with Hyper Parameter Tuning'
classifiers.append(l_classifier_metrics)
classifiers_names.append('Logistic Classifier with Hyper Parametrization')

# --- XGBoost, ranking by score ---
x_aidedRanked,x_aidedNonRanked,x_nonAided,x_classifier_metrics=evaluate(Qtest,DTest,
    trainX=trainX,testX=testX,RTrain=RTrain,RTest=RTest,k=k,classifier_type='XGBOOST',ranking_type='score')

IRmodels+=x_aidedNonRanked,x_aidedRanked
# This run ranks with ranking_type='score'; the label used to say
# "By Probability", which mislabelled the results.
IRmodels_names+='XGBoost Aided Not Ranked', 'XGBoost Aided Ranked By Score'
classifiers.append(x_classifier_metrics)
classifiers_names.append('XGBOOST Classifier')

For topic  R175 the classifier needs samples of at least 2 classes in the data but the data contains only one class: 1
Best Score: 0.74
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.773974358974359
Best Hyperparameters: {'C': 0.04716237184469433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7820512820512822
Best Hyperparameters: {'C': 0.0002012896661878088, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7878542510121458
Best Hyperparameters: {'C': 0.013653836035069424, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7785714285714287
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9333333333333332
Best Hyperparameters: {'C': 0.015864765456573147, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9512820512820512
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9454545454545453
Best Hyperparameters: {'C': 0.0002012896661878088, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.7
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9450292397660819
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9436363636363637
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9651515151515152
Best Hyperparameters: {'C': 33.72108309441093, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8824175824175823
Best Hyperparameters: {'C': 0.0043963310508178515, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9199999999999999
Best Hyperparameters: {'C': 0.015864765456573147, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9355555555555555
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.6733333333333332
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.8
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9666666666666666
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.8466666666666667
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8345454545454546
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8639705882352942
Best Hyperparameters: {'C': 0.0002012896661878088, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8428571428571429
Best Hyperparameters: {'C': 0.0004493894149083743, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9618181818181817
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.819047619047619
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.7535714285714287
Best Hyperparameters: {'C': 0.0005290265559831401, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.86
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8428571428571429
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9218181818181819
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.819047619047619
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.96
Best Hyperparameters: {'C': 33.72108309441093, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9047619047619048
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9419047619047619
Best Hyperparameters: {'C': 0.01970088126798927, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9155555555555555
Best Hyperparameters: {'C': 0.015864765456573147, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8380952380952381
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7266666666666668
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8688888888888888
Best Hyperparameters: {'C': 0.007722748477617412, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9800000000000001
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9489473684210525
Best Hyperparameters: {'C': 0.0002012896661878088, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9099999999999999
Best Hyperparameters: {'C': 0.015864765456573147, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8803030303030303
Best Hyperparameters: {'C': 0.015864765456573147, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8075757575757576
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.86
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9236363636363636
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9200000000000002
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9473684210526315
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8142857142857143
Best Hyperparameters: {'C': 0.007722748477617412, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9358974358974359
Best Hyperparameters: {'C': 2.8101186312421578, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8761904761904763
Best Hyperparameters: {'C': 0.09149222561428132, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8066666666666666
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9418181818181818
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.8777777777777779
Best Hyperparameters: {'C': 0.0002012896661878088, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.909090909090909
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8166666666666667
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9800000000000001
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8514285714285714
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8642857142857142
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9305555555555556
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9366666666666668
Best Hyperparameters: {'C': 4.4627818740211815e-05, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.6794871794871795
Best Hyperparameters: {'C': 0.0043963310508178515, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7464285714285714
Best Hyperparameters: {'C': 0.007722748477617412, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9436363636363637
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8533333333333333
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.8666666666666668
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.6871794871794872
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.890909090909091
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.8714285714285713
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9358974358974359
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7694444444444445
Best Hyperparameters: {'C': 0.0004493894149083743, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8571428571428571
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7975
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.8555555555555555
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.8716666666666667
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.8285714285714286
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9111111111111111
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.9454545454545455
Best Hyperparameters: {'C': 0.48777805499545607, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.888888888888889
Best Hyperparameters: {'C': 0.0004493894149083743, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9333333333333332
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9121212121212121
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9179487179487179
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9384615384615385
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.7821428571428571
Best Hyperparameters: {'C': 0.0004493894149083743, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.5454545454545454
Best Hyperparameters: {'C': 1.876650861993697e-05, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8133333333333335
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.7127272727272729
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.8428571428571429
Best Hyperparameters: {'C': 3.522289925286184, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8755555555555556
Best Hyperparameters: {'C': 0.007722748477617412, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.8227272727272729
Best Hyperparameters: {'C': 0.9793160285030893, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9083333333333332
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.8833333333333332
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.925
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.9217948717948719
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.7875
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}


  "Setting penalty='none' will ignore the C and l1_ratio "


Best Score: 0.8321428571428571
Best Hyperparameters: {'C': 0.008301451461243866, 'penalty': 'none', 'solver': 'lbfgs'}
Best Score: 0.9179487179487179
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.6523809523809524
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}




Best Score: 0.9
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.65
Best Hyperparameters: {'C': 0.0013071577689307433, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.8238095238095239
Best Hyperparameters: {'C': 2.8101186312421578, 'penalty': 'l2', 'solver': 'lbfgs'}
For topic  R175 the classifier needs samples of at least 2 classes in the data but the data contains only one class: 1
For topic  R175 the classifier needs samples of at least 2 classes in the data but the data contains only one class: 1


In [31]:
# Print the metrics of every collected classifier configuration for the topics in Qtest.
show_metrics_classifiers(classifiers,classifiers_names,Qtest)

Logistic Classifier without Hyper Parametrization  TOPIC  R101
Precision: 0.709397535151822  Recall: 0.7094884787067197  F-Score 0.7094408578157181  AP: 0.7520561451823972 

Logistic Classifier with Hyper Parametrization  TOPIC  R101
Precision: 0.709397535151822  Recall: 0.7094884787067197  F-Score 0.7094408578157181  AP: 0.7520561451823972 

XGBOOST Classifier  TOPIC  R101
Precision: 0.6796674116261745  Recall: 0.6803957051514055  F-Score 0.6791293552527558  AP: 0.6726361101190428 

Logistic Classifier without Hyper Parametrization  TOPIC  R102
Precision: 0.7138655462184874  Recall: 0.6503946646405808  F-Score 0.6280321590007246  AP: 0.6881654475455521 

Logistic Classifier with Hyper Parametrization  TOPIC  R102
Precision: 0.7138655462184874  Recall: 0.6503946646405808  F-Score 0.6280321590007246  AP: 0.6881654475455521 

XGBOOST Classifier  TOPIC  R102
Precision: 0.6049973418394472  Recall: 0.5917014900173062  F-Score 0.5815217391304348  AP: 0.6387073765628998 

Logistic Classifier 

Precision: 0.4856348470806302  Recall: 0.4894053315105947  F-Score 0.48437500000000006  AP: 0.2743094555821511 

Logistic Classifier with Hyper Parametrization  TOPIC  R189
Precision: 0.4856348470806302  Recall: 0.4894053315105947  F-Score 0.48437500000000006  AP: 0.2743094555821511 

XGBOOST Classifier  TOPIC  R189
Precision: 0.5886260236578709  Recall: 0.5832194121667806  F-Score 0.5856054265956041  AP: 0.2712027973087835 

Logistic Classifier without Hyper Parametrization  TOPIC  R190
Precision: 0.7879464285714286  Recall: 0.734873949579832  F-Score 0.7544391179290508  AP: 0.6656624338768464 

Logistic Classifier with Hyper Parametrization  TOPIC  R190
Precision: 0.7879464285714286  Recall: 0.734873949579832  F-Score 0.7544391179290508  AP: 0.6656624338768464 

XGBOOST Classifier  TOPIC  R190
Precision: 0.7256734006734007  Recall: 0.6251633986928105  F-Score 0.6415154826958106  AP: 0.5455166640155992 

Logistic Classifier without Hyper Parametrization  TOPIC  R191
Precision: 0.47376

KeyError: 'MAP'

## Page Ranking

## build_graph

__@input:__

__D__ list of document names

__sim__ similarity criteria between docs ex: 'tfidf'

__teta__ float minimum similarity to link documents

__minDocFreq__ parameter for tfidf vectorizer

__maxDocFreq__ parameter for tfidf vectorizer


__@output__

__graph__ dictionary representing an undirected graph 

In [None]:
from util import *
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from woosh_index import *

from statistics import mean
from woosh_index import *
from whoosh import scoring
from util import *
import matplotlib.pyplot as plt
import pprint


def build_graph(D, sim, teta, minDocFreq=2, maxDocFreq=0.9):
    """Build a similarity graph linking documents whose similarity exceeds `teta`.

    Args:
        D: list of document paths; also used as the node names of the graph.
        sim: similarity criterion. Only 'tfidf' is implemented.
        teta: minimum similarity (exclusive) for two documents to be linked.
        minDocFreq: forwarded to TfidfVectorizer as `min_df`.
        maxDocFreq: forwarded to TfidfVectorizer as `max_df`.

    Returns:
        dict with three entries:
          'outlinks': node -> list of nodes it links to,
          'inlinks':  node -> list of nodes linking to it,
          'weights':  node -> {neighbour: similarity}.
        Nodes without any link above `teta` do not appear in the graph.

    Raises:
        ValueError: if `sim` is not a supported criterion.
    """
    # Fail early and explicitly; previously an unknown criterion left the
    # vectorizer as None and crashed with an AttributeError after parsing
    # every document.
    if sim != 'tfidf':
        raise ValueError("Unsupported similarity criterion: %r (only 'tfidf' is implemented)" % (sim,))

    docs = noPreprocessDocs(D)
    vectorizer = TfidfVectorizer(min_df=minDocFreq, max_df=maxDocFreq, stop_words='english')
    tfidf = vectorizer.fit_transform(docs)

    # Product of the tf-idf matrix with its transpose: entry (i, j) is the
    # dot product of documents i and j, so the matrix is symmetric and the
    # resulting graph is undirected.
    pairWiseDocSim = (tfidf * tfidf.T).toarray()

    graph = {
        'outlinks': {},
        'inlinks': {},
        'weights': {}
    }
    for i, similarities in enumerate(pairWiseDocSim):
        source = D[i]
        # Only visit the columns that pass the threshold.
        for j in np.flatnonzero(similarities > teta):
            if i == j:
                # A document is never linked to itself.
                continue
            target = D[j]
            graph['outlinks'].setdefault(source, []).append(target)
            graph['inlinks'].setdefault(target, []).append(source)
            graph['weights'].setdefault(source, {})[target] = similarities[j]

    return graph

## undirected_page_rank

__@input:__

__q__ topic name

__D__ list of document names

__p__ limit of retrieved documents

__variant__ 'standard' for traditional page ranking, 'priors' for page ranking with priors

__iterations__ number of iterations for page ranking

__d__ damping factor


__@output__

__ranking__ list of (document name, page rank value) pairs, sorted by decreasing page rank value and limited to p entries

In [None]:
def undirected_page_rank(q, D, p, graph, variant='standard', priorWeighting=None, iterations=10, d=0.15):
    """Rank the nodes of `graph` with iterative PageRank.

    Args:
        q: topic name (e.g. 'R130'); only read by the 'priors' variant.
        D: list of document paths; only read by the 'priors' variant.
        p: maximum number of ranked documents returned.
        graph: dict with 'outlinks', 'inlinks' and 'weights' entries as
            produced by build_graph.
        variant: 'standard' for uniform teleportation and uniform edge
            weights, 'priors' for teleportation proportional to the retrieval
            score of each document for `q` and similarity-weighted edges.
            Any other value leaves the initial uniform scores untouched.
        priorWeighting: Whoosh scoring object used to compute the priors;
            None (default) means scoring.TF_IDF(). Resolved lazily so no
            scorer instance is created at definition time or shared between
            calls.
        iterations: number of update rounds.
        d: weight of the teleportation term; (1 - d) weights the link term.

    Returns:
        list of (file name, score) pairs sorted by decreasing score, at most
        `p` entries long. File names have the form '<id>newsML.xml'.
    """
    nodes = list(graph['outlinks'].keys())
    N = len(nodes)
    outDegrees = {u: len(graph['outlinks'][u]) for u in nodes}
    prestigeVector = {u: float(1 / N) for u in nodes}

    if variant == 'standard':
        for _ in range(iterations):
            updated = {}
            for v in prestigeVector:
                inflow = sum([prestigeVector[u] / outDegrees[u] for u in graph['inlinks'][v]])
                updated[v] = d / N + (1 - d) * inflow
            prestigeVector = updated

    elif variant == 'priors':
        # The topic file and the document index are only needed here, so the
        # standard variant no longer pays for loading them on every call.
        if priorWeighting is None:
            priorWeighting = scoring.TF_IDF()
        topics = TopicIndex("topics.txt", "topicsindexdir").topics
        trainIndex = WooshDocumentIndex(True, "trainIndex", D)
        Dids = [extractFileId(x) + "newsML.xml" for x in D]

        priors = trainIndex.rank(topics[q.lower()], weighting=priorWeighting, k=None)
        fillVector(Dids, priors)   # documents the query did not retrieve get prior 0
        normalizeDict(priors)      # priors sum to 1

        # Total weight attached to each node. It does not change between
        # iterations, so it is computed once instead of inside the innermost loop.
        weightSums = {
            u: sum([graph['weights'][u][neighbour] for neighbour in graph['inlinks'][u]])
            for u in nodes
        }

        for _ in range(iterations):
            updated = {}
            for v in prestigeVector:
                inflow = 0
                for u in graph['inlinks'][v]:
                    inflow += prestigeVector[u] * graph['weights'][u][v] / weightSums[u]
                updated[v] = d * priors[extractFileId(v) + "newsML.xml"] + (1 - d) * inflow
            prestigeVector = updated

    ranked = sorted(prestigeVector.items(), key=lambda x: x[1], reverse=True)
    return [(extractFileId(name) + "newsML.xml", score) for name, score in ranked[:p]]

In [4]:
def evaluatePageRank1(D, p, sim, teta, variant='standard', minDocFreq=2, maxDocFreq=0.9, iterations = 10, d = 0.15):
    """Compare plain TF-IDF retrieval with PageRank over topics R101..R200.

    For every topic the top-`p` documents of both rankings are scored with
    calcMetrics and their precision/recall curves are saved under
    'report/pagerank/q4/' ('<topic>_1.png' for retrieval, '<topic>_2.png'
    for PageRank). The averages over all topics are printed.

    Args:
        D: list of document paths.
        p: number of documents kept from each ranking.
        sim, teta, minDocFreq, maxDocFreq: forwarded to build_graph.
        variant: 'standard' (one topic-independent ranking reused for every
            topic) or 'priors' (one ranking per topic).
        iterations, d: forwarded to undirected_page_rank.

    Raises:
        ValueError: if `variant` is neither 'standard' nor 'priors'.
    """
    # Previously an unknown variant only failed after the graph was built,
    # with a NameError on the first topic.
    if variant not in ('standard', 'priors'):
        raise ValueError("variant must be 'standard' or 'priors', got %r" % (variant,))

    allTopics = ['R' + str(i) for i in range(101, 200 + 1)]
    topicCount = len(allTopics)
    graph = build_graph(D, sim, teta, minDocFreq, maxDocFreq)

    topicQueries = TopicIndex("topics.txt", "topicsindexdir").topics
    trainIndex = WooshDocumentIndex(True, "trainIndex", D)

    metricLabels = ("Precision", "Recall", "FScore", "MAP", "BPREF")
    retrievalSums = [0] * len(metricLabels)
    pageRankSums = [0] * len(metricLabels)

    if variant == 'standard':
        # `d` is now forwarded as well; it used to be silently ignored.
        pageRankResult = undirected_page_rank(None, D, p, graph, variant=variant, iterations=iterations, d=d)

    for topic in allTopics:
        retrieved = trainIndex.rank(topicQueries[topic.lower()], weighting=scoring.TF_IDF(), k=None).items()
        retrieved = sorted(retrieved, key=lambda x: x[1], reverse=True)[:p]

        *scores, recallValues, curve = calcMetrics(retrieved, topic)
        savePrecisionRecallCurve(recallValues, curve, 'report/pagerank/q4/' + topic + '_1.png')
        retrievalSums = [total + score for total, score in zip(retrievalSums, scores)]

        if variant == 'priors':
            # `iterations` and `d` used to be dropped on this path.
            pageRankResult = undirected_page_rank(topic, D, p, graph, variant=variant, iterations=iterations, d=d)

        *scores, recallValues, curve = calcMetrics(pageRankResult, topic)
        savePrecisionRecallCurve(recallValues, curve, 'report/pagerank/q4/' + topic + '_2.png')
        pageRankSums = [total + score for total, score in zip(pageRankSums, scores)]

    print("Normal")
    for label, total in zip(metricLabels, retrievalSums):
        print("Average " + label + ": ", total / topicCount)

    print("With Page Ranking")
    for label, total in zip(metricLabels, pageRankSums):
        print("Average " + label + ": ", total / topicCount)



def calcMetrics(result, topic):
    """Score a ranked result list against the training judgements of `topic`.

    Args:
        result: sequence of (file name, score) pairs; only the names are used.
        topic: topic identifier as it appears in the first column of
            'qrels.train'.

    Returns:
        tuple (precision, recall, fscore, MAP, BPREF, recallValues,
        precisionAtRecall).
    """
    resultDocs = [entry[0] for entry in result]
    evaledDocs = getEvaledDocsForTopic('qrels.train', topic, 'train')

    # Look the judgement up by document id. The previous code paired the i-th
    # file of the directory listing with the i-th qrels line of the topic,
    # which is only correct if both happen to be in the same order.
    relevanceById = {}
    with open('qrels.train', 'r') as qrels:
        for line in qrels:
            fields = line.split()
            if len(fields) >= 3 and fields[0] == topic:
                relevanceById[fields[1]] = int(fields[2])

    relevant = []
    nonRelevant = []
    for doc in evaledDocs:
        docId = extractFileId(doc)
        judgement = relevanceById.get(docId)
        if judgement == 1:
            relevant.append(docId + "newsML.xml")
        elif judgement == 0:
            nonRelevant.append(docId + "newsML.xml")

    precision = calcPrecision(resultDocs, relevant, nonRelevant)
    recall = calcRecall(resultDocs, relevant, nonRelevant)
    fScoreValue = fscore(precision, recall)
    mapValue = MAP(resultDocs, relevant, nonRelevant)
    bpref = BPREF(resultDocs, relevant, nonRelevant)
    recallValues, precisionAtRecall = precisionRecallCurve(resultDocs, relevant, nonRelevant)
    return precision, recall, fScoreValue, mapValue, bpref, recallValues, precisionAtRecall

def evaluatePageRank2(D, p, sim, minDocFreq=2, maxDocFreq=0.9, iterations = 10, d=0.15):
    """Plot how the evaluation measures of standard PageRank vary with teta.

    For teta = 0.10, 0.15, ..., 0.95 a graph is built, ranked with the
    standard variant, and the top-`p` documents are scored against every
    topic R101..R200. The per-threshold averages are printed and plotted.

    Args:
        D: list of document paths.
        p: number of documents kept from the ranking.
        sim, minDocFreq, maxDocFreq: forwarded to build_graph.
        iterations, d: forwarded to undirected_page_rank.
    """
    allTopics = ['R' + str(i) for i in range(101, 200 + 1)]
    topicCount = len(allTopics)

    metricLabels = ("Precision", "Recall", "FScore", "MAP", "BPREF")
    history = [[] for _ in metricLabels]   # one series per metric, indexed like metricLabels
    tetas = []

    # Derive each threshold from an integer step so floating-point error does
    # not accumulate; the same rounded value is used for the graph, the
    # printout and the plot.
    for step in range(18):
        teta = round(0.1 + 0.05 * step, 2)
        graph = build_graph(D, sim, teta, minDocFreq, maxDocFreq)

        # The standard variant does not depend on the topic, so it is ranked
        # once per threshold instead of once per topic. `iterations` and `d`
        # are now forwarded; they used to be ignored.
        pageRankResult = undirected_page_rank(None, D, p, graph, variant='standard', iterations=iterations, d=d)

        sums = [0] * len(metricLabels)
        for topic in allTopics:
            *scores, _recallValues, _precisionAtRecall = calcMetrics(pageRankResult, topic)
            sums = [total + score for total, score in zip(sums, scores)]

        averages = [total / topicCount for total in sums]
        for series, average in zip(history, averages):
            series.append(average)
        tetas.append(teta)

        print("Teta: ", teta)
        for label, average in zip(metricLabels, averages):
            print("Average " + label + ": ", average)
        print()

    precisions, recalls, fScores, maps, bprefs = history

    plt.title("Variation of measures for different values of teta")
    plt.plot(tetas, precisions, label = "precision")
    plt.plot(tetas, recalls, label = "recall")
    plt.plot(tetas, fScores, label = "FScore")
    plt.plot(tetas, maps, label = "MAP")
    plt.plot(tetas, bprefs, label = "BPREF")

    plt.legend()
    plt.xlabel('teta')
    plt.ylabel('measures')
    plt.show()

def evaluatePageRank3(D, p, sim, teta, minDocFreq=2, maxDocFreq=0.9, iterations = 10, d = 0.15):
    """Build the similarity graph and print the top-`p` standard PageRank result.

    Args:
        D: list of document paths.
        p: number of ranked documents to print.
        sim, teta, minDocFreq, maxDocFreq: forwarded to build_graph.
        iterations, d: forwarded to undirected_page_rank (they used to be
            accepted but ignored).
    """
    graph = build_graph(D, sim, teta, minDocFreq, maxDocFreq)
    pageRankResult = undirected_page_rank(None, D, p, graph, variant='standard', iterations=iterations, d=d)
    print(pageRankResult)

In [6]:
# Collect the training documents that have a judgement in qrels.train.
D_train = getEvaledDocs('qrels.train')
# Link documents whose tf-idf similarity is above 0.2.
graph = build_graph(D_train, 'tfidf', 0.2)
# Top-10 documents by standard PageRank; the 'standard' branch never reads the topic argument.
rankedDocs = undirected_page_rank('R130', D_train, 10, graph, variant='standard')
print(rankedDocs)

[('51244newsML.xml', 0.000932902736316949), ('59791newsML.xml', 0.0008932862530311554), ('28141newsML.xml', 0.0008035735804473581), ('72449newsML.xml', 0.0007467131671326183), ('81408newsML.xml', 0.0007348834248564657), ('45867newsML.xml', 0.000733799352203615), ('28710newsML.xml', 0.0007191987885623017), ('37602newsML.xml', 0.0007139687529623545), ('48309newsML.xml', 0.0007094601748488141), ('46422newsML.xml', 0.0007081631720121787)]


In [7]:
# Same ranking with the 'priors' variant for topic R130.
# Relies on `D_train` and `graph` defined by the previous cell.
rankedDocs = undirected_page_rank('R130', D_train, 10, graph, variant='priors')
print(rankedDocs)

[('15776newsML.xml', 0.004812010950659665), ('46458newsML.xml', 0.003973985156417172), ('18235newsML.xml', 0.003464989311777912), ('4974newsML.xml', 0.001891421064515769), ('4063newsML.xml', 0.0018676696349996357), ('47530newsML.xml', 0.0015880578051225337), ('57876newsML.xml', 0.001537283990087255), ('71894newsML.xml', 0.001492581540261339), ('42848newsML.xml', 0.001467237036641215), ('41857newsML.xml', 0.0013837538948919732)]


## Util

In [3]:
from os import listdir
from os.path import join
import os
import re

import matplotlib.pyplot as plt
import math

from xml_documents_parser import DocumentParser
from html_topics_parser import TopicsParser
from Preprocess import Preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

def parseDoc(doc):
    """Parse one XML document and return its preprocessed tokens.

    The path is first rewritten ('d_' -> 'D_', 'newsml' -> 'newsML');
    presumably this restores a path that was lower-cased upstream — confirm
    against the callers. Tokens of the title, text, byline and dateline
    fields are concatenated in that order; missing fields are skipped.
    """
    restoredPath = doc.replace('d_', 'D_').replace('newsml', 'newsML')
    preprocessor = Preprocess()
    fields = DocumentParser().parse(restoredPath)

    tokens = []
    for fieldName in ('title', 'text', 'byline', 'dateline'):
        content = fields.get(fieldName)
        if content is not None:
            tokens.extend(preprocessor.preprocess(content))
    return tokens

def noPreprocessDocs(docs):
    """Return the raw text of every document in `docs`, one string per document.

    Each path is rewritten ('d_' -> 'D_', 'newsml' -> 'newsML') before
    parsing. The title, text, byline and dateline fields that are present
    are concatenated in that order with no separator added between them.
    """
    xmlParser = DocumentParser()
    fieldOrder = ('title', 'text', 'byline', 'dateline')
    rawTexts = []
    for path in docs:
        restoredPath = path.replace('d_', 'D_').replace('newsml', 'newsML')
        fields = xmlParser.parse(restoredPath)
        present = [fields[name] for name in fieldOrder if fields.get(name) is not None]
        rawTexts.append("".join(present))
    return rawTexts

def noPreprocessTopics(path):
    """Return one raw string per topic: title, description and narrative joined by spaces."""
    parsedTopics = TopicsParser().get_data(path)
    return [
        entry['title'] + ' ' + entry['desc'] + ' ' + entry['narr']
        for entry in parsedTopics
    ]

def pointDistance(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    Only the first two coordinates of each point are used. The second term
    previously repeated coordinate 0, so the second coordinate was ignored.
    """
    return math.sqrt((p1[0]-p2[0])**2 + (p1[1]-p2[1])**2)

def getAllFiles(path):
    """List every file one level below `path`.

    Every entry of `path` is treated as a sub-directory; the returned paths
    have the form '<path>/<subdir>/<file>'.
    """
    collected = []
    for subdirName in listdir(path):
        for fileName in listdir(join(path, subdirName)):
            collected.append(path + "/" + subdirName + "/" + fileName)
    return collected

def getEvaledDocs(filename):
    """Return the paths of all collection documents judged in a qrels file.

    Args:
        filename: qrels file whose second whitespace-separated column is the
            document id. 'qrels.train' selects './rcv1/D_train'; any other
            name selects './rcv1/D_test'.

    Returns:
        list of document paths, in directory-listing order, whose id appears
        in the qrels file.
    """
    collectionDir = './rcv1/D_train' if filename == 'qrels.train' else './rcv1/D_test'
    allFiles = getAllFiles(collectionDir)

    # `with` closes the file even if reading fails; lines with fewer than two
    # columns (e.g. a trailing blank line) are skipped instead of crashing.
    with open(filename, 'r') as qrels:
        judgedIds = {fields[1] for fields in (line.split() for line in qrels) if len(fields) > 1}

    return [path for path in allFiles if extractFileId(path) in judgedIds]

def find(name, path):
    """Return the path of the first file called `name` found while walking `path`, or None."""
    for folder, _subfolders, fileNames in os.walk(path):
        if name not in fileNames:
            continue
        return os.path.join(folder, name)
    return None

def extractFileId(path):
    """Return the id of a '<id>newsML.xml' file path, or None if `path` does not match.

    The id is the text between the last '/' before 'newsML.xml' and that
    suffix, e.g. './rcv1/D_train/19960820/2286newsML.xml' -> '2286'.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    match = re.search(r'/[^/]*?newsML\.xml', path)
    if match is None:
        return None
    # Drop the leading '/' and the 10-character 'newsML.xml' suffix.
    return match.group()[1:-10]

def getEvaledDocsForTopic(filename, topic, col): #col 'test' or 'train'
    """Return the paths of the collection documents judged for one topic.

    Args:
        filename: qrels file with whitespace-separated columns
            (topic, document id, ...).
        topic: value matched against the first column.
        col: 'test' to search './rcv1/D_test', 'train' for './rcv1/D_train'.

    Returns:
        list of document paths, in directory-listing order, whose id is
        judged for `topic`.

    Raises:
        ValueError: if `col` is neither 'test' nor 'train' (this used to
            surface as an unbound-variable error).
    """
    collectionDirs = {'test': './rcv1/D_test', 'train': './rcv1/D_train'}
    if col not in collectionDirs:
        raise ValueError("col must be 'test' or 'train', got %r" % (col,))
    allFiles = getAllFiles(collectionDirs[col])

    judgedIds = set()
    # `with` guarantees the file is closed even if a line cannot be processed.
    with open(filename, 'r') as qrels:
        for line in qrels:
            fields = line.split()
            if len(fields) > 1 and fields[0] == topic:
                judgedIds.add(fields[1])

    return [path for path in allFiles if extractFileId(path) in judgedIds]



def getRelevanceForTopic(filename, topic):
    """Return the judgements of `topic`, in the order they appear in the qrels file.

    Args:
        filename: qrels file with whitespace-separated columns
            (topic, document id, relevance).
        topic: value matched against the first column.

    Returns:
        list of ints taken from the third column of the matching lines.
    """
    relevance = []
    # The file used to be left open; `with` closes it in every case.
    with open(filename, 'r') as qrels:
        for line in qrels:
            fields = line.split()
            # Blank lines have no columns and are skipped.
            if fields and fields[0] == topic:
                relevance.append(int(fields[2]))
    return relevance

def VectorizerParseDoc(doc):
    """Tokenizer callback: treat `doc` as a file path and return the document's tokens.

    The path is rewritten ('d_train' -> 'D_train', 'd_test' -> 'D_test',
    'newsml.xml' -> 'newsML.xml') before parsing; presumably this undoes
    lower-casing applied by the vectorizer that calls it — confirm. Tokens
    of the title, text, byline and dateline fields are concatenated in that
    order; missing fields are skipped.
    """
    preprocessor = Preprocess()
    xmlParser = DocumentParser()
    restoredPath = doc.replace('d_train', 'D_train')
    restoredPath = restoredPath.replace('d_test', 'D_test')
    restoredPath = restoredPath.replace('newsml.xml', 'newsML.xml')
    fields = xmlParser.parse(restoredPath)

    tokens = []
    for fieldName in ('title', 'text', 'byline', 'dateline'):
        content = fields.get(fieldName)
        if content is not None:
            tokens.extend(preprocessor.preprocess(content))
    return tokens
    
def getDocsForTopics(filename, topics, allFiles): #col 'test' or 'train'
    """Group the judged document ids and their judgements by topic.

    Args:
        filename: qrels file with whitespace-separated columns
            (topic, document id, relevance).
        topics: container of the topic names to keep.
        allFiles: unused; kept for compatibility with existing callers.

    Returns:
        tuple (docsByTopic, relevanceByTopic, docIds):
          docsByTopic: topic -> list of document ids, in file order,
          relevanceByTopic: topic -> list of int judgements, aligned with
              docsByTopic,
          docIds: set of every document id judged for a kept topic.
    """
    DTrain = dict()
    RTrain = dict()
    docs_ids = set()
    # `with` closes the file even when a malformed line raises.
    with open(filename, 'r') as qrels:
        for line in qrels:
            triplet = line.split()
            # Skip blank or truncated lines instead of failing on them.
            if len(triplet) < 3 or triplet[0] not in topics:
                continue
            docs_ids.add(triplet[1])
            DTrain.setdefault(triplet[0], []).append(triplet[1])
            RTrain.setdefault(triplet[0], []).append(int(triplet[2]))
    return DTrain, RTrain, docs_ids

def getCollectionVector(topics):
    """Return train/test documents of `topics` as term-frequency vectors.

    The judged documents of both collections are vectorized together (one
    shared vocabulary, `use_idf=False`), then every document id in the
    per-topic lists is replaced in place by its dense vector.

    Returns:
        tuple (DTrain, RTrain, DTest, RTest): per-topic vectors and
        judgements for the train and test collections.
    """
    trainFiles = getAllFiles('./rcv1/D_train')
    DTrain, RTrain, trainIds = getDocsForTopics('qrels.train', topics, trainFiles)
    testFiles = getAllFiles('./rcv1/D_test')
    DTest, RTest, testIds = getDocsForTopics('qrels.test', topics, testFiles)

    judgedIds = trainIds.union(testIds)
    judgedFiles = [path for path in trainFiles + testFiles if extractFileId(path) in judgedIds]

    # The vectorizer is fed file paths; VectorizerParseDoc opens and tokenizes each one.
    vectorizer = TfidfVectorizer(tokenizer=VectorizerParseDoc, use_idf=False)
    vectorspace = vectorizer.fit_transform(judgedFiles)

    fileRepresentation = {}
    for row, path in enumerate(judgedFiles):
        fileRepresentation[extractFileId(path)] = vectorspace[row].toarray()[0]

    for topic in topics:
        for position, docId in enumerate(DTrain[topic]):
            DTrain[topic][position] = fileRepresentation[docId]
        for position, docId in enumerate(DTest[topic]):
            DTest[topic][position] = fileRepresentation[docId]
    return DTrain, RTrain, DTest, RTest

def getCollection(topics):
    """Return the judged train/test file paths and the per-topic ids and judgements.

    Returns:
        tuple (TrainFiles, TestFiles, DTrain, RTrain, DTest, RTest) where the
        file lists only contain documents judged for one of `topics`.
    """
    everyTrainFile = getAllFiles('./rcv1/D_train')
    DTrain, RTrain, trainIds = getDocsForTopics('qrels.train', topics, everyTrainFile)
    everyTestFile = getAllFiles('./rcv1/D_test')
    DTest, RTest, testIds = getDocsForTopics('qrels.test', topics, everyTestFile)

    judgedTrainFiles = [path for path in everyTrainFile if extractFileId(path) in trainIds]
    judgedTestFiles = [path for path in everyTestFile if extractFileId(path) in testIds]
    return judgedTrainFiles, judgedTestFiles, DTrain, RTrain, DTest, RTest


def computeSum(vector):
    """Return the sum of the values of the dict `vector` (0 when it is empty)."""
    total = 0
    for value in vector.values():
        total = total + value
    return total

def fillVector(allEntries, vector):
    """Set `vector[entry]` to 0, in place, for every entry that is missing or None."""
    missing = [entry for entry in allEntries if vector.get(entry) is None]
    for entry in missing:
        vector[entry] = 0

def normalizeDict(d):
    """Scale the values of `d` in place so that they sum to 1.

    When the values sum to 0 (for example an empty dict or all-zero scores)
    there is nothing to scale and `d` is left unchanged; this used to raise
    ZeroDivisionError.
    """
    vecSum = sum(d.values())
    if vecSum == 0:
        return
    for k in d:
        d[k] = d[k] / vecSum

        
def getRelevantNonRelevant(topic):
    """Return the ids of the test documents judged relevant and non-relevant for `topic`.

    Returns:
        tuple (relevant, nonRelevant) of lists of document ids, restricted to
        documents present in the test collection.
    """
    evaledDocs = getEvaledDocsForTopic('qrels.test', topic, 'test')

    # Look the judgement up by document id. Pairing the i-th listed file with
    # the i-th qrels line, as before, is only correct if the directory
    # listing and the qrels file happen to share the same order.
    relevanceById = {}
    with open('qrels.test', 'r') as qrels:
        for line in qrels:
            fields = line.split()
            if len(fields) >= 3 and fields[0] == topic:
                relevanceById[fields[1]] = int(fields[2])

    relevant = []
    nonRelevant = []
    for doc in evaledDocs:
        docId = extractFileId(doc)
        judgement = relevanceById.get(docId)
        if judgement == 1:
            relevant.append(docId)
        elif judgement == 0:
            nonRelevant.append(docId)
    return relevant,nonRelevant

def setupEvalManyFeatures(search_topics, DTrain, DTest,trainDocsIndex,testDocsIndex,topicsIndex):
    """Build three-feature rows for the train and test documents of each topic.

    Args:
        search_topics: topic names; scores are stored under the name as
            given, queries are looked up under its lower-case form and
            document lists under its upper-case form.
        DTrain, DTest: topic -> list of document ids.
        trainDocsIndex, testDocsIndex: objects whose
            `generate_scores(query, k=None)` returns three score mappings
            keyed by '<id>newsML.xml' (named bm25, cos and freq here).
        topicsIndex: object exposing the queries as `.topics`.

    Returns:
        tuple (trainX, testX): topic -> list of [bm25, cos, freq] rows
        aligned with the document lists. Documents without a score get
        [0, 0, 0].
    """
    topics = topicsIndex.topics

    def featureRows(docIds, scoreTables):
        rows = []
        for fileID in docIds:
            fileName = fileID + "newsML.xml"
            try:
                rows.append([table[fileName] for table in scoreTables])
            # Only a missing score (or a missing score table) falls back to
            # zeros; a bare `except:` also hid unrelated failures and
            # swallowed KeyboardInterrupt.
            except (KeyError, TypeError):
                rows.append([0, 0, 0])
        return rows

    trainScores = dict()
    testScores = dict()
    for topic in search_topics:
        trainScores[topic] = trainDocsIndex.generate_scores(topics[topic.lower()], k=None)
        testScores[topic] = testDocsIndex.generate_scores(topics[topic.lower()], k=None)

    trainX = dict()
    testX = dict()
    for topic in search_topics:
        trainX[topic] = featureRows(DTrain[topic.upper()], trainScores[topic])
        testX[topic] = featureRows(DTest[topic.upper()], testScores[topic])
    return trainX, testX

def setupEvalOneFeature(search_topics, DTrain, DTest,trainDocsIndex,testDocsIndex,topicsIndex):
    """Build single-feature (bm25) rows for the train and test documents of each topic.

    Args:
        search_topics: topic names; scores are stored under the name as
            given, queries are looked up under its lower-case form and
            document lists under its upper-case form.
        DTrain, DTest: topic -> list of document ids.
        trainDocsIndex, testDocsIndex: objects whose
            `generate_score(query, measure='bm25', k=None)` returns a score
            mapping keyed by '<id>newsML.xml'.
        topicsIndex: object exposing the queries as `.topics`.

    Returns:
        tuple (trainX, testX): topic -> list of [score] rows aligned with the
        document lists. Documents without a score get [0].
    """
    topics = topicsIndex.topics

    def featureRows(docIds, scoreTable):
        rows = []
        for fileID in docIds:
            fileName = fileID + "newsML.xml"
            try:
                rows.append([scoreTable[fileName]])
            # Only a missing score (or a missing score table) falls back to
            # zero; a bare `except:` also hid unrelated failures and
            # swallowed KeyboardInterrupt.
            except (KeyError, TypeError):
                rows.append([0])
        return rows

    train_bm25 = dict()
    test_bm25 = dict()
    for topic in search_topics:
        train_bm25[topic] = trainDocsIndex.generate_score(topics[topic.lower()], measure='bm25', k=None)
        test_bm25[topic] = testDocsIndex.generate_score(topics[topic.lower()], measure='bm25', k=None)

    trainX = dict()
    testX = dict()
    for topic in search_topics:
        trainX[topic] = featureRows(DTrain[topic.upper()], train_bm25[topic])
        testX[topic] = featureRows(DTest[topic.upper()], test_bm25[topic])
    return trainX, testX

def RRF(rankingsList):
    """Combine scores with linearly decreasing weights 3, 2, 1, 0, -1, ...

    Despite its name this returns a weighted sum of the scores, not a
    reciprocal-rank fusion: the first score is multiplied by 3, the second
    by 2, and so on.
    """
    combined = 0
    for position, score in enumerate(rankingsList):
        combined += (3 - position) * score
    return combined

def fscoreFunc(precision, recall):
    """Return the harmonic mean of precision and recall, or 0 when either is 0."""
    if precision != 0 and recall != 0:
        return 2 / ( (1/recall) + (1/precision))
    return 0
    
def BPREF(queryResult, relevant, nonRelevant):
    """Return the binary-preference score of a ranked result list.

    Every retrieved relevant document contributes
    1 - min(n, m) / m, where n is the number of judged non-relevant documents
    ranked above it and m = min(|relevant|, number of retrieved non-relevant
    documents). The contributions are summed and divided by |relevant|.

    Args:
        queryResult: ranked list of document names, best first.
        relevant: names judged relevant.
        nonRelevant: names judged non-relevant.

    Returns:
        1 when no judged non-relevant document was retrieved (kept from the
        original implementation), 0 when there is no relevant document
        (this used to raise ZeroDivisionError), otherwise the score above.
    """
    relevant = set(relevant)
    nonRelevant = set(nonRelevant)
    relevantSize = len(relevant)
    nonRelevantSize = len(nonRelevant.intersection(queryResult))

    if nonRelevantSize == 0:
        return 1
    if relevantSize == 0:
        return 0

    dividingFactor = min(relevantSize, nonRelevantSize)

    _sum = 0
    nonRelevantAbove = 0
    seen = set()
    # Walk the ranking once, counting the non-relevant documents seen so far.
    # The previous loop looked at the first i results for the i-th relevant
    # hit, which is not the prefix ending at that hit.
    for doc in queryResult:
        if doc in seen:
            continue  # a repeated name is only counted at its first rank
        seen.add(doc)
        if doc in nonRelevant:
            nonRelevantAbove += 1
        elif doc in relevant:
            _sum += 1 - min(nonRelevantAbove, dividingFactor) / dividingFactor
    return float(_sum) / relevantSize

def calcPrecision(retrieved, relevant, nonRelevant):
    """Return relevant hits / judged hits among the distinct retrieved documents.

    Retrieved documents that are neither in `relevant` nor in `nonRelevant`
    are ignored. Returns 0 when no judged document was retrieved.
    """
    retrievedSet = set(retrieved)
    hits = len(retrievedSet.intersection(relevant))
    judgedHits = hits + len(retrievedSet.intersection(nonRelevant))
    if judgedHits == 0:
        return 0
    return hits / judgedHits

def calcRecall(retrieved,relevant,nonRelevant):
    """Return the fraction of the relevant documents that were retrieved.

    Args:
        retrieved: retrieved document names.
        relevant: names judged relevant.
        nonRelevant: names judged non-relevant (not needed for recall; kept
            for signature compatibility).

    Returns:
        true positives / (true positives + false negatives), or 0 when there
        is no relevant document.
    """
    relevantSet = set(relevant)
    retrievedSet = set(retrieved)
    truePositives = len(retrievedSet & relevantSet)
    # False negatives are the relevant documents that were NOT retrieved.
    # The previous expression `set(relevant)|set(nonRelevant)-set(retrieved)`
    # applied `-` before `|`, so every relevant document was counted as a
    # false negative and a perfect result scored 0.5.
    falseNegatives = len(relevantSet - retrievedSet)
    if truePositives + falseNegatives == 0:
        return 0
    return truePositives / (truePositives + falseNegatives)

def MAP(retrieved, relevantFiles, nonRelevantFiles):
    """Return the precision summed at every relevant rank, divided by len(retrieved).

    The precision at a rank only counts judged documents (relevant or
    non-relevant) among the distinct documents retrieved up to that rank.
    Returns 0 for an empty result list.

    NOTE(review): the sum is divided by the number of retrieved documents,
    not by the number of relevant documents as average precision is usually
    defined — confirm this is intended before comparing with other systems.
    """
    k = len(retrieved)
    if(k == 0):
        return 0

    # Sets give constant-time membership tests, and running counters replace
    # the recomputation of the precision over every prefix, so the ranking is
    # walked once. The returned values are unchanged.
    relevantSet = set(relevantFiles)
    nonRelevantSet = set(nonRelevantFiles)
    seen = set()
    truePositives = 0
    falsePositives = 0
    precisionSum = 0
    for doc in retrieved:
        if doc not in seen:
            seen.add(doc)
            if doc in relevantSet:
                truePositives += 1
            if doc in nonRelevantSet:
                falsePositives += 1
        if doc in relevantSet:
            precisionSum += truePositives / (truePositives + falsePositives)
    return float(precisionSum) / float(k)

def fscore(precision, recall):
    """Return the harmonic mean of precision and recall; 0 if either one is 0."""
    bothPositive = not (precision == 0 or recall == 0)
    return 2 / ( (1/recall) + (1/precision)) if bothPositive else 0

def averagePrecision(precisionRecallCurve):
    """Return the mean of the precision values of a precision/recall curve.

    Args:
        precisionRecallCurve: pair (recallValues, precisionValues). The first
            precision value is skipped.

    Returns:
        the mean of the remaining precision values, or 0 when there are none
        (this used to raise). The previous code divided the last precision
        value, not their total, by the count.
    """
    precisions = precisionRecallCurve[1][1:]
    count = len(precisions)
    if count == 0:
        return 0
    return sum(precisions) / count

def calcPrecisionAtRecall(recall,retrieved,relevant,nonRelevant):
    """Return the precision of the first `recall` retrieved documents (1 when `recall` is 0)."""
    if recall != 0:
        return calcPrecision(retrieved[:recall], relevant, nonRelevant)
    return 1

def precisionRecallCurve(query_result,relevantFiles,nonRelevantFiles):
    """Return (x values, precision values) with one point per cut-off of the result list.

    For every cut-off c in 0..len(query_result) the x value is
    c / len(query_result) and the y value is the precision of the first c
    results. An empty result list yields two empty lists.
    """
    if not query_result:
        return [],[]

    resultSize = len(query_result)
    cutOffs = range(resultSize + 1)
    precisionAtRecall = [
        calcPrecisionAtRecall(cutOff, query_result, relevantFiles, nonRelevantFiles)
        for cutOff in cutOffs
    ]
    recallValues = [cutOff / resultSize for cutOff in cutOffs]
    return recallValues,precisionAtRecall

def plotPrecisionRecallCurve(recall,precision):
    """Show the precision/recall curve as a red step plot; print a notice if `precision` is empty."""
    if not precision:
        print("Weren't retrieved any documents with this type of ranking!")
        return None

    fig, ax = plt.subplots()
    segmentCount = len(recall) - 1
    for start in range(segmentCount):
        end = start + 1
        # vertical part of the step, then the horizontal part
        ax.plot((recall[start], recall[start]), (precision[start], precision[end]), 'k-', color='red')
        ax.plot((recall[start], recall[end]), (precision[end], precision[end]), 'k-', color='red')
    ax.set_xlabel("recall")
    ax.set_ylabel("precision")
    plt.axis([0.0,1.1,0.0,1.1])
    plt.show()

def savePrecisionRecallCurve(recall,precision,path):
    """Save the precision/recall curve as a red step plot to `path`.

    Prints a notice and saves nothing when `precision` is empty. The figure
    is closed after saving: this function is called for every topic of an
    evaluation run, and figures that are never closed stay in memory.
    """
    if not precision:
        print("Weren't retrieved any documents with this type of ranking!")
        return None
    fig, ax = plt.subplots()
    try:
        for i in range(len(recall)-1):
            # '-' instead of 'k-': the colour is given by the keyword, and
            # specifying it twice makes matplotlib warn on every call.
            ax.plot((recall[i],recall[i]),(precision[i],precision[i+1]),'-',color='red') # vertical
            ax.plot((recall[i],recall[i+1]),(precision[i+1],precision[i+1]),'-',color='red') # horizontal
        ax.set_xlabel("recall")
        ax.set_ylabel("precision")
        plt.axis([0.0,1.1,0.0,1.1])
        plt.savefig(path)
    finally:
        plt.close(fig)

def compute_metrics(docs_names, relevant, nonRelevant,k):
    """Score the first `k` entries of a ranked list of document names.

    Returns:
        tuple (precision, recall, fscore, precision/recall curve, BPREF, MAP).
    """
    topK = docs_names[:k]
    precision = calcPrecision(topK, relevant, nonRelevant)
    recall = calcRecall(topK, relevant, nonRelevant)
    harmonicMean = fscoreFunc(precision, recall)
    curve = precisionRecallCurve(topK, relevant, nonRelevant)
    bprefValue = BPREF(topK, relevant, nonRelevant)
    mapValue = MAP(topK, relevant, nonRelevant)
    return precision, recall, harmonicMean, curve, bprefValue, mapValue
    
def show_graphics_metrics_IRmodels(IRmodels,IRmodels_names,search_topics):
    """Plot the precision/recall curve of every model for every topic, then print a summary.

    Each model is a dict: `model[topic][3]` holds the (recall, precision)
    curve, and the 'Avg MAP' / 'Avg BPREF' entries hold the averages.
    """
    for topic in search_topics:
        for position, model in enumerate(IRmodels):
            print(IRmodels_names[position]," TOPIC ",topic)
            curve = model[topic][3]
            plotPrecisionRecallCurve(curve[0], curve[1])

    print("----------IR Models Summary----------\n")
    for position, model in enumerate(IRmodels):
        print(IRmodels_names[position],'\n')
        print('Avg MAP:',model['Avg MAP'], ' Avg BPREF:',model['Avg BPREF'],'\n')
    print('------------------------------------')
        
def show_metrics_classifiers(classifiers,classifiers_names,search_topics): 
    """Print the per-topic metrics of every classifier, then each classifier's 'Avg MAP'.

    Each classifier is a dict: `classifier[topic]` is a 4-tuple
    (precision, recall, f-score, average precision).
    """
    for topic in search_topics:
        for position, results in enumerate(classifiers):
            print(classifiers_names[position]," TOPIC ",topic)
            precision, recall, fValue, avgPrecision = results[topic]
            print("Precision:",precision," Recall:",recall," F-Score",fValue," MAP:",avgPrecision,'\n')

    print("---------- Classifiers Summary----------\n")
    for position, results in enumerate(classifiers):
        print(classifiers_names[position],' ','Avg MAP:',results['Avg MAP'],'\n')
    print('------------------------------------')

## Preprocess

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from string import punctuation
from autocorrect import spell

snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()

class Preprocess:
    """Text normalisation helpers built on NLTK, plus the `preprocess` pipeline."""

    def __init__(self):
        # Was misspelled `__int__`, which defined an int() conversion hook
        # instead of a constructor. The class keeps no state.
        pass

    def autospell(self,text):
        """Return `text` with every token replaced by its spelling correction."""
        spells = [spell(w) for w in (nltk.word_tokenize(text))]
        return " ".join(spells)

    def to_lower(self,text):
        """Return `text` in lower case."""
        return text.lower()

    def remove_numbers(self,text):
        """Return `text` without digit characters."""
        return ''.join(c for c in text if not c.isdigit())

    def remove_punct(self,text):
        """Return `text` without the characters of string.punctuation."""
        return ''.join(c for c in text if c not in punctuation)

    def remove_Tags(self,text):
        """Return `text` without '<...>' markup tags."""
        return re.sub('<[^<]+?>', '', text)

    def sentence_tokenize(self,text):
        """Return the sentences of `text` as a list."""
        return list(nltk.sent_tokenize(text))

    def word_tokenize(self,text):
        """Return the word tokens of `text`, tokenized sentence by sentence."""
        return [w for sent in nltk.sent_tokenize(text) for w in nltk.word_tokenize(sent)]

    def remove_stopwords(self,sentence):
        """Return `sentence` re-joined with spaces after dropping English stop words."""
        # A set makes each membership test constant-time instead of a scan of
        # the whole stop-word list.
        stop_words = set(stopwords.words('english'))
        return ' '.join([w for w in nltk.word_tokenize(sentence) if w not in stop_words])

    def stem(self,text):
        """Return `text` with every token replaced by its Snowball stem."""
        stemmed_word = [snowball_stemmer.stem(word) for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        return " ".join(stemmed_word)

    def lemmatize(self,text):
        """Return `text` with every token replaced by its WordNet lemma."""
        lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        return " ".join(lemmatized_word)

    def preprocess(self,text):
        """Return the cleaned word tokens of `text`.

        The text is lower-cased and split into sentences; each sentence is
        lemmatized, stripped of digits, punctuation and tags, filtered for
        stop words and tokenized. Tokens are returned in order.
        """
        lower_text = self.to_lower(text)
        word_list = []
        for each_sent in self.sentence_tokenize(lower_text):
            clean_text = self.lemmatize(each_sent)
            clean_text = self.remove_numbers(clean_text)
            # NOTE(review): punctuation removal already deletes '<' and '>',
            # so the tag removal on the next line can no longer match
            # anything. Swapping the two steps would change the produced
            # tokens (and existing indexes), so the order is left as is.
            clean_text = self.remove_punct(clean_text)
            clean_text = self.remove_Tags(clean_text)
            clean_text = self.remove_stopwords(clean_text)
            word_list.extend(self.word_tokenize(clean_text))
        return word_list


## Topics Parser

In [5]:
from html.parser import HTMLParser

class TopicsParser(HTMLParser):
    """Parse a tag-delimited topics file into a list of {tag: text} dictionaries.

    The text after each opening tag is stored under that tag's name. Every
    closing tag ends the current topic: its dictionary is appended to
    `topics_list` and a new one is started.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.last_tag = None     # tag whose text is currently being read
        self.topics_list=[]      # finished topics, in file order
        self.dictionary={}       # fields of the topic being read

    def handle_starttag(self, tag, attrs):
        """Remember the tag so the following text is stored under its name."""
        self.last_tag=tag

    def handle_endtag(self, tag):
        """Close the current topic and reset the state for the next one."""
        # 'top' marks "between fields": text seen in that state is ignored.
        self.last_tag='top'
        self.topics_list.append(self.dictionary.copy())
        self.dictionary.clear()

    def handle_data(self, data):
        """Store `data` under the current tag, dropping any 'Label:' prefix."""
        if self.last_tag!='top':
            if self.last_tag!='title':
                # Every field except the title starts with a label ending in
                # ':'; keep only what follows it.
                _index=data.find(':')
                if _index!=-1:
                    data=data[_index+1:].strip()

            self.dictionary[self.last_tag]=data

    def get_data(self,file_path):
        """Parse the lower-cased content of `file_path` and return a copy of the topics parsed so far."""
        # `with` closes the file; it used to be left open.
        with open(file_path,'r') as topics_file:
            self.feed(topics_file.read().lower())
        return self.topics_list.copy()

## Documents Parser

In [6]:
import xml.etree.ElementTree as ET

class DocumentParser():
    """Extract the title, headline, byline, dateline and text of an XML news document."""

    def __init__(self):
        self.tags=['title','headline','byline','dateline','text']   # direct children of the root that are kept
        self.dictionary={}                                          # fields of the last parsed document

    def parse(self,path):
        """Parse the XML file at `path` and return its fields.

        Returns:
            dict mapping each kept tag found directly under the root to its
            text. 'text' is the concatenation of its child elements' text,
            each preceded by a space. The same dict object is reused and
            cleared by every call, so copy it if it must outlive the next
            call. When the file cannot be read or parsed a message is
            printed and an empty dict is returned.
        """
        self.dictionary.clear()
        try:
            self.root = ET.parse(path).getroot()
        except (OSError, ET.ParseError):
            print("Wrong file path when parsing document!")
            # Stop here: continuing used to read the previous document's
            # root (stale content) or fail on a missing attribute.
            return self.dictionary
        for child in self.root:
            if child.tag not in self.tags:
                continue
            if child.tag=='text':
                # An empty paragraph has text None; treat it as an empty string.
                self.dictionary['text']=''.join(' '+(p.text or '') for p in child)
            else:
                self.dictionary[child.tag]=child.text
        return self.dictionary


## Whoosh Index

In [7]:
from xml_documents_parser import DocumentParser
from html_topics_parser import TopicsParser
from Preprocess import Preprocess
import nltk
import numpy as np
from whoosh.index import create_in
from whoosh.fields import Schema,TEXT
from whoosh import scoring
from whoosh.qparser import QueryParser, OrGroup
from whoosh.index import open_dir
import re
import os
from tqdm import tqdm


class TopicIndex():
    """Preprocessed query text for every topic of a topics file."""

    def __init__(self,topics_filename,dir_name):
        """Parse `topics_filename` and preprocess each topic.

        `dir_name` is accepted but not used. `self.topics` maps each topic's
        'num' field to the preprocessed tokens of its title, narrative and
        description, joined by single spaces.
        """
        self.parser = TopicsParser()
        self.preProcess = Preprocess()
        self.preprocessed = True
        self.topics_parsed = self.parser.get_data(topics_filename)
        self.topics = dict()
        for entry in self.topics_parsed:
            rawText = ' '.join((entry['title'], entry['narr'], entry['desc']))
            tokens = self.preProcess.preprocess(rawText)
            self.topics[entry['num']] = " ".join(tokens)

class WooshDocumentIndex():
    """Whoosh index over a list of XML documents, plus ranking helpers.

    Every document is stored with two TEXT fields: ``id`` (the document's
    file name) and ``content`` (the list of tokens taken from its title,
    text, byline and dateline).
    """

    # Trailing "/<name>newsML.xml" component of a document path
    _FILENAME_RE = re.compile(r'/[^/]*?newsML\.xml')
    # Parsed fields that contribute tokens, in indexing order
    _INDEXED_FIELDS = ('title', 'text', 'byline', 'dateline')

    def __init__(self,load,dir_name,files):
        """Open an existing index or build a new one.

        Args:
            load: when truthy, open the index already stored in ``dir_name``;
                otherwise create it there and index ``files``.
            dir_name: directory holding the index (created when missing).
            files: sequence of document paths; only read when building.
        """
        self.preProcess=Preprocess()
        self.documentParser=DocumentParser()
        self.preprocessed=True
        if load:
            self.ix=open_dir(dir_name)
            return
        os.makedirs(dir_name, exist_ok=True)
        schema = Schema(id = TEXT(stored=True), content=TEXT(stored=True))
        self.ix = create_in(dir_name, schema)
        self.index(files)

    def _tokenize(self, text):
        """Turn one field's text into a list of tokens."""
        if self.preprocessed:
            return self.preProcess.preprocess(text)
        return nltk.word_tokenize(text.lower())

    def _document_id(self, path):
        """Return the file-name part of ``path`` used as the document id."""
        match = self._FILENAME_RE.search(path)
        if match:
            # Drop the leading "/" captured by the pattern
            return match.group()[1:]
        # Paths that do not match the pattern (e.g. no "/" before the name)
        # fall back to the basename rather than passing None as the id.
        return os.path.basename(path)

    def index(self,files):
        """Parse every path in ``files`` and add it to the index.

        The writer is used as a context manager so that it commits once all
        documents were added and is cancelled (releasing the index lock) when
        a document fails.
        """
        with self.ix.writer() as writer:
            for path in tqdm(files):
                parsedDoc = self.documentParser.parse(path)
                words = []
                for field in self._INDEXED_FIELDS:
                    value = parsedDoc.get(field)
                    if value is not None:
                        words += self._tokenize(value)
                writer.add_document(id=self._document_id(path),content=words)

    def rank(self,query_str,weighting=None,k=None):
        ''' Perform a query using the weighting scoring function and obtain the corresponding textual similarity score.

        Args:
            query_str: query text, parsed against the ``content`` field with
                OR semantics between terms.
            weighting: whoosh scoring object; a fresh ``scoring.BM25F()`` is
                used when omitted (instead of one instance shared by all calls).
            k: forwarded to whoosh as the result ``limit``.

        Returns:
            dict mapping each hit's stored ``id`` to its score.
        '''
        if weighting is None:
            weighting = scoring.BM25F()
        ix = self.ix

        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("content", ix.schema, group=OrGroup).parse(query_str)
            results = searcher.search(query,limit=k)
            return {hit['id']: results.score(pos) for pos, hit in enumerate(results)}

    def generate_scores(self,query,k=None):
        '''Generate scores for a given query according to BM25, TF IDF (under a cosine similarity) and Frequency rank functions.

        Returns:
            Tuple ``(bm25, cos, freq)`` of id -> score dicts.
        '''
        bm25 = self.rank(query,weighting=scoring.BM25F(),k=k)
        cos = self.rank(query,weighting=scoring.TF_IDF(),k=k)
        freq = self.rank(query,weighting=scoring.Frequency(),k=k)
        return bm25,cos,freq

    def generate_score(self,query,measure,k=None):
        '''Generate scores for a given query according to a given measure.

        Args:
            query: query text.
            measure: one of ``'bm25'``, ``'cos'`` or ``'freq'``.
            k: forwarded to ``rank``.

        Raises:
            ValueError: if ``measure`` is not one of the supported names
                (previously this surfaced as an UnboundLocalError).
        '''
        if measure == 'bm25':
            weighting = scoring.BM25F()
        elif measure == 'cos':
            weighting = scoring.TF_IDF()
        elif measure == 'freq':
            weighting = scoring.Frequency()
        else:
            raise ValueError(
                "Unknown measure %r; expected 'bm25', 'cos' or 'freq'" % (measure,))
        return self.rank(query,weighting=weighting,k=k)

    def score(self,n_docs,n_rel,bm25,cos,freq,pagerank,alpha_1=3,alpha_2=3,alpha_3=1,alpha_4=1):
        """Combine the individual rankings into one weighted score per document.

        Args:
            n_docs, n_rel: accepted for interface compatibility; not used.
            bm25, cos, freq: id -> score dicts; the ids of ``bm25`` drive the
                result.
            pagerank: id -> prior dict; ids missing from it (or mapped to
                ``None``) get no prior term.
            alpha_1..alpha_4: weights of bm25, cos, freq and pagerank.

        Returns:
            dict id -> combined score for every id present in ``bm25``.
        """
        scores = dict()
        for doc_id, bm25_score in bm25.items():
            # A document can be missing from cos/freq when each ranking was
            # truncated to its own top-k; treat the missing score as 0 rather
            # than failing with KeyError.
            combined = (alpha_1*bm25_score
                        + alpha_2*cos.get(doc_id, 0)
                        + alpha_3*freq.get(doc_id, 0))
            prior = pagerank.get(doc_id)
            if prior is not None:
                combined += alpha_4 * prior
            scores[doc_id] = combined

        return scores

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression  
from sklearn.metrics import precision_recall_fscore_support,average_precision_score
from util import compute_metrics, RRF

class LogisticClassifier():
    """Logistic-regression classifier for one topic, with retrieval evaluation."""

    def __init__(self,**args):
        """Build the underlying model.

        Keyword Args:
            hyper_parameters: optional dict with keys ``'solver'``, ``'C'``
                and ``'penalty'``; when absent an ``lbfgs`` model with the
                library defaults is created.
        """
        params = args.get('hyper_parameters')
        if params is None:
            self.clf = LogisticRegression(random_state=1, solver='lbfgs',
                                          multi_class='multinomial')
            return
        solver = params['solver']
        C = params['C']
        penalty = params['penalty']
        self.clf = LogisticRegression(random_state=1, solver=solver, C=C,
                                      penalty=penalty, multi_class='multinomial')

    def train(self,topic,DTrain,RTrain,**kwargs):
        """Fit the model on the training samples and labels of ``topic``."""
        self.clf.fit(DTrain[topic], RTrain[topic])

    def classify(self,doc,topic,**kwargs):
        """Return the class-probability vector predicted for a single ``doc``."""
        probabilities = self.clf.predict_proba([doc])
        return probabilities[0]

    def evaluate(self,topic,DTest,RTest,**kwargs):
        """Evaluate the fitted model on ``topic``.

        Args:
            topic: topic identifier used to index the per-topic containers.
            DTest: per-topic document names, aligned with ``testX[topic]``.
            RTest: per-topic true labels.

        Keyword Args:
            k: cut-off forwarded to ``compute_metrics`` and used to truncate
                the non-probability ranking.
            testX: per-topic feature rows.
            ranking_type: ``'proba'`` ranks all documents by the probability
                of class 1; anything else ranks the positively classified
                documents by ``RRF`` of their probability row.
            relevant, nonRelevant: judgements forwarded to ``compute_metrics``.

        Returns:
            Tuple ``(aidedRanked, aidedNonRanked, classifier_metrics)``. The
            first two are the six values returned by ``compute_metrics``; the
            last is ``[precision, recall, fscore, average precision]``.
        """
        k = kwargs.get('k')
        testX = kwargs.get('testX')
        ranking_type = kwargs.get('ranking_type')
        relevant = kwargs.get('relevant')
        nonRelevant = kwargs.get('nonRelevant')

        def retrieval_metrics(doc_names):
            # Six values, in the order compute_metrics returns them
            precision, recall, fscore_val, pr_curve, bpref, avg_prec = compute_metrics(
                doc_names, relevant, nonRelevant, k)
            return [precision, recall, fscore_val, pr_curve, bpref, avg_prec]

        # --- Classifier quality ---
        samples = testX[topic]
        truth = RTest[topic]
        y_scores = self.clf.predict_proba(samples)
        y_pred = self.clf.predict(samples)
        clf_precision, clf_recall, clf_fscore, _support = precision_recall_fscore_support(
            truth, y_pred, average='macro', zero_division=1)
        avg_prec_score = average_precision_score(truth, y_scores[:,1])
        classifier_metrics = [clf_precision, clf_recall, clf_fscore, avg_prec_score]

        # --- Binary retrieval: keep documents whose class-1 probability exceeds 0.5 ---
        positive_class_predicted = [
            (proba, name) for proba, name in zip(y_scores, DTest[topic])
            if proba[1] > 0.5
        ]
        positive_names = [name for _, name in positive_class_predicted]
        aidedNonRanked = retrieval_metrics(positive_names)

        # --- Extension towards ranking ---
        if ranking_type == 'proba':
            # All documents, most probable positives first
            ordered = sorted(zip(y_scores, DTest[topic]),
                             key=lambda pair: pair[0][1], reverse=True)
            ranked_names = [name for _, name in ordered]
        else:
            # Only the predicted positives, ordered by RRF and cut at k
            ordered = sorted(positive_class_predicted,
                             key=lambda pair: RRF(pair[0]), reverse=True)
            ranked_names = [pair[1] for pair in ordered[:k]]

        aidedRanked = retrieval_metrics(ranked_names)
        return aidedRanked, aidedNonRanked, classifier_metrics

## XGBoost

In [9]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support,average_precision_score
from statistics import mean
from util import getRelevantNonRelevant
from util import compute_metrics, RRF

class XGBOOSTClassifier():
    """XGBoost classifier trained per topic, with retrieval evaluation.

    Features are standardised with a ``StandardScaler`` fitted on the training
    data of the topic; the same fitted scaler is reused when classifying and
    evaluating so the model always sees features on the scale it was trained
    on.
    """

    def __init__(self):
        self.clf = XGBClassifier(n_estimators=100,max_depth=100)
        # Scaler fitted by the latest successful train(); None until then
        self.scaler = None

    def train(self,topic,DTrain,RTrain,**kwargs):
        """Fit the scaler and the model on the training data of ``topic``.

        Raises whatever ``XGBClassifier.fit`` raises; ``evaluate`` relies on
        ``ValueError`` being raised for unusable training data.
        """
        DTrain=DTrain[topic]
        RTrain=RTrain[topic]
        scaler = StandardScaler()
        DTrain = scaler.fit_transform(DTrain)
        self.clf.fit(DTrain,RTrain)
        # Publish the scaler only after the model was fitted with it, so the
        # pair (scaler, clf) always belongs to the same topic.
        self.scaler = scaler

    def classify(self,doc,topic,**kwargs):
        """Return the class-probability vector predicted for a single ``doc``."""
        features = [doc]
        if self.scaler is not None:
            # Apply the training-time scaling; the original passed raw features
            features = self.scaler.transform(features)
        return self.clf.predict_proba(features)[0]

    def _retrieval_metrics(self, doc_names, relevant, nonRelevant, k):
        """Return the six values of ``compute_metrics`` as a list."""
        precision,recall,fscoreVal,precision_recall_curve,bpref,avg_prec=compute_metrics(
            doc_names, relevant, nonRelevant, k)
        return [precision,recall,fscoreVal,precision_recall_curve,bpref,avg_prec]

    def evaluate(self,topics,DTest,RTest,**kwargs):
        """Train and evaluate one model per topic.

        Args:
            topics: iterable of topic identifiers.
            DTest: per-topic document names, aligned with ``testX[topic]``.
            RTest: per-topic true labels.

        Keyword Args:
            RTrain: per-topic training labels.
            k: cut-off forwarded to ``compute_metrics`` and used to truncate
                the non-probability ranking.
            trainX, testX: per-topic feature rows.
            ranking_type: ``'proba'`` ranks all documents by the probability
                of class 1; anything else ranks the positively classified
                documents by ``RRF`` of their probability row.

        Returns:
            Tuple ``(classifier_metrics, nonAidedIROutput, aidedRanked,
            aidedNonRanked)`` of dicts keyed by topic. The three retrieval
            dicts additionally hold ``'Mean MAP'`` and ``'Mean MBPREF'``;
            ``classifier_metrics`` additionally holds ``'Mean MAP'``.
        """
        avg_prec_scores = []
        RTrain=kwargs.get('RTrain')
        k = kwargs.get('k')
        trainX=kwargs.get('trainX')
        testX=kwargs.get('testX')
        ranking_type=kwargs.get('ranking_type')
        # Metrics data structures
        aidedRanked=dict()
        aidedNonRanked=dict()
        nonAidedIROutput = dict()
        classifier_metrics=dict()
        # Evaluation for each topic
        for topic in topics:
            relevant,nonRelevant=getRelevantNonRelevant(topic)
            # Non-aided IR: rank every document by RRF of its feature row
            ranked_docs_names=[name for score, name in sorted(zip(testX[topic],DTest[topic]),
                                key=lambda pair: RRF(pair[0]),reverse=True)]
            baseline = self._retrieval_metrics(ranked_docs_names, relevant, nonRelevant, k)
            nonAidedIROutput[topic] = baseline
            # Training the classifier
            try:
                self.train(topic,trainX,RTrain)
            except ValueError:
                print("For topic ", topic
                      ,"the classifier needs samples of at least 2 classes in the data"
                      , "but the data contains only one class: 1")
                #the behaviour with the classifier would not change since there is no information in the data
                aidedRanked[topic]=list(baseline)
                aidedNonRanked[topic]=list(baseline)
                avg_prec_scores += [0.5]#no information in the classifier
                continue
            # Evaluating the classifier. The test features are transformed with
            # the scaler fitted in train(); fitting a new scaler on the test
            # data (as the original did) shifts the features relative to what
            # the model learned.
            docs = self.scaler.transform(testX[topic])
            feedback=RTest[topic]
            y_scores=self.clf.predict_proba(docs)
            y_pred=self.clf.predict(docs)
            precision,recall,fscore,_support = precision_recall_fscore_support(feedback, y_pred
                                                    , average='macro',zero_division=1)
            avg_prec_scores += [average_precision_score(feedback,y_scores[:,1])]
            classifier_metrics[topic]=[precision,recall,fscore]
            # Binary retrieval: keep documents whose class-1 probability exceeds 0.5
            positive_class_predicted=[doc for doc in zip(y_scores,DTest[topic]) if doc[0][1]>0.5]
            aided_non_ranked_docs_names=[doc[1] for doc in positive_class_predicted]
            aidedNonRanked[topic]=self._retrieval_metrics(
                aided_non_ranked_docs_names, relevant, nonRelevant, k)
            # Extension towards ranking
            if ranking_type=='proba':#sorts according to probabilities
                aided_ranked_docs_names=[x for _, x in sorted(zip(y_scores,DTest[topic]),
                                                        key=lambda pair: pair[0][1],reverse=True)]
            else:#sort according to the score of the docs classified as positive
                aided_ranked_docs_names=[doc[1] for doc in sorted(positive_class_predicted,
                                               key=lambda x:RRF(x[0]),reverse=True)[:k]]
            # Evaluating aided IR
            aidedRanked[topic]=self._retrieval_metrics(
                aided_ranked_docs_names, relevant, nonRelevant, k)

        # Means over topics; both are computed before either summary key is
        # inserted so the summaries never feed into each other.
        for output in (nonAidedIROutput, aidedRanked, aidedNonRanked):
            per_topic = list(output.values())
            mean_map = mean([metrics[5] for metrics in per_topic])
            mean_bpref = mean([metrics[4] for metrics in per_topic])
            output['Mean MAP'] = mean_map
            output['Mean MBPREF'] = mean_bpref
        classifier_metrics['Mean MAP']=mean(avg_prec_scores)

        return classifier_metrics, nonAidedIROutput,aidedRanked,aidedNonRanked

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression  
from scipy.stats import loguniform

def hyper_parameter_search(classifier_type,trainX,RTrain):
    """Run a randomized hyper-parameter search per topic.

    Args:
        classifier_type: only ``'logistic'`` triggers a search; any other
            value yields an empty result.
        trainX: per-topic training features; its keys are the topics visited.
        RTrain: per-topic training labels.

    Returns:
        dict mapping each searched topic to the best parameters found.
        Topic ``'R175'`` is always skipped.
        NOTE(review): the reason for skipping it is not visible here - confirm.
    """
    hyper_params = {}
    # Search space sampled by RandomizedSearchCV
    space = {
        'solver': ['lbfgs'],
        'penalty': ['none', 'l2'],
        'C': loguniform(1e-5, 100),
    }
    model = LogisticRegression(multi_class='multinomial')
    # Topic-conditional search: one independent search per topic
    for topic in trainX:
        if topic == 'R175' or classifier_type != 'logistic':
            continue
        search = RandomizedSearchCV(model, space, n_iter=100, scoring='accuracy',
                                    n_jobs=-1, random_state=1)
        result = search.fit(trainX[topic], RTrain[topic])
        print('Best Score: %s' % result.best_score_)
        print('Best Hyperparameters: %s' % result.best_params_)
        hyper_params[topic] = result.best_params_
    return hyper_params