In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string


class Preprocess():
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def lowercasing(self, data):
        # print(data.head())
        x = pd.Series(data)
        # print(x.head())
        return x.str.lower()

    def remove_punc(self, data):
        x = pd.Series(data)
        x = data.str.replace('[^\w\s]', '')
        return x

    def remove_stopwords(self, data):
        temp = pd.Series(data).astype('str')
        stop = stopwords.words('english')
        temp1 = temp.str.split()
        temp = temp1.apply(lambda x: ' '.join(
            word for word in x if word not in (stop)))
        return temp

    def stemming(self, data):
        stemmer = PorterStemmer()
        temp = pd.Series(data).astype('str')
        temp = temp.str.split()
        temp = temp.apply(lambda x: ' '.join(stemmer.stem(word)
                                             for word in x))
        return (temp)

    def start(self, data):
        x = self.lowercasing(data)
        x = self.remove_punc(x)
        x = self.remove_stopwords(x)
        x = self.stemming(x)
        return x


class Preprocess_query():
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def lowercasing(self, query):
        return query.lower()

    def remove_punc(self, query):
        x = query.translate(str.maketrans("", "", string.punctuation))
        return x

    def remove_stopwords(self, query):
        stop = stopwords.words('english')
        temp1 = query.split()
        result = ' '.join(word for word in temp1 if word not in stop)
        return result

    def stemming(self, query):
        stemmer = PorterStemmer()
        temp = query.split()
        result = ' '.join(stemmer.stem(word) for word in temp)
        return result

    def start(self, query):
        x = self.lowercasing(query)
        x = self.remove_punc(x)
        x = self.remove_stopwords(x)
        x = self.stemming(x)
        return x


In [2]:
from preprocess_data import Preprocess, Preprocess_query
import pandas as pd
from rank_bm25 import BM25Plus
import heapq
import itertools
import pandas as pd
import time
import numpy as np
import multiprocessing

global top_n_doc


def jaccard_co(ti, tj):
    base = r'^{}'
    expr = '(?=.*{})'
    exij = base.format(''.join(expr.format(w) for w in [ti, tj]))
    exi = base.format(expr.format(ti))
    exj = base.format(expr.format(tj))
    dij = top_n_doc.str.contains(exij, regex=True).sum()
    di = top_n_doc.str.contains(exi, regex=True).sum()
    dj = top_n_doc.str.contains(exj, regex=True).sum()
    jc = dij / ((di + dj) - dij)
    return jc


    # x = str(tmp_df.iloc[2, ])
    # q_terms.append(tmp_df.loc[0, 'Ti'])
    # print(x+'-agawq')


In [3]:
r = Preprocess()
rq = Preprocess_query()

    # import files
file_q = 'query_corpus.csv'
file_all = 'document_corpus.csv'
file_qrel = 'qrels.csv'
q = pd.read_csv(file_q)
al = pd.read_csv(file_all)
qr = pd.read_csv(file_qrel)
    # s_al = pd.Series(al)

# copy to new array
qs = q['Questions']
a = al['Content']
new_a = r.start(a)

    # splitting array
token_a = new_a.str.split()

    # initialize bm25+ retrieval model
bm25p = BM25Plus(token_a)

    # get query from list of query
inq = qs.loc[2]

    # preprocess query
tokenized_inq = rq.start(inq).split()

    # get bm25+ score
doc_scores = bm25p.get_scores(tokenized_inq)

    # get top n documents
n = 100
top_idx = heapq.nlargest(n, range(len(doc_scores)), doc_scores.__getitem__)
top_n_doc = a[top_idx]
print(inq)
print(tokenized_inq)

What is information science?  Give definitions where possible. 
['inform', 'scienc', 'give', 'definit', 'possibl']


In [4]:
top_n_doc.head()

1349       The different explanations of the nature of...
545        Discusses the various explicit and implicit...
617     This book sheds light on basic problems, princ...
1347       A definition of informatics is given, its m...
1237      Developing from the definitions of the conce...
Name: Content, dtype: object

In [5]:
    # get top n docs and split
top_n_doc = top_n_doc.str.lower()
top_n_doc = r.remove_punc(top_n_doc)
top_n_doc = r.remove_stopwords(top_n_doc)
top_n_doc = r.stemming(top_n_doc)
    # splitted = splitted.str.split()
top_n_doc.head()

1349    differ explan natur inform problem face scienc...
545     discuss variou explicit implicit definit infor...
617     book shed light basic problem principl result ...
1347    definit informat given method subject discuss ...
1237    develop definit concept languag terminolog rep...
Name: Content, dtype: object

In [6]:
    # built term pool
term_pool = list(itertools.chain.from_iterable(top_n_doc.str.split()))
    # print(splitted)
term_pool = pd.DataFrame(term_pool, index=None, columns=['pool'])
term_pool = term_pool.drop_duplicates()
term_pool = term_pool["pool"]
term_pool[0:5]

0     differ
1     explan
2      natur
3     inform
4    problem
Name: pool, dtype: object

In [7]:
    # split query
q_terms = inq.lower()
q_terms = rq.remove_punc(q_terms)
q_terms = rq.remove_stopwords(q_terms)
q_terms = rq.stemming(q_terms)
q_terms = q_terms.split()

tmp = list(itertools.product(term_pool, q_terms))
df = pd.DataFrame(tmp, columns=['Ti', 'Tj'])
df.head()

Unnamed: 0,Ti,Tj
0,differ,inform
1,differ,scienc
2,differ,give
3,differ,definit
4,differ,possibl


In [8]:
df['jaccard'] = ""
df['jaccard'] = df.apply(
        lambda row: jaccard_co(row['Ti'], row['Tj']), axis=1)
    # df['jaccard'] = jaccard_co(df['Ti'], df['Tj'])

In [31]:
df.head()

Unnamed: 0,Ti,Tj,jaccard
0,differ,inform,0.2
1,differ,scienc,0.151515
2,differ,give,0.269231
3,differ,definit,0.181818
4,differ,possibl,0.180328


In [32]:
sorted_df = df[df.Ti != df.Tj]
sorted_df = sorted_df.sort_values(by=['jaccard'],ascending=False).reset_index(drop=True)
sorted_df.head()

Unnamed: 0,Ti,Tj,jaccard
0,finit,definit,1.0
1,form,inform,0.869048
2,defin,definit,0.828571
3,c,inform,0.73
4,n,inform,0.73


In [33]:
tmp_list = sorted_df.Ti.tolist()
tmp_list[0:2]

['finit', 'form']

In [42]:
tmp_tokenized_inq  = tokenized_inq + tmp_list[5:10]
tmp_tokenized_inq

['inform', 'scienc', 'give', 'definit', 'possibl', 'r', 'b', 'te', 'ie', 'al']

In [43]:
    # get bm25+ score
new_doc_scores = bm25p.get_scores(tmp_tokenized_inq)

    # get top n documents
n = 10
new_top_idx = heapq.nlargest(n, range(len(new_doc_scores)), doc_scores.__getitem__)
new_top_n_doc = a[new_top_idx]

In [44]:
tmp_tokenized_inq

['inform', 'scienc', 'give', 'definit', 'possibl', 'r', 'b', 'te', 'ie', 'al']

In [45]:
new_top_n_doc

1349       The different explanations of the nature of...
545        Discusses the various explicit and implicit...
617     This book sheds light on basic problems, princ...
1347       A definition of informatics is given, its m...
1237      Developing from the definitions of the conce...
1298       The possibilities are discussed of a univer...
938       The term 'informatics' was first advanced fo...
101        It is pointed out that if information scien...
1307    This book generalizes world and soviet experie...
191       In a university, the mode of research is usu...
Name: Content, dtype: object

In [46]:
top_n_doc[0:10]

1349    differ explan natur inform problem face scienc...
545     discuss variou explicit implicit definit infor...
617     book shed light basic problem principl result ...
1347    definit informat given method subject discuss ...
1237    develop definit concept languag terminolog rep...
1298    possibl discuss univers definit concept thesau...
938     term informat first advanc formal director vin...
101     point inform scienc consid true scienc similar...
1307    book gener world soviet experi scienc give ori...
191     univers mode research usual call pure basic re...
Name: Content, dtype: object