In [1]:
import pandas as pd
import cupy
import cupyx
import numpy as np

In [2]:
# All three tables live in the same HDF5 file under separate keys; read them
# in the order val, train, candid.
val, train, candid = (
    pd.read_hdf('cleaned2.h5', split_key)
    for split_key in ('val', 'train', 'candid')
)

In [3]:
# Show the record at row position 9 of `val` as a plain dict.
val.iloc[9].to_dict()

{'description_id': '001184',
 'description_text': 'Thus, in addition to prospective motion-reduction techniques, retrospective QC remains necessary to rule out distortion due to motion artifacts (Blumenthal et al., [**##**]; Gedamu, 2011). ([**##**]) also found that there was a dose-dependent effect of motion artifacts and estimated GM volume loss, with mild motion associated with 4%, moderate motion associated with 7%, and severe motion associated with 27% reduction of total GM.'}

In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """Re-weight a document-term matrix with the Okapi BM25 formula.

    Every stored (non-zero) entry ``tf`` of a row with length ``dl`` becomes

        tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))

    and, when ``use_idf`` is True, each column is additionally multiplied by
    the IDF weight learned in ``fit``.

    Parameters
    ----------
    use_idf : boolean, optional (default=True)
        Learn per-term IDF weights in ``fit`` and apply them in ``transform``.
    k1 : float, optional (default=2.0)
        Weight of the length-normalisation term in the denominator above.
    b : float, optional (default=0.75)
        Blend between no length normalisation (``b=0`` makes the bracket 1)
        and full normalisation (``b=1`` makes it ``dl / avgdl``).

    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X, y=None):
        """Learn the IDF weight of every term (only when ``use_idf`` is True).

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        y : ignored
            Accepted only so the estimator can be driven through
            ``fit_transform(X, y)`` and scikit-learn pipelines.

        Returns
        -------
        self
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            # This IDF variant is negative for any term that occurs in more
            # than half of the documents (df > n_samples / 2).
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            # Stored as a diagonal matrix so transform() can scale the columns
            # with a single sparse product.
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """Apply BM25 weighting to a document-term matrix.

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
            Forwarded to the ``scipy.sparse.csr_matrix`` constructor used to
            normalise the input.

        Returns
        -------
        sparse CSR matrix, [n_samples, n_features]

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``use_idf`` is True and ``fit`` has not been called.
        ValueError
            If ``use_idf`` is True and ``X`` has a different number of
            features than the matrix seen in ``fit``.
        """
        # `np.float` was removed in NumPy 1.24; `np.floating` is the abstract
        # parent of all float dtypes, which is what "float family" means here.
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_features = X.shape[1]

        # Validate before doing any work so a mismatch fails fast.
        if self.use_idf:
            # `msg` is keyword-only in current scikit-learn releases.
            check_is_fitted(self, '_idf_diag', msg='idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))

        # Document length (sum of the term weights) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of stored elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times so it lines up with X.data
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl).ravel(), sz)
        # Average document length (scalar).  Note that it is computed from the
        # matrix passed to transform(), not remembered from fit().
        avgdl = np.average(dl)
        # Compute BM25 score only for the stored elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            # Sparse matrix product with the diagonal IDF matrix scales each
            # column by its IDF weight (in-place `*=` doesn't work here).
            X = X * self._idf_diag

        return X

In [5]:
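# Run this notebook twice (once for train, once for val), changing the three places marked "change train or val!!!!".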

In [6]:
# Queries are the description texts of the selected split.
query = val['description_text'] # change train or val!!!!
# Each candidate document is its title, journal, keywords and abstract joined
# into one string.  The separator must be a space: joining with '' fuses the
# last word of one field with the first word of the next, which produces
# bogus tokens once the text is vectorised.
doc = candid['title'] + ' ' + \
      candid['journal'] + ' ' + \
      candid['keywords'] + ' ' + \
      candid['abstract']
# Documents first, then queries.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because later cells read it.
all = pd.concat([doc, query])

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit() and then transform() on the same corpus tokenises every
# document twice.
cnt_vectorizer = CountVectorizer()
cnt_all = cnt_vectorizer.fit_transform(all)
# IDF statistics are learned over documents and queries together.
bm25 = BM25Transformer().fit(cnt_all)


In [8]:
# Count terms for each side with the shared vocabulary ...
cnt_query, cnt_doc = (
    cnt_vectorizer.transform(texts) for texts in (query, doc)
)
# ... then re-weight both count matrices with the fitted BM25 model.
bm25_query, bm25_doc = (
    bm25.transform(counts) for counts in (cnt_query, cnt_doc)
)



In [11]:
def cal_bm25_rank(step=150, topk=10000):
    """Rank the candidate documents for every query by BM25 dot-product score.

    Reads the module-level sparse matrices ``bm25_query`` and ``bm25_doc``
    and scores the queries on GPU device 0 in batches, because the dense
    (batch, n_docs) score matrix for all queries at once would be very large.

    Parameters
    ----------
    step : int, optional (default=150)
        Number of query rows scored per batch.
    topk : int, optional (default=10000)
        Number of best-scoring document indices kept per query.

    Returns
    -------
    numpy.ndarray, [n_queries, min(topk, n_docs)]
        Row ``i`` holds row positions into ``bm25_doc``, best score first,
        for query ``i``.
    """
    n_queries = bm25_query.shape[0]
    ans = []
    with cupy.cuda.Device(0):
        # Upload the transposed document matrix once; every batch reuses it.
        tf_bm25_doc = cupyx.scipy.sparse.csr_matrix(bm25_doc.T)

        def cal_a_query(start):
            """Return the top-``topk`` document indices for one query batch."""
            tf_bm25_query = cupyx.scipy.sparse.csr_matrix(bm25_query[start:start + step, :])
            # (batch, n_features) x (n_features, n_docs) -> (batch, n_docs)
            c = tf_bm25_query * tf_bm25_doc

            # Release the batch's GPU buffers before densifying the scores.
            del tf_bm25_query
            cupy.get_default_memory_pool().free_all_blocks()

            # Negate so the ascending argsort lists the highest scores first.
            c = cupy.argsort(-c.todense())[:, :topk]
            return cupy.asnumpy(c)

        # change train or val!!!! -- nothing to edit here any more: the loop
        # bound follows whichever split `bm25_query` was built from.
        for i in range(0, n_queries, step):
            ans.append(cal_a_query(i))
            if i % 3000 == 0:
                print(i)
    if not ans:
        # No queries: np.vstack([]) would raise, so return an empty result
        # with the same column count a non-empty run would have.
        return np.empty((0, min(topk, bm25_doc.shape[0])), dtype=np.intp)
    return np.vstack(ans)
# Score all query batches on GPU device 0; a progress counter is printed along the way.
bm25_rank = cal_bm25_rank()

0
3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000


In [12]:
# Sanity-check the dimensions: (rows ranked, document indices kept per row).
bm25_rank.shape

(34428, 10000)

In [13]:
# Write the rank matrix to disk as a .npy file.  The output filename has to be
# edited by hand when switching between the train and val runs.
np.save('test_recall2.npy', bm25_rank) # change train or val!!!!