In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

class BM25Transformer(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    use_idf : boolean, optional (default=True)
    k1 : float, optional (default=2.0)
    b : float, optional (default=0.75)
    References
    ----------
    Okapi BM25: a non-binary model - Introduction to Information Retrieval
    http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
    """
    def __init__(self, use_idf=True, k1=2.0, b=0.75):
        self.use_idf = use_idf
        self.k1 = k1
        self.b = b

    def fit(self, X):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            idf = np.log((n_samples - df + 0.5) / (df + 0.5))
            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
        return self

    def transform(self, X, copy=True):
        """
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            document-term matrix
        copy : boolean, optional (default=True)
        """
        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating):
            # preserve float family dtype
            X = sp.csr_matrix(X, copy=copy)
        else:
            # convert counts or binary occurrences to floats
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_samples, n_features = X.shape

        # Document length (number of terms) in each row
        # Shape is (n_samples, 1)
        dl = X.sum(axis=1)
        # Number of non-zero elements in each row
        # Shape is (n_samples, )
        sz = X.indptr[1:] - X.indptr[0:-1]
        # In each row, repeat `dl` for `sz` times
        # Shape is (sum(sz), )
        # Example
        # -------
        # dl = [4, 5, 6]
        # sz = [1, 2, 3]
        # rep = [4, 5, 5, 6, 6, 6]
        rep = np.repeat(np.asarray(dl), sz)
        # Average document length
        # Scalar value
        avgdl = np.average(dl)
        # Compute BM25 score only for non-zero elements
        data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
        X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # *= doesn't work
            X = X * self._idf_diag

        return X

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
from scipy import sparse
import pandas as pd

In [3]:
%matplotlib inline

In [4]:
counts_tr = sparse.load_npz('../data/processed/counts_tr.npz')
counts_tst = sparse.load_npz('../data/processed/counts_tst.npz')
counts_tr

<1335x821657 sparse matrix of type '<class 'numpy.uint8'>'
	with 13551451 stored elements in Compressed Sparse Row format>

In [5]:
df_tr = pd.read_csv('../data/processed/meta_tr.csv', index_col=0)
df_tst = pd.read_csv('../data/processed/meta_tst.csv', index_col=0)

malwares_tr = (df_tr.label == 'class1').values
malwares_tst = (df_tst.label == 'class1').values

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.metrics import confusion_matrix

## using tfidf

In [6]:
tfidf_tf = TfidfTransformer()

In [7]:
tfidf_tr = tfidf_tf.fit_transform(counts_tr)
tfidf_tst = tfidf_tf.transform(counts_tst)

tfidf_tr

<1335x779804 sparse matrix of type '<class 'numpy.float64'>'
	with 13349667 stored elements in Compressed Sparse Row format>

In [11]:
lr_tfidf = LogisticRegression()
lr_tfidf.fit(tfidf_tr, malwares_tr)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
lr_tfidf.score(tfidf_tr, malwares_tr)

0.9700374531835206

In [13]:
lr_tfidf.score(tfidf_tst, malwares_tst)

0.9535580524344569

In [14]:
confusion_matrix(malwares_tst, lr_tfidf.predict(tfidf_tst))

array([[626,  45],
       [ 17, 647]])

In [15]:
from sklearn.feature_selection import SelectFromModel

In [16]:
sfm = SelectFromModel(lr_tfidf, prefit=True, max_features=100)

# sfm.fit(tfidf_tr, malwares)
n_features = sfm.transform(tfidf_tr).shape[1]

n_features

100

In [17]:
lr_new = LogisticRegression()
lr_new.fit(sfm.transform(tfidf_tr), malwares_tr)
lr_new.score(sfm.transform(tfidf_tr), malwares_tr), lr_new.score(sfm.transform(tfidf_tst), malwares_tst)

(0.9191011235955057, 0.9078651685393259)

## using counts

In [18]:
lr_counts = LogisticRegression()

In [19]:
lr_counts.fit(counts_tr, malwares_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
lr_counts.score(counts_tr, malwares_tr)

1.0

In [21]:
lr_counts.score(counts_tst, malwares_tst)

0.9647940074906367

In [22]:
confusion_matrix(malwares_tst, lr_counts.predict(counts_tst))

array([[643,  28],
       [ 19, 645]])

In [23]:
sfm = SelectFromModel(lr_counts, prefit=True, max_features=100)

# sfm.fit(counts_tr, malwares)
n_features = sfm.transform(counts_tr).shape[1]

n_features

100

In [24]:
lr_new = LogisticRegression()
lr_new.fit(sfm.transform(counts_tr), malwares_tr)
lr_new.score(sfm.transform(counts_tr), malwares_tr), lr_new.score(sfm.transform(counts_tst), malwares_tst)

(1.0, 0.9595505617977528)

## using BM25

In [157]:
bm = BM25Transformer()  # k1=1.2

bm_tr = bm.fit_transform(counts_tr)
bm_tst = bm.transform(counts_tst)

bm_tr

<1335x821657 sparse matrix of type '<class 'numpy.float64'>'
	with 13551451 stored elements in Compressed Sparse Row format>

In [158]:
lr_bm = LogisticRegression(solver='sag')
lr_bm.fit(bm_tr, malwares_tr)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [159]:
lr_bm.score(bm_tr, malwares_tr)

0.997003745318352

In [160]:
lr_bm.score(bm_tst, malwares_tst)

0.9707865168539326

In [161]:
confusion_matrix(malwares_tst, lr_bm.predict(bm_tst))

array([[637,  29],
       [ 10, 659]])

In [162]:
lr_bm.coef_

array([[ 0.02207132, -0.00186249, -0.00175572, ...,  0.00015595,
         0.00021944,  0.00023527]])

In [163]:
from sklearn.feature_selection import SelectFromModel

In [179]:
sfm = SelectFromModel(lr_bm, prefit=True, max_features=1000)
# sfm = SelectFromModel(lr_bm, prefit=True, threshold=0.012)

# sfm.fit(counts_tr, malwares)
n_features = sfm.transform(counts_tr).shape[1]

n_features, lr_bm.coef_[0, sfm.get_support()].max()

(1000, 0.022071319970081413)

In [180]:
lr_new = LogisticRegression(C=1)
lr_new.fit(sfm.transform(bm_tr), malwares_tr)
lr_new.score(sfm.transform(bm_tr), malwares_tr), lr_new.score(sfm.transform(bm_tst), malwares_tst)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(1.0, 0.9857677902621723)

In [181]:
confusion_matrix(malwares_tst, lr_new.predict(sfm.transform(bm_tst)))

array([[655,  11],
       [  8, 661]])

In [182]:
sfm.get_support()

array([ True, False, False, ..., False, False, False])

In [183]:
A_tr = sparse.load_npz('../data/processed/A_tr.npz')
B_tr = sparse.load_npz('../data/processed/B_tr.npz')
P_tr = sparse.load_npz('../data/processed/P_tr.npz')
A_tst = sparse.load_npz('../data/processed/A_tst.npz')

A_tr = sparse.csr_matrix(A_tr, dtype='uint32')
A_tst = sparse.csr_matrix(A_tst, dtype='uint32')

In [184]:
from sklearn.svm import SVC

In [185]:
apis = np.random.choice(700000, replace=False, size=500000)
apis = sfm.get_support()

In [186]:
apis.sum()

1000

In [187]:
A_tr = A_tr[:, apis]
B_tr = B_tr[apis, :][:, apis]
P_tr = P_tr[apis, :][:, apis]
A_tst = A_tst[:, apis]

In [25]:
sparse.save_npz('../data/processed/A_reduced_tr.npz', A_tr)
sparse.save_npz('../data/processed/B_reduced_tr.npz', B_tr)
sparse.save_npz('../data/processed/P_reduced_tr.npz', P_tr)
sparse.save_npz('../data/processed/A_reduced_tst.npz', A_tst)

In [188]:
svm = SVC(kernel='precomputed')

gram_train = (A_tr * A_tr.T).todense()

svm.fit(gram_train, malwares_tr)
svm.score(gram_train, malwares_tr)

0.9992509363295881

In [189]:
gram_test = (A_tst * A_tr.T).todense()

svm.score(gram_test, malwares_tst)

0.9835205992509364

In [190]:
svm = SVC(kernel='precomputed')

gram_train = (A_tr * B_tr * P_tr * B_tr * A_tr.T).todense()

svm.fit(gram_train, malwares_tr)
svm.score(gram_train, malwares_tr)

1.0

In [191]:
gram_test = (A_tst * B_tr * P_tr * B_tr * A_tr.T).todense()

svm.score(gram_test, malwares_tst)

0.9805243445692884

In [192]:
svm = SVC(kernel='precomputed')

gram_train = (A_tr * P_tr * A_tr.T).todense()

svm.fit(gram_train, malwares_tr)
svm.score(gram_train, malwares_tr)

0.9992509363295881

In [193]:
gram_test = (A_tst * P_tr * A_tr.T).todense()

svm.score(gram_test, malwares_tst)

0.9850187265917603

In [194]:
svm = SVC(kernel='precomputed')

gram_train = (A_tr * B_tr * A_tr.T).todense()

svm.fit(gram_train, malwares_tr)
svm.score(gram_train, malwares_tr)

0.8374531835205993

In [195]:
gram_test = (A_tst * B_tr * A_tr.T).todense()

svm.score(gram_test, malwares_tst)

0.8426966292134831

In [117]:
np.random.choice([1, 2], random_state=0)

TypeError: choice() got an unexpected keyword argument 'random_state'