In [35]:
import json
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [36]:
dataset_path = './extended-features/'

In [78]:
def load_X(dataset_path, reduced=False):
    print("Loading data (feature representation X, and feature names)...")
    # Load the reduced 10k features
    if reduced:
        with open('{}/X-10k.p'.format(dataset_path), 'rb') as f:
            X = pickle.load(f)
        with open('{}/f-10k.p'.format(dataset_path), 'rb') as f:
            feature_names = pickle.load(f)
    else:
        with open('{}/X.json'.format(dataset_path), 'r') as f:
            X = json.load(f)

        # Convert to numpy array and get feature names
        vec = DictVectorizer()
        X = vec.fit_transform(X).astype("float32")
        feature_names = vec.get_feature_names_out()

    return X, feature_names


def load_y(dataset_path):
    print('Loading labels...')
    with open('{}y.json'.format(dataset_path), 'rt') as f:
        y = json.load(f)
    y = np.asarray(y)
    return y


def load_metadata(dataset_path):
    print('Loading metadata...')
    with open('{}meta.json'.format(dataset_path), 'rt') as f:
        metadata = json.load(f)
    return metadata

In [79]:
# Flag reduced=True loads only the top 10k features; reduced=False loads the entire dataset of featues
X, feature_names = load_X(dataset_path, reduced=True)
y = load_y(dataset_path)
metadata = load_metadata(dataset_path)

Loading data (feature representation X, and feature names)...
Loading labels...
Loading metadata...


In [61]:
X

<259230x10000 sparse matrix of type '<class 'numpy.float32'>'
	with 5320754 stored elements in Compressed Sparse Row format>

In [62]:
metadata

[{'dex_date': '2016-04-05T17:58:46',
  'markets': 'anzhi',
  'analysis_engines': [],
  'user_id': 1,
  'sha256': '0000003B455A6C7AF837EF90F2EAFFD856E3B5CF49F5E27191430328DE2FA670',
  'dex_size': 4765888,
  'vercode': '121',
  'submission_date': 1519654338,
  'sha1': '9C14D537A7ADB4CFC43D291352F73E05E0CCDD4A',
  'vt_detection': 0,
  'apk_size': 10386469,
  'pkg_name': 'com.zte.bamachaye',
  'dex_date_quarter': '2016-Q2',
  'tags': ['androzoo'],
  'sample_path': '/media/nas/datasets/android/samples/Androzoo/0/0/0/0000003B455A6C7AF837EF90F2EAFFD856E3B5CF49F5E27191430328DE2FA670.apk',
  'vt_scan_date': 1466004404,
  'md5': '3EDFC78AB53521942798AD551027D04F'},
 {'dex_date': '2016-01-19T13:28:36',
  'markets': 'play.google.com',
  'analysis_engines': [],
  'user_id': 1,
  'sha256': '0000143EF8D00E3A65C5C8C380221D00678FED906FDC2EBC483D1987457C7B2B',
  'dex_size': 1862492,
  'vercode': '70101',
  'submission_date': 1519654338,
  'sha1': 'DAF886288EB27F9C0B866EB19A357E1E866AF4DC',
  'vt_detecti

In [57]:
a = pd.Series(y)
a.value_counts()

0    232843
1     26387
Name: count, dtype: int64

In [60]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Machine learning model: Linear SVM (liblinear implementation)
model = LinearSVC(C=1, dual=True, max_iter=10000)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Print the F1 score
print("F1 score:", f1)

F1 score: 0.8608780487804877


In [82]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)

In [65]:
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
X

array([[2., 0., 1.],
       [0., 1., 3.]])

In [73]:
v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
                           {'baz': 1.0, 'foo': 3.0}]

True

In [74]:
v.transform({'foo': 4, 'unseen_feature': 3})

array([[0., 0., 4.]])

In [75]:
X

array([[2., 0., 1.],
       [0., 1., 3.]])

In [84]:
v.inverse_transform(X)

AttributeError: 'DictVectorizer' object has no attribute 'feature_names_'