In [4]:
import os
import pickle
import main
from datasketch import MinHash, MinHashLSH

In [5]:
DATA_DIR = 'datasets/trec07p/data/'
LABELS_FILE = 'datasets/trec07p/full/index'
TRAINING_SET_RATIO = 0.5

In [6]:
# чтение меток
labels = {}
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [7]:
# Разделение массива на тренировочный и тестовый
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

In [8]:
# извлечение только спама для вставки в детектор совпадений LSH
spam_files = [x for x in X_train if labels[x] == 0]

In [9]:
# Инициализация детектора совпадений MinHashLSH с помощью порогового значения меры сходства Жаккарда (0.5) и 128 функции перестановок
lsh = MinHashLSH(threshold=0.5, num_perm=128)

In [10]:
# Заполнение детектора совпадений LSH значениям тренировочного набора спам-данных MinHash
for idx, f in enumerate(spam_files):
    minhash = MinHash(num_perm=128)
    stems = main.load(os.path.join(DATA_DIR, f))
    if len(stems) < 2: continue
    for s in stems:
        minhash.update(s.encode('utf-8'))
    lsh.insert(f, minhash)

In [11]:
# присваивание прогнозируемых меток детектора совпадений LSH данным тестового набора
def lsh_predict_label(stems):
    '''
    Queries the LSH matcher and returns:
        0 if predicted spam
        1 if predicted ham
       -1 if parsing error
    '''
    minhash = MinHash(num_perm=128)
    if len(stems) < 2:
        return -1
    for s in stems:
        minhash.update(s.encode('utf-8'))
    matches = lsh.query(minhash)
    if matches:
        return 0
    else:
        return 1

In [12]:
fp = 0
tp = 0
fn = 0
tn = 0

for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        label = labels[filename]
        stems = main.load(path)
        if not stems:
            continue
        pred = lsh_predict_label(stems)
        if pred == -1:
            continue
        elif pred == 0:
            if label == 1:
                fp = fp + 1
            else:
                tp = tp + 1
        elif pred == 1:
            if label == 1:
                tn = tn + 1
            else:
                fn = fn + 1

In [13]:
conf_matrix = [[tn, fp],
               [fn, tp]]
print(conf_matrix)

[[12498, 118], [4130, 17851]]


In [14]:
count = tn + tp + fn + fp
percent_matrix = [["{:.1%}".format(tn/count), "{:.1%}".format(fp/count)],
                  ["{:.1%}".format(fn/count), "{:.1%}".format(tp/count)]]
print(percent_matrix)

[['36.1%', '0.3%'], ['11.9%', '51.6%']]


In [15]:
print("Classification accuracy: {}".format("{:.1%}".format((tp+tn)/count)))

Classification accuracy: 87.7%
