In [21]:
import os
import pickle
import main
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [22]:
DATA_DIR = 'datasets/trec07p/data/'
LABELS_FILE = 'datasets/trec07p/full/index'
TRAINING_SET_RATIO = 0.5

In [23]:
labels = {}
spam_words = set()
ham_words = set()

In [24]:
# чтение меток
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [33]:
# Разделение массива на тренировочный и тестовый
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

if not os.path.exists('blacklist.pkl'):
    for filename in X_train:
        path = os.path.join(DATA_DIR, filename)
        if filename in labels:
            label = labels[filename]
            stems = main.load(path)
            if not stems:
                continue
            if label == 1:
                ham_words.update(stems)
            elif label == 0:
                spam_words.update(stems)
            else:
                continue
    blacklist = spam_words - ham_words
    pickle.dump(blacklist, open('blacklist.pkl', 'wb'))
else:
    blacklist = pickle.load(open('blacklist.pkl', 'rb') )
print('Blacklist of {} tokens is ready'.format(len(blacklist)))

Blacklist of 97598 tokens is ready


In [26]:
nltk.download('words')
from nltk.corpus import words
word_set = set(words.words())
word_set.intersection(blacklist)

[nltk_data] Downloading package words to
[nltk_data]     /Users/veronikagavrilova/nltk_data...
[nltk_data]   Package words is already up-to-date!


{'longhair',
 'adroit',
 'percussionist',
 'airfield',
 'demigod',
 'agal',
 'suzerain',
 'dotal',
 'leapt',
 'perfectionist',
 'cyp',
 'makeshift',
 'sportswear',
 'ropewalk',
 'daffodil',
 'whooper',
 'hoi',
 'tule',
 'horticulturist',
 'urchin',
 'plastisol',
 'statolith',
 'blest',
 'bub',
 'perfecto',
 'untrain',
 'dickcissel',
 'georgic',
 'weasel',
 'caracol',
 'oast',
 'tobacconist',
 'thill',
 'adjunct',
 'astern',
 'fiefdom',
 'genuflect',
 'zag',
 'doorman',
 'hatbox',
 'camelopard',
 'bevel',
 'zoologist',
 'vireo',
 'gelatin',
 'throve',
 'kiosk',
 'pall',
 'checkup',
 'carotenoid',
 'bedstead',
 'jacana',
 'casein',
 'bypath',
 'revivalist',
 'thine',
 'nominal',
 'disinfect',
 'clink',
 'ammono',
 'pidan',
 'pupa',
 'gopher',
 'tope',
 'sori',
 'bouncer',
 'stilt',
 'scull',
 'unlatch',
 'vestal',
 'quorum',
 'impregn',
 'toldo',
 'struthioniform',
 'yelp',
 'tum',
 'vortex',
 'homonym',
 'steamer',
 'bade',
 'kat',
 'veranda',
 'underfoot',
 'viceroy',
 'chaw',
 'pizzer

In [27]:
fp = 0
tp = 0
fn = 0
tn = 0

for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        label = labels[filename]
        stems = main.load(path)
        if not stems:
            continue
        stems_set = set(stems)
        if stems_set & blacklist:
            if label == 1:
                fp = fp + 1
            else:
                tp = tp + 1
        else:
            if label == 1:
                tn = tn + 1
            else:
                fn = fn + 1

In [31]:
conf_matrix = [[tn, fp],
               [fn, tp]]
print(conf_matrix)

[[11916, 700], [9071, 13084]]


In [32]:
count = tn + tp + fn + fp
percent_matrix = [["{:.1%}".format(tn/count), "{:.1%}".format(fp/count)],
                  ["{:.1%}".format(fn/count), "{:.1%}".format(tp/count)]]
print(percent_matrix)

[['34.3%', '2.0%'], ['26.1%', '37.6%']]


In [30]:
print("Classification accuracy: {}".format("{:.1%}".format((tp+tn)/count)))

Classification accuracy: 71.9%
