In [4]:
from parseUtils import parseDocs

tagDict, tagList, textList, namesList = parseDocs()

with open('stop_words.txt') as sw:
    stop_words = [line.rstrip('\n') for line in sw]

with open('important_words.txt') as iw:
    imp_words = {line.rstrip('\n') for line in iw}

In [5]:
trainNum = 3684
testNumStart = 3685
testNumFinish = len(textList)

In [6]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

stemmer = SnowballStemmer('russian')
tkzr = CountVectorizer().build_tokenizer()

stem_tokenize = lambda tokens: [stemmer.stem(item) for item in tokens]
tokenize = lambda text: stem_tokenize(tkzr(text))

prepoc = Pipeline([('count', CountVectorizer(tokenizer=tokenize,
                                             max_df=0.8,
                                             min_df=0.01,
                                             ngram_range=(1, 2),
                                             stop_words = stop_words)),
                   ('tfidf', TfidfTransformer())])

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

X_train = prepoc.fit_transform(textList)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(tagList)

In [8]:
from sklearn.feature_selection import (chi2, SelectKBest)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf = Pipeline([('chi2', SelectKBest(chi2, k = 'all')),
                ('svm', LinearSVC(random_state=0,
                                  class_weight='balanced',
                                  C=2.0))])

classer = OneVsRestClassifier(clf, n_jobs=1).fit(X_train[:trainNum, :],
                                                y[:trainNum, :])

In [9]:
import evaluation
out = ("/Volumes/Media/Documents/Git/MachineLearning/src"
            "/main/resources/classifierOuts/")
evaluation.eval(out_path = out, 
                classifier = classer, 
                mlb = mlb,
                x_train = X_train,
                tagList = tagList,
                namesList = namesList).evalTrainer(testNumStart, testNumFinish)


macro_precision = 0.6536373507057546
micro_precision = 0.82829373650108
recall = 0.7541789577187807
F1 = 0.7895007720020586
macro_correct = 602
micro_correct = 767
false_neg_tags = 250
false_pos_tags = 159
total_test_docs = 921
n_of_pred_tags = 926
n_of_relevant_tags = 1017
