# Определение тональности текста

Используется выборка с отзывами на фильмы [sentence polarity dataset](http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz).
В ней содержатся предложения с отрицательными и положительными отзывами (по 5331).

In [1]:
import os
import re
import collections, itertools, functools
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

# Relative directory that holds the unpacked sentence polarity dataset.
POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
# Text files with the positive and the negative review sentences; getWords
# reads them line by line.
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')

from __future__ import division

Изначальная модель: наивный Байес. Каждому предложению сопоставляется бинарный bag-of-words вектор.

In [2]:
def use_all(words):
    """Return a binary bag-of-words feature dict for ``words``.

    Every distinct token becomes a key mapped to ``True``; repeated tokens
    collapse into a single entry.
    """
    return dict.fromkeys(words, True)
 
def getWords(file_path, type_of_feature=None, feature_select=None, apply_features=True):
    """Tokenize every line of ``file_path``, optionally building labelled examples.

    A token is either a run of word characters and apostrophes or a single
    punctuation mark out of ``. , ! ? ;``. Trailing whitespace of each line is
    stripped before tokenizing.

    Args:
        file_path: Path of the text file; it is processed line by line.
        type_of_feature: Label stored next to each feature set (the callers in
            this file pass ``'pos'`` or ``'neg'``). Ignored when
            ``apply_features`` is false.
        feature_select: Callable that maps a token list to a feature set.
            Required when ``apply_features`` is true.
        apply_features: When true, each entry of the result is the two-item
            list ``[feature_select(tokens), type_of_feature]``; when false,
            each entry is the raw token list.

    Returns:
        A list with one entry per line of the file.

    Raises:
        TypeError: If ``apply_features`` is true but ``feature_select`` is not
            callable.
    """
    # Fail before touching the file: with the default arguments the call would
    # otherwise die mid-read with "'NoneType' object is not callable".
    if apply_features and not callable(feature_select):
        raise TypeError(
            'feature_select must be callable when apply_features is true, '
            'got %r' % (feature_select,))

    # Compile the pattern once instead of resolving it again for every line.
    tokenize = re.compile(r"[\w']+|[.,!?;]").findall

    with open(file_path, 'r') as sentences:
        tokenized = [tokenize(sentence.rstrip()) for sentence in sentences]

    if not apply_features:
        return tokenized
    return [[feature_select(words), type_of_feature] for words in tokenized]

def evaluate(feature_selector, show_features=False):
    """Train a Naive Bayes classifier on the polarity files and print test metrics.

    The leading three quarters of each class (in file order) form the training
    set and the remainder the test set. Accuracy over the test set and the
    precision/recall of the ``'pos'`` label are printed.

    Args:
        feature_selector: Callable turning a token list into a feature dict;
            it is forwarded to ``getWords`` as ``feature_select``.
        show_features: When true, also print the classifier's most informative
            features.
    """
    pos_features = getWords(RT_POLARITY_POS_FILE, 'pos', feature_select=feature_selector)
    neg_features = getWords(RT_POLARITY_NEG_FILE, 'neg', feature_select=feature_selector)

    # Split every class by its own size; reusing the positive cutoff for the
    # negatives only works while both files have the same number of lines.
    pos_cutoff = len(pos_features) * 3 // 4
    neg_cutoff = len(neg_features) * 3 // 4

    train_features = neg_features[:neg_cutoff] + pos_features[:pos_cutoff]
    test_features = neg_features[neg_cutoff:] + pos_features[pos_cutoff:]
    clf = NaiveBayesClassifier.train(train_features)

    # Index sets of test examples, grouped by true label and by predicted label.
    actual_labels = collections.defaultdict(set)
    pred_labels = collections.defaultdict(set)
    for num, (features, label) in enumerate(test_features):
        actual_labels[label].add(num)
        pred_labels[clf.classify(features)].add(num)

    # A single pre-formatted string prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('accuracy: %s' % nltk.classify.util.accuracy(clf, test_features))
    print('precision: %s' % nltk.metrics.precision(actual_labels['pos'], pred_labels['pos']))
    print('recall: %s' % nltk.metrics.recall(actual_labels['pos'], pred_labels['pos']))
    if show_features:
        clf.show_most_informative_features()
 


In [3]:
evaluate(use_all)

accuracy: 0.773068267067
precision: 0.787066246057
recall: 0.748687171793


Попробуем повысить качество модели. Вначале уберем стоп-слова

In [4]:
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
 
def stopword_filtered_word_feats(words):
    """Return binary bag-of-words features for ``words`` without stop words.

    A token is dropped when it is a member of the module-level ``stopset``;
    every remaining distinct token maps to ``True``.
    """
    kept = (token for token in words if token not in stopset)
    return dict.fromkeys(kept, True)
 
evaluate(stopword_filtered_word_feats)

accuracy: 0.764066016504
precision: 0.765060240964
recall: 0.762190547637


Accuracy и precision снизились (recall немного вырос), следовательно, стоп-слова помогают в определении тональности. Попробуем добавить в признаки самые значимые биграммы.

In [5]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=50):
    """Return binary features for the tokens plus their best-scoring bigrams.

    Args:
        words: Tokens of one sentence.
        score_fn: Association measure handed to the collocation finder's
            ``nbest``.
        n: Number of bigrams requested from ``nbest``.

    Returns:
        A dict mapping every token and every selected bigram to ``True``.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)

    # Unigram keys first, then the selected bigrams.
    feats = dict.fromkeys(words, True)
    for pair in top_bigrams:
        feats[pair] = True
    return feats

evaluate(bigram_word_feats)

accuracy: 0.781695423856
precision: 0.791311093871
recall: 0.765191297824


Качество немного выросло. Теперь попробуем понизить размерность признакового пространства, отобрав самые информативные слова по критерию хи-квадрат.

In [6]:
def get_best_words(num=10000):
    """Return the ``num`` words with the highest chi-square score against the labels.

    Tokens of both polarity files are lower-cased and counted overall and per
    label. A word's score is the sum of its chi-square statistic with respect
    to the ``'pos'`` label and with respect to the ``'neg'`` label.

    NOTE: the counts are gathered from the complete files, i.e. they include
    the sentences that ``evaluate`` holds out for testing.

    Args:
        num: Maximum number of words to keep.

    Returns:
        A set of lower-cased words.
    """
    word_freq = FreqDist()
    label_freq = ConditionalFreqDist()

    # Same counting for both files, positives first.
    labelled_files = (('pos', RT_POLARITY_POS_FILE), ('neg', RT_POLARITY_NEG_FILE))
    for label, path in labelled_files:
        for sentence in getWords(path, label, apply_features=False):
            for token in sentence:
                word = token.lower()
                word_freq[word] += 1
                label_freq[label][word] += 1

    pos_word_count = label_freq['pos'].N()
    neg_word_count = label_freq['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    for word, freq in word_freq.items():
        # chi_sq arguments: count of the word under the label,
        # (count of the word overall, token count of the label), all tokens.
        pos_score = BigramAssocMeasures.chi_sq(label_freq['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_freq['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num]
    return set(word for word, _ in best)
 
def best_word_feats(words, bestwords):
    """Return binary features for the tokens of ``words`` found in ``bestwords``.

    The membership test uses each token exactly as given (no case folding);
    every distinct matching token maps to ``True``.
    """
    selected = (token for token in words if token in bestwords)
    return dict.fromkeys(selected, True)
 
evaluate(functools.partial(best_word_feats, bestwords=get_best_words()))

accuracy: 0.846586646662
precision: 0.867834394904
recall: 0.817704426107


Теперь качество улучшилось более значительно, посмотрим на ключевые слова, позволяющие определить тональность.

In [7]:
evaluate(functools.partial(best_word_feats, bestwords=get_best_words()), show_features=True)

accuracy: 0.846586646662
precision: 0.867834394904
recall: 0.817704426107
Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                   quiet = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
               absorbing = True              pos : neg    =     13.0 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
                   flaws = True              pos : neg    =     12.3 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
                 triumph = True              pos : neg    =     11.7 : 1.0
            refreshingly = True              pos : neg    =     11.7 : 1.0


Обучим несколько линейных моделей, используя бинарный bag-of-words с отбором самых значимых слов

In [8]:
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from scipy.sparse import vstack

def get_feature_matrix(filenames):
    """Vectorize both polarity files over the words chosen by ``get_best_words``.

    Args:
        filenames: Sequence whose item 0 is the positive file and item 1 the
            negative file.

    Returns:
        A pair ``(pos_rows, neg_rows)``: the rows of the ``DictVectorizer``
        output that belong to the first and to the second file respectively.
    """
    vocabulary = get_best_words()
    pos_sentences = getWords(filenames[0], apply_features=False)
    neg_sentences = getWords(filenames[1], apply_features=False)

    # One binary feature dict per sentence, positives first.
    rows = []
    for tokens in itertools.chain(pos_sentences, neg_sentences):
        rows.append({token: True for token in tokens if token in vocabulary})

    matrix = DictVectorizer().fit_transform(rows)
    n_pos = len(pos_sentences)
    return matrix[:n_pos], matrix[n_pos:]

# Feature rows of the positive and of the negative sentences.
pos_X, neg_X = get_feature_matrix([RT_POLARITY_POS_FILE, RT_POLARITY_NEG_FILE])

# Targets: 1.0 for every positive row, 0.0 for every negative row.
pos_Y, neg_Y = np.ones(pos_X.shape[0]), np.zeros(neg_X.shape[0])

# The leading three quarters of each class are used for training; the split
# point is derived from the positive class and applied to both.
cutoff = pos_X.shape[0] * 3 // 4

train_X = vstack([pos_X[:cutoff], neg_X[:cutoff]])
train_Y = np.concatenate([pos_Y[:cutoff], neg_Y[:cutoff]])

test_X = vstack([pos_X[cutoff:], neg_X[cutoff:]])
test_Y = np.concatenate([pos_Y[cutoff:], neg_Y[cutoff:]])

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import grid_search
from sklearn.linear_model import SGDClassifier

def benchmark(clf_class, params, name):
    """Grid-search an estimator on the training split and report on the test split.

    Fits ``GridSearchCV`` on the module-level ``train_X``/``train_Y``, prints
    the best estimator found, then prints a classification report for its
    predictions on ``test_X`` against ``test_Y``.

    Args:
        clf_class: Estimator handed to ``GridSearchCV``; the calls in this file
            pass an instance rather than a class.
        params: Parameter grid for ``GridSearchCV``.
        name: Not used by the function body.
    """
    clf = grid_search.GridSearchCV(clf_class, params)
    clf.fit(train_X, train_Y)

    print(clf.best_estimator_)
    # On Python 2 a bare ``print()`` is the print statement applied to an
    # empty tuple and emits "()"; an explicit empty string gives the intended
    # blank line on both Python 2 and 3.
    print('')
    pred = clf.predict(test_X)
    print(classification_report(test_Y, pred))

# Hyper-parameter grid searched for SGDClassifier.
parameters = {
    'loss': ['hinge', 'log', 'squared_hinge', 'perceptron'],
    'penalty': ['l1','l2', 'elasticnet'],
    'alpha': [1e-5, 1e-3, 2e-2, 1e-1],
    # NOTE(review): l1_ratio presumably only matters for penalty='elasticnet',
    # so the 'l1'/'l2' candidates would be refit once per value here - confirm
    # against the scikit-learn documentation.
    'l1_ratio': [1e-3, 1e-2, 1e-1]
}


# Fixed random_state so repeated runs of the search use the same seed.
benchmark(SGDClassifier(n_iter=20,  random_state=0), parameters, 'SGD')

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.001,
       learning_rate='optimal', loss='hinge', n_iter=20, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=0, shuffle=True,
       verbose=0, warm_start=False)
()
             precision    recall  f1-score   support

        0.0       0.81      0.78      0.79      1333
        1.0       0.79      0.81      0.80      1333

avg / total       0.80      0.79      0.79      2666



In [10]:
from sklearn.linear_model import LogisticRegression
# Hyper-parameter grid searched for LogisticRegression: penalty type and
# values of C from 1e-4 up to 1e3.
parameters_lr = {
    'penalty' : ['l1', 'l2'],
    'C' : [1e-4, 1e-3, 1e-1, 10, 100, 1e3]
}
# Same train/test split and reporting as the SGD run above.
benchmark(LogisticRegression(random_state=0), parameters_lr, 'LogisticRegression')

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0)
()
             precision    recall  f1-score   support

        0.0       0.82      0.78      0.80      1333
        1.0       0.79      0.83      0.81      1333

avg / total       0.81      0.81      0.81      2666



In [11]:
from sklearn.svm import LinearSVC

# Hyper-parameter grid searched for LinearSVC: penalty type and values of C
# from 1e-8 up to 1e6, with the loss fixed to squared hinge.
parameters_svm = {
    'penalty': [ 'l1', 'l2'],
    'loss': ['squared_hinge'],
    'C': [1e-8, 1e-6, 1e-4, 1e-2, 1e0, 1e2, 1e4, 1e6]
}
# NOTE(review): dual=False is presumably required for the 'l1' penalty to be
# accepted - confirm against the scikit-learn LinearSVC documentation.
benchmark(LinearSVC(random_state=0, dual=False), parameters_svm, 'LinearSVC')

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)
()
             precision    recall  f1-score   support

        0.0       0.82      0.77      0.79      1333
        1.0       0.78      0.83      0.80      1333

avg / total       0.80      0.80      0.80      2666



Получили примерно одно и то же качество для всех моделей, немного уступающее наивному байесовскому классификатору.