# Анализ тональности отзывов

Сначала возьмем выборку отзывов на фильмы из NLTK:

In [11]:
from nltk.corpus import movie_reviews
import nltk
# File ids the corpus reader returns for 'neg' and for 'pos'.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# Peek at the first five negative ids. The parenthesised single-argument
# form prints the same text on Python 2 and also parses on Python 3.
print(negids[:5])

[u'neg/cv000_29416.txt', u'neg/cv001_19502.txt', u'neg/cv002_17424.txt', u'neg/cv003_12683.txt', u'neg/cv004_12641.txt']


Приготовим список текстов и классов как обучающую выборку:

In [12]:
def _joined_words(fileid):
    """Return the words of one corpus file joined by single spaces."""
    return " ".join(movie_reviews.words(fileids=[fileid]))


# One space-joined text per review id.
negfeats = [_joined_words(f) for f in negids]
posfeats = [_joined_words(f) for f in posids]

# Negative texts come first, so the labels are built in the same order:
# 0 for every entry of negfeats, then 1 for every entry of posfeats.
texts = negfeats + posfeats
labels = [0] * len(negfeats)
labels.extend([1] * len(posfeats))

In [15]:
# Bare expression as the last statement of a cell: the notebook displays
# the list of negative file ids built earlier.
negids

[u'neg/cv000_29416.txt',
 u'neg/cv001_19502.txt',
 u'neg/cv002_17424.txt',
 u'neg/cv003_12683.txt',
 u'neg/cv004_12641.txt',
 u'neg/cv005_29357.txt',
 u'neg/cv006_17022.txt',
 u'neg/cv007_4992.txt',
 u'neg/cv008_29326.txt',
 u'neg/cv009_29417.txt',
 u'neg/cv010_29063.txt',
 u'neg/cv011_13044.txt',
 u'neg/cv012_29411.txt',
 u'neg/cv013_10494.txt',
 u'neg/cv014_15600.txt',
 u'neg/cv015_29356.txt',
 u'neg/cv016_4348.txt',
 u'neg/cv017_23487.txt',
 u'neg/cv018_21672.txt',
 u'neg/cv019_16117.txt',
 u'neg/cv020_9234.txt',
 u'neg/cv021_17313.txt',
 u'neg/cv022_14227.txt',
 u'neg/cv023_13847.txt',
 u'neg/cv024_7033.txt',
 u'neg/cv025_29825.txt',
 u'neg/cv026_29229.txt',
 u'neg/cv027_26270.txt',
 u'neg/cv028_26964.txt',
 u'neg/cv029_19943.txt',
 u'neg/cv030_22893.txt',
 u'neg/cv031_19540.txt',
 u'neg/cv032_23718.txt',
 u'neg/cv033_25680.txt',
 u'neg/cv034_29446.txt',
 u'neg/cv035_3343.txt',
 u'neg/cv036_18385.txt',
 u'neg/cv037_19798.txt',
 u'neg/cv038_9781.txt',
 u'neg/cv039_5963.txt',
 u'neg/

In [14]:
# Show the text stored at index 1999 of `texts`.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(texts[1999])

truman ( " true - man " ) burbank is the perfect name for jim carrey ' s character in this film . president truman was an unassuming man who became known worldwide , in spite of ( or was it because of ) his stature . " truman " also recalls an era of plenty following a grim war , an era when planned communities built by government scientists promised an idyllic life for americans . and burbank , california , brings to mind the tonight show and the home of nbc . if hollywood is the center of the film world , burbank is , or was , the center of tv ' s world , the world where our protagonist lives . combine all these names and concepts into " truman burbank , " and you get something that well describes him and his artificial world . truman leads the perfect life . his town , his car , and his wife are picture perfect . his idea of reality comes under attack one day when a studio light falls from the sky . the radio explains that an overflying airplane started coming apart . . . but then w

Импортируем нужные нам модули:

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline



### Оценка качества работы разных классификаторов

In [10]:
def text_classifier(vectorizer, transformer, classifier):
    """Assemble a three-step text classification pipeline.

    Args:
        vectorizer: Object used as the first step, named "vectorizer".
        transformer: Object used as the second step, named "transformer".
        classifier: Object used as the last step, named "classifier".

    Returns:
        A ``Pipeline`` built from the three steps in that order.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [11]:
# Score three classifiers on the same CountVectorizer -> TfidfTransformer
# features. Each class is instantiated with its default arguments.
# print(...) is used so the cell also parses on Python 3; with a single
# argument it prints the same text on Python 2.
for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
    print(clf)
    # Mean of the per-fold scores returned by cross_val_score.
    print(cross_val_score(
        text_classifier(CountVectorizer(), TfidfTransformer(), clf()),
        texts,
        labels,
    ).mean())
    print("\n")

<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.813511115906


<class 'sklearn.svm.classes.LinearSVC'>
0.845507183831


<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>




0.828496160831




### Подготовка классификатора, обученного на всех данных

In [12]:
# Final model: TfidfVectorizer features feeding a LinearSVC.
clf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LinearSVC()),
])

# Fit on every available text; nothing is held out here.
clf_pipeline.fit(texts, labels)

# print(...) with one argument works on both Python 2 and Python 3.
print(clf_pipeline)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_id...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])


In [13]:
# Try the fitted pipeline on two hand-written reviews. The strings are kept
# exactly as written (typos included) because they are the model's input.
# print(...) with one argument works on both Python 2 and Python 3.
print(clf_pipeline.predict([
    "Amazing film! I will advice it to all my friends. Genious",
    "Awful film! The man who advised me to watch it is really crazy idiot.",
]))

[1 0]


## Понижение размерности и ансамбли деревьев

In [14]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD

# Count-vectorize all texts.
v = CountVectorizer()
mx = v.fit_transform(texts)

# Reduce the count matrix to 10 components with TruncatedSVD.
mf = TruncatedSVD(n_components=10)
u = mf.fit_transform(mx)

Wall time: 6.55 s


In [15]:
# Compare two 10-component decompositions as the middle pipeline step,
# with CountVectorizer in front and LinearSVC as the classifier.
# print(...) is used so the cell also parses on Python 3; with a single
# argument it prints the same text on Python 2.
for transform in [TruncatedSVD, NMF]:
    print(transform)
    # Mean of the per-fold scores returned by cross_val_score.
    print(cross_val_score(
        text_classifier(CountVectorizer(), transform(n_components=10), LinearSVC()),
        texts,
        labels,
    ).mean())
    print("\n")


<class 'sklearn.decomposition.truncated_svd.TruncatedSVD'>
0.512500524477


<class 'sklearn.decomposition.nmf.NMF'>
0.643008277739







Если задать `n_components=1000`:

In [16]:
%%time
# TfidfVectorizer -> TruncatedSVD(1000 components) -> LinearSVC; prints the
# mean of the per-fold scores.
# print(...) with one argument works on both Python 2 and Python 3.
print(cross_val_score(
    text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), LinearSVC()),
    texts,
    labels,
).mean())

0.843009176842
Wall time: 5min 36s


## Ансамбли деревьев на преобразованных признаках

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [18]:
%%time
# CountVectorizer -> TruncatedSVD(100 components) -> random forest of 100
# trees. Built with text_classifier, which produces the same three step
# names ("vectorizer", "transformer", "classifier") as the hand-written
# Pipeline it replaces. Prints the per-fold scores (no mean is taken).
# print(...) with one argument works on both Python 2 and Python 3.
print(cross_val_score(
    text_classifier(
        CountVectorizer(),
        TruncatedSVD(n_components=100),
        RandomForestClassifier(n_estimators=100),
    ),
    texts,
    labels,
))

[ 0.71407186  0.7042042   0.72222222]
Wall time: 40.8 s


Больше компонент и больше деревьев:

In [19]:
%%time
# CountVectorizer -> TruncatedSVD(1000 components) -> random forest of 1000
# trees; prints the mean of the per-fold scores.
# print(...) with one argument works on both Python 2 and Python 3.
print(cross_val_score(
    text_classifier(
        CountVectorizer(),
        TruncatedSVD(n_components=1000),
        RandomForestClassifier(n_estimators=1000),
    ),
    texts,
    labels,
).mean())

0.72100393807
Wall time: 9min 11s


Tf*Idf вместо частот слов:

In [18]:
%%time
# TfidfVectorizer -> TruncatedSVD(1000 components) -> random forest of 1000
# trees; prints the mean of the per-fold scores.
# print(...) with one argument works on both Python 2 and Python 3.
print(cross_val_score(
    text_classifier(
        TfidfVectorizer(),
        TruncatedSVD(n_components=1000),
        RandomForestClassifier(n_estimators=1000),
    ),
    texts,
    labels,
).mean())

0.590001678325
CPU times: user 4min 39s, sys: 14.3 s, total: 4min 53s
Wall time: 3min 52s


## Совмещаем Tf*Idf и SVD

In [20]:
from sklearn.pipeline import FeatureUnion

# Two transformers handed to FeatureUnion under the names 'tfidf' and 'svd':
# a TfidfTransformer and a TruncatedSVD with a single component.
estimators = [
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=1)),
]
combined = FeatureUnion(estimators)

In [21]:
%%time
# CountVectorizer -> `combined` feature union -> LinearSVC. Built with
# text_classifier, which produces the same three step names ("vectorizer",
# "transformer", "classifier") as the hand-written Pipeline it replaces.
# Prints the per-fold scores (no mean is taken).
# print(...) with one argument works on both Python 2 and Python 3.
print(cross_val_score(
    text_classifier(CountVectorizer(), combined, LinearSVC()),
    texts,
    labels,
))

[ 0.76197605  0.54054054  0.51951952]
Wall time: 28.7 s
