In [1]:
import collections
import nltk
import os
from sklearn import (
    datasets, model_selection, feature_extraction, linear_model
)

In [2]:
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency, i.e. the word must appear more than once.
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus

In [3]:
data_directory = 'movie_reviews'
movie_sentiment_data = datasets.load_files(data_directory, shuffle=True)

In [4]:
print('{} files loaded.'.format(len(movie_sentiment_data.data)))
print('They contain the following classes: {}.'.format(
    movie_sentiment_data.target_names))

2000 files loaded.
They contain the following classes: ['neg', 'pos'].


In [5]:
%%time
movie_tfidf = extract_features(movie_sentiment_data.data)

CPU times: user 16.7 s, sys: 219 ms, total: 17 s
Wall time: 16.6 s


In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42)

In [7]:
# similar to nltk.NaiveBayesClassifier.train()
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
print('Model performance: {}'.format(model.score(X_test, y_test)))

Model performance: 0.795


In [8]:
y_pred = model.predict(X_test)
for i in range(5):
    print('Review:\n{review}\n-\nCorrect label: {correct}; Predicted: {predict}'.format(
        review=X_test[i], correct=y_test[i], predict=y_pred[i]
    ))

Review:
  (0, 15495)	0.08562453929860635
  (0, 16075)	0.08562453929860635
  (0, 25519)	0.08562453929860635
  (0, 40333)	0.08562453929860635
  (0, 18366)	0.08562453929860635
  (0, 38388)	0.08562453929860635
  (0, 24341)	0.08123447317058612
  (0, 26423)	0.08123447317058612
  (0, 7356)	0.08123447317058612
  (0, 45613)	0.08123447317058612
  (0, 6315)	0.07372960555495599
  (0, 32613)	0.07061480406734608
  (0, 13836)	0.06535809670996791
  (0, 30560)	0.07206057987410651
  (0, 16379)	0.0574446056832854
  (0, 23315)	0.07061480406734608
  (0, 8667)	0.061834671811305635
  (0, 30000)	0.1183613465658554
  (0, 11075)	0.06716683059103849
  (0, 3958)	0.07570364394909454
  (0, 16080)	0.08123447317058612
  (0, 20129)	0.0781196716829762
  (0, 24675)	0.07570364394909454
  (0, 11492)	0.08123447317058612
  (0, 600)	0.07372960555495599
  :	:
  (0, 1758)	0.025522342680743385
  (0, 33212)	0.024105513200181974
  (0, 24785)	0.02667479194403957
  (0, 12601)	0.17172302155069424
  (0, 17573)	0.03364855947402738
  (