In [1]:
import random
import nltk
from nltk import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/dhanyaharish/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [4]:
cats = movie_reviews.categories()
reviews = []
for cat in cats:
    for fid in movie_reviews.fileids(cat):
        review = (list(movie_reviews.words(fid)),cat)
        reviews.append(review)
random.shuffle(reviews)

In [5]:
all_wd_in_reviews = nltk.FreqDist(wd.lower() for wd in movie_reviews.words())
top_wd_in_reviews = [list(wds) for wds in zip(*all_wd_in_reviews.most_common(2000))][0]

In [6]:
def ext_ft(review,top_words):
    review_wds = set(review)
    ft = {}
    for wd in top_words:
        ft['word_present({})'.format(wd)] = (wd in review_wds)
    return ft

In [7]:
featuresets = [(ext_ft(d,top_wd_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]

In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.835


In [9]:
classifier.show_most_informative_features(20)

Most Informative Features
word_present(outstanding) = True              pos : neg    =     12.5 : 1.0
     word_present(mulan) = True              pos : neg    =      8.9 : 1.0
    word_present(seagal) = True              neg : pos    =      7.4 : 1.0
word_present(wonderfully) = True              pos : neg    =      7.0 : 1.0
     word_present(damon) = True              pos : neg    =      6.1 : 1.0
    word_present(poorly) = True              neg : pos    =      6.0 : 1.0
      word_present(lame) = True              neg : pos    =      5.9 : 1.0
word_present(ridiculous) = True              neg : pos    =      5.8 : 1.0
    word_present(wasted) = True              neg : pos    =      5.4 : 1.0
     word_present(awful) = True              neg : pos    =      5.4 : 1.0
       word_present(era) = True              pos : neg    =      5.3 : 1.0
     word_present(waste) = True              neg : pos    =      5.2 : 1.0
     word_present(snake) = True              neg : pos    =      5.0 : 1

In [10]:
dict_vectorizer=None
def get_train_test(train_set,test_set):
    global dict_vectorizer
    dict_vectorizer = DictVectorizer(sparse=False)
    X_train, y_train = zip(*train_set)
    X_train = dict_vectorizer.fit_transform(X_train)
    X_test,y_test = zip(*test_set)
    X_test = dict_vectorizer.transform(X_test)
    return X_train,X_test,y_train,y_test

In [11]:
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [12]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.77


In [13]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
all_words_in_reviews = nltk.FreqDist(word.lower() for word in movie_reviews.words() if word not in stopwords_list)
top_words_in_reviews = [list(words) for words in zip(*all_words_in_reviews.most_common(2000))][0]

In [14]:
featuresets = [(ext_ft(d,top_words_in_reviews), c) for (d,c) in reviews]
train_set, test_set = featuresets[200:], featuresets[:200]
X_train,X_test,y_train,y_test = get_train_test(train_set,test_set)

In [15]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=4,random_state=10)
rf.fit(X_train,y_train)

RandomForestClassifier(n_jobs=4, random_state=10)

In [16]:
preds = rf.predict(X_test)
print(accuracy_score(y_test,preds))

0.755


In [17]:
features_list = zip(dict_vectorizer.get_feature_names(),rf.feature_importances_)
features_list = sorted(features_list, key=lambda x: x[1], reverse=True)
print(features_list[0:20])

[('word_present(bad)', 0.015529669630700889), ('word_present(stupid)', 0.0071365303260568615), ('word_present(boring)', 0.006867331151253918), ('word_present(worst)', 0.0065533881391988645), ('word_present(ridiculous)', 0.005154484285257659), ('word_present(mess)', 0.004947554136504527), ('word_present(awful)', 0.00490327846463197), ('word_present(script)', 0.004867138224172507), ('word_present(life)', 0.004552713486366785), ('word_present(lame)', 0.004502582842268086), ('word_present(supposed)', 0.004149222376007494), ('word_present(dull)', 0.004088775330999942), ('word_present(waste)', 0.003918416767064426), ('word_present(perfect)', 0.003911777398889316), ('word_present(outstanding)', 0.0036064039823212692), ('word_present(excellent)', 0.0035788148536847362), ('word_present(plot)', 0.0034848590622247547), ('word_present(great)', 0.0033338059595372305), ('word_present(allows)', 0.003069991691742157), ('word_present(memorable)', 0.0030285097003528217)]


