In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
import os
from nltk import *

from nltk.corpus import sentence_polarity
import random
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

In [2]:
mycorpus = PlaintextCorpusReader('.', '.*\.txt')
Text = mycorpus.raw("new_file.txt")
Text = Text.replace("reviewText:","")
NewText = nltk.sent_tokenize(Text)

In [3]:
len(NewText)

1140642

In [5]:
sentences = sentence_polarity.sents()
documents = [(sent, cat) for cat in sentence_polarity.categories() for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

In [6]:
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(3000)
word_features = [word for (word,count) in word_items]

In [7]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [8]:
featuresets = [(document_features(a, word_features), b) for (a, b) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.779

In [9]:
Stopwords = nltk.corpus.stopwords.words('english')

In [164]:
pos_sent=[]
neg_sent=[]
for i in NewText:
    tokentext = nltk.word_tokenize(i)
    Words = [w.lower() for w in tokentext]
    RevisedWords = [w for w in Words if w.isalpha()]
    StoppedRevisedWords = [w for w in RevisedWords if w not in Stopwords]
    category = classifier.classify(document_features(StoppedRevisedWords,word_features))
    if (category == "pos"):
        pos_sent.append(i)
    else:
        neg_sent.append(i)

In [165]:
print(len(pos_sent))
print(len(neg_sent))

440573
700069


In [12]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [13]:
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = False
        features['contains(NOT{})'.format(word)] = False
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['contains(NOT{})'.format(document[i])] = (document[i] in word_features)
        else:
            features['contains({})'.format(word)] = (word in word_features)
    return features

In [14]:
NOT_featuresets = [(NOT_features(c, word_features, negationwords), d) for (c, d) in documents]
train_set1, test_set1 = NOT_featuresets[1000:], NOT_featuresets[:1000]
classifier1 = nltk.NaiveBayesClassifier.train(train_set1)
nltk.classify.accuracy(classifier1, test_set1)

0.794

In [267]:
pos_sent1=[]
neg_sent1=[]
for i in NewText:
    tokentext = nltk.word_tokenize(i)
    Words = [w.lower() for w in tokentext]
    RevisedWords = [w for w in Words if w.isalpha()]
    StoppedRevisedWords = [w for w in RevisedWords if w not in Stopwords]
    category = classifier1.classify(NOT_features(StoppedRevisedWords,word_features,negationwords))
    if (category == "pos"):
        pos_sent1.append(i)
    else:
        neg_sent1.append(i)

In [268]:
print(len(pos_sent1))
print(len(neg_sent1))

447447
693195


In [183]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB

classifier2 = SklearnClassifier(BernoulliNB()).train(train_set)
nltk.classify.accuracy(classifier2, test_set)

0.777

In [184]:
classifier3 = SklearnClassifier(BernoulliNB()).train(train_set1)
nltk.classify.accuracy(classifier3, test_set1)

0.782

In [269]:
pos_sent2=[]
neg_sent2=[]
for i in NewText:
    tokentext = nltk.word_tokenize(i)
    Words = [w.lower() for w in tokentext]
    RevisedWords = [w for w in Words if w.isalpha()]
    StoppedRevisedWords = [w for w in RevisedWords if w not in Stopwords]
    category = classifier3.classify(NOT_features(StoppedRevisedWords,word_features,negationwords))
    if (category == "pos"):
        pos_sent2.append(i)
    else:
        neg_sent2.append(i)

In [270]:
print(len(pos_sent2))
print(len(neg_sent2))

431938
708704


In [279]:
pos_sent1[30:35]

['Beautiful vibrant color.',
 'I bought several more colors!',
 'Nice and puffy tutu skirt.',
 'Bought this for my niece as part of her fairy outfit.',
 'I bought this for her for Christmas and she never wanted to take it off.']

In [282]:
neg_sent1[30:35]

["Can't recommend.",
 'Fits great and easy to  clean!',
 'Never GOT this item - but gave a 1 STAR because the replies from the SUPPLIER was GREAT.They tried to send the item more than once.My $ was refunded in a timely manner too.It was a shame I never got it for my daughter - it would of looked great with her OUTFIT for Dr. Seuss WEEK at school.Most original.Maybe next time.',
 'I would recommend this for girls under 10 yr. old.',
 'It will be too short and small for older girls.']

In [283]:
pos_sentences=open("pos_sentences.txt","w")
for line in pos_sent1:
    pos_sentences.write(line)
pos_sentences.close()

In [285]:
neg_sentences=open("neg_sentences.txt","w")
for line in neg_sent1:
    neg_sentences.write(line)
neg_sentences.close()