In [21]:
from nltk.corpus import stopwords
import string

def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises; the
    # previous open/read/close sequence leaked it on error.
    with open(filename, 'r') as file:
        return file.read()

def clean_doc(doc):
    """Turn raw document text into a list of filtered word tokens.

    The text is split on whitespace and punctuation is stripped from each
    piece. A piece is kept only if it is purely alphabetic, is not in NLTK's
    English stop-word list, and is longer than one character.

    Args:
        doc: Document text as a single string.

    Returns:
        List of the surviving tokens, in their original order.
    """
    pieces = doc.split()
    strip_punct = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))
    cleaned = []
    for piece in pieces:
        word = piece.translate(strip_punct)
        # Stop words are matched after punctuation removal and without any
        # case folding, so the comparison is against the stripped form.
        if not word.isalpha():
            continue
        if word in english_stops:
            continue
        if len(word) > 1:
            cleaned.append(word)
    return cleaned

# Sanity check: run the load/clean pipeline on a single review from the `pos`
# folder and print the resulting tokens.
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that directory exists; consider a configurable data directory.
filename = r'C:\Users\danie\Desktop\txt_sentoken\pos\cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [22]:
from collections import Counter
from os import listdir

def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one document into ``vocab``.

    Args:
        filename: Path of the document to load.
        vocab: Object exposing ``update`` (a ``Counter`` in this notebook);
            it is modified in place.

    Returns:
        None.
    """
    vocab.update(clean_doc(load_doc(filename)))
    
def process_docs(directory, vocab, is_train):
    """Add every document of one data split in ``directory`` to ``vocab``.

    Files whose names start with ``'cv9'`` form the held-out split: they are
    skipped when ``is_train`` is true and are the only files processed when
    it is false.

    Args:
        directory: Folder whose entries are listed and loaded.
        vocab: Counter-like object passed on to ``add_doc_to_vocab``.
        is_train: Truthy to process the training split, falsy for the
            held-out split.

    Returns:
        None; ``vocab`` is updated in place.
    """
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # A file is skipped exactly when it belongs to the other split.
        if bool(is_train) == held_out:
            continue
        add_doc_to_vocab(directory + '/' + name, vocab)

# Build the token-frequency vocabulary from the training split (is_train=True)
# of both the `neg` and `pos` folders, then report its size and the 50 most
# frequent tokens.
# NOTE(review): absolute, machine-specific paths are hard-coded here.
vocab = Counter()
process_docs(r'C:\Users\danie\Desktop\txt_sentoken\neg',vocab,True)
process_docs(r'C:\Users\danie\Desktop\txt_sentoken\pos',vocab,True)

print(len(vocab))
print(vocab.most_common(50))

44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('bad', 1248), ('could', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [23]:
# Keep only tokens that occur at least `at_least` times in the counted corpus.
# NOTE: this rebinds the global `tokens` (previously the token list of a
# single review) to the filtered vocabulary list.
at_least = 2
tokens  = [k for k,c in vocab.items() if c >= at_least]
print(len(tokens))

25767


In [24]:
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    Items are joined with ``'\\n'``; no trailing newline is appended. Any
    existing file at ``filename`` is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Destination path (opened in text mode for writing).

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
        TypeError: If ``lines`` contains a non-string item.
    """
    data = '\n'.join(lines)
    # The context manager closes (and flushes) the handle even when write()
    # raises; the previous open/write/close sequence leaked it on error.
    with open(filename, 'w') as file:
        file.write(data)
    
# Persist the filtered vocabulary (the global `tokens` list) to disk, one
# token per line.
# NOTE(review): absolute, machine-specific output path.
save_list(tokens, r'C:\Users\danie\Desktop\vocab.txt')

In [32]:
def doc_to_line(filename, vocab):
    """Load a document and reduce it to one space-separated line of known words.

    Args:
        filename: Path of the document to load.
        vocab: Container of accepted tokens; tokens not in it are dropped.

    Returns:
        The document's cleaned, in-vocabulary tokens joined by single spaces,
        in their original order (an empty string if none survive).
    """
    kept = []
    for word in clean_doc(load_doc(filename)):
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)

def process_docs(directory, vocab, is_train):
    """Convert every document of one data split in ``directory`` to a line.

    Files whose names start with ``'cv9'`` form the held-out split: they are
    skipped when ``is_train`` is true and are the only files processed when
    it is false.

    NOTE: this definition shadows the earlier ``process_docs`` that updated a
    vocabulary counter and returned nothing; once this cell has run, that
    earlier behaviour is no longer reachable under this name.

    Args:
        directory: Folder whose entries are listed and loaded.
        vocab: Container of accepted tokens, forwarded to ``doc_to_line``.
        is_train: Truthy to process the training split, falsy for the
            held-out split.

    Returns:
        List with one space-joined token line per processed file, in the
        order ``listdir`` yields the files.
    """
    return [
        doc_to_line(directory + '/' + name, vocab)
        for name in listdir(directory)
        # Keep a file only when it belongs to the requested split.
        if bool(is_train) != name.startswith('cv9')
    ]
    

In [33]:
# Reload the saved vocabulary file and turn it into a set for fast membership
# tests.
# NOTE: this rebinds the global `vocab` from the earlier Counter to a set of
# token strings (str -> list -> set across the three assignments below).
vocab_filename = r'C:\Users\danie\Desktop\vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [35]:
# Encode the training split (is_train=True) of each class as vocabulary-filtered
# lines, one line per review, and report how many reviews each class produced.
positive_lines = process_docs(r'C:\Users\danie\Desktop\txt_sentoken\pos',vocab, True)
negative_lines = process_docs(r'C:\Users\danie\Desktop\txt_sentoken\neg',vocab, True)
print(len(positive_lines), len(negative_lines))

900 900


In [37]:
from keras.preprocessing.text import Tokenizer
# Fit the Keras Tokenizer on the training lines and encode them as a
# document-by-word matrix using the tokenizer's 'freq' mode.
tokenizer = Tokenizer()
# Row order matters for the labels built later: positive reviews come first,
# negative reviews second.
docs = positive_lines + negative_lines
tokenizer.fit_on_texts(docs)
Xtrain = tokenizer.texts_to_matrix(docs, mode='freq')
print(Xtrain.shape)

(1800, 25768)


In [38]:
# Encode the held-out split (is_train=False) with the tokenizer that was fitted
# on the training lines.
# NOTE: `positive_lines`, `negative_lines` and `docs` are overwritten here, so
# after this cell they refer to the held-out reviews, not the training ones.
positive_lines = process_docs(r'C:\Users\danie\Desktop\txt_sentoken\pos',vocab, False)
negative_lines = process_docs(r'C:\Users\danie\Desktop\txt_sentoken\neg',vocab, False)
# Same positive-first row order as the training matrix.
docs = positive_lines + negative_lines
Xtest = tokenizer.texts_to_matrix(docs, mode='freq')
print(Xtest.shape)

(200, 25768)


In [40]:
from numpy import array
# Number of input features = number of columns in the encoded matrices.
n_words = Xtest.shape[1]
# Class labels aligned with the row order of Xtrain / Xtest.
# NOTE(review): the matrices were built from `positive_lines + negative_lines`,
# so with these arrays label 0 = positive and label 1 = negative — the reverse
# of the usual 1 = positive convention. Confirm this is intended; otherwise
# swap the 0/1 runs (or the concatenation order of the docs).
# NOTE(review): the counts 900 / 100 are hard-coded and must match the number
# of reviews per class in each split.
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])

In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# Small feed-forward binary classifier over the bag-of-words features:
# one 50-unit ReLU hidden layer followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(50, input_shape=(n_words,), activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 50 epochs on the training matrix.
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs.
model.fit(Xtrain, ytrain, epochs = 50, verbose =2)

# Evaluate on the held-out matrix and report accuracy as a percentage.
loss, acc = model.evaluate(Xtest, ytest, verbose = 0)
print('Test Accuracy: %f' % (acc*100))

Epoch 1/50
1s - loss: 0.6916 - acc: 0.5428
Epoch 2/50
1s - loss: 0.6818 - acc: 0.7156
Epoch 3/50
1s - loss: 0.6613 - acc: 0.7139
Epoch 4/50
1s - loss: 0.6306 - acc: 0.9317
Epoch 5/50
1s - loss: 0.5911 - acc: 0.9378
Epoch 6/50
1s - loss: 0.5470 - acc: 0.9439
Epoch 7/50
1s - loss: 0.5005 - acc: 0.9522
Epoch 8/50
1s - loss: 0.4535 - acc: 0.9578
Epoch 9/50
1s - loss: 0.4093 - acc: 0.9611
Epoch 10/50
1s - loss: 0.3689 - acc: 0.9678
Epoch 11/50
1s - loss: 0.3306 - acc: 0.9739
Epoch 12/50
1s - loss: 0.2971 - acc: 0.9767
Epoch 13/50
1s - loss: 0.2658 - acc: 0.9833
Epoch 14/50
1s - loss: 0.2384 - acc: 0.9844
Epoch 15/50
1s - loss: 0.2152 - acc: 0.9867
Epoch 16/50
1s - loss: 0.1938 - acc: 0.9906
Epoch 17/50
1s - loss: 0.1744 - acc: 0.9928
Epoch 18/50
1s - loss: 0.1579 - acc: 0.9933
Epoch 19/50
1s - loss: 0.1432 - acc: 0.9939
Epoch 20/50
1s - loss: 0.1297 - acc: 0.9956
Epoch 21/50
1s - loss: 0.1173 - acc: 0.9961
Epoch 22/50
1s - loss: 0.1067 - acc: 0.9978
Epoch 23/50
1s - loss: 0.0974 - acc: 0.99

In [42]:
def predict_sentiment(review, vocab, tokenizer, model):
    """Predict the class label of a single free-text review.

    The review is cleaned with ``clean_doc``, restricted to words in
    ``vocab``, encoded with the fitted tokenizer in ``'freq'`` mode, and
    scored by the model.

    Args:
        review: Raw review text.
        vocab: Container of accepted tokens; other tokens are dropped
            before encoding.
        tokenizer: Fitted tokenizer exposing ``texts_to_matrix``.
        model: Trained model exposing ``predict``.

    Returns:
        The model's output for the review rounded to the nearest whole
        number (0 or 1 for a sigmoid output). With the label arrays used in
        this notebook (rows are positive-first, labels are 0 then 1),
        0 corresponds to positive and 1 to negative.
    """
    tokens = clean_doc(review)
    # Fix: the filtered list used to be bound to a misspelled name (`token`)
    # and the unfiltered `tokens` was joined, so the vocabulary filter never
    # took effect.
    tokens = [w for w in tokens if w in vocab]
    line = ' '.join(tokens)
    encoded = tokenizer.texts_to_matrix([line], mode='freq')
    predict = model.predict(encoded, verbose=0)
    return round(predict[0,0])

In [44]:
# Score a clearly positive review. Under this notebook's label encoding
# (positive rows are labelled 0, negative rows 1) the expected result is 0.
positive = 'The best movie I have ever seen'
print(predict_sentiment(positive,vocab, tokenizer, model))

0.0


In [45]:
# Score a clearly negative review. Under this notebook's label encoding
# (positive rows are labelled 0, negative rows 1) the expected result is 1.
negative = 'This is a bad movie. The worst.'
print(predict_sentiment(negative, vocab, tokenizer, model))

1.0
