In [89]:
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional
from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split

In [110]:
df = pd.read_csv('./data/imdb.csv',delimiter=',', header=None)

In [111]:
df = df.sample(frac=1)

In [112]:
X = df[0]
Y = df[1]

In [113]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
if Y.dtype == 'int64':
    Y = np.array(Y, dtype='str')
Y

4422     negative
19416    positive
11133    negative
34501    positive
34856    negative
           ...   
21797    negative
8159     positive
38326    positive
10904    positive
6506     negative
Name: 1, Length: 50000, dtype: object

In [115]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

In [116]:
vocab_size = 5000 # make the top list of words (common words)
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # OOV = Out of Vocabulary
training_portion = .8

In [117]:

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [118]:

train_sequences = tokenizer.texts_to_sequences(X_train)


In [119]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [120]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [121]:
labels = set()
for l in Y_train:
    labels.add(l)
labels

{'negative', 'positive'}

In [122]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(Y_train))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(Y_test))
validation_label_seq

array([[2],
       [2],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [123]:

model = Sequential()

model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(6, activation='softmax'))

model.summary()

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 64)          0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 774       
Total params: 386,822
Trainable params: 386,822
Non-trainable params: 0
_________________________________________________________________


In [124]:
train_padded
training_label_seq

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [125]:

num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs,verbose=2)

Epoch 1/10
1329/1329 - 108s - loss: 0.6018 - accuracy: 0.6686
Epoch 2/10
1329/1329 - 119s - loss: 0.4445 - accuracy: 0.8035
Epoch 3/10
1329/1329 - 118s - loss: 0.3189 - accuracy: 0.8700
Epoch 4/10
1329/1329 - 127s - loss: 0.2887 - accuracy: 0.8845
Epoch 5/10
1329/1329 - 143s - loss: 0.2561 - accuracy: 0.8975
Epoch 6/10
1329/1329 - 143s - loss: 0.2378 - accuracy: 0.9062
Epoch 7/10
1329/1329 - 146s - loss: 0.2133 - accuracy: 0.9160
Epoch 8/10
1329/1329 - 158s - loss: 0.1975 - accuracy: 0.9230
Epoch 9/10
1329/1329 - 152s - loss: 0.1810 - accuracy: 0.9305
Epoch 10/10
1329/1329 - 149s - loss: 0.1657 - accuracy: 0.9366


In [126]:
validation_label_seq

array([[2],
       [2],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [127]:
pred = model.predict(validation_padded)
pred = pred.argmax(axis=-1)
pred


array([2, 2, 1, ..., 1, 1, 1])

In [128]:
test = []
for i in validation_label_seq:
    test.append(i[0])
test = np.asarray(test)
test

array([2, 2, 1, ..., 1, 1, 1])

In [129]:
metrics.accuracy_score(pred, test)

0.8738666666666667

In [131]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras import layers
df_imdb = pd.read_csv("data/imdb.csv", delimiter=',', header=None)
train_x, test_x, train_y, test_y = model_selection.train_test_split(df_imdb[0], df_imdb[1])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df_imdb[0])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf, train_y)
pred = cls.predict(xtest_tfidf)
print(metrics.accuracy_score(pred, test_y))

0.85064
