In [0]:
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow_hub as hub
import tensorflow as tf

In [5]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on.
# The call is the last expression of the cell, so its return value is shown as the cell output.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#upload text file (data set)
# Colab-only, interactive step: opens a file picker in the browser.
# `uploaded` is indexed by file name further down and each value is decoded from bytes there.
from google.colab import files
uploaded = files.upload()


Saving imdb_labelled.txt to imdb_labelled (1).txt


In [0]:
#file name = imdb_labelled.txt
# Decode the uploaded bytes and keep one entry per non-blank line.
# Filtering (instead of data.remove('')) does not raise when there is no empty
# entry, drops every blank/whitespace-only line rather than just the first one,
# and the rstrip removes a stray '\r' left behind by Windows line endings.
raw_text = uploaded['imdb_labelled.txt'].decode("utf-8")
data = [line.rstrip("\r") for line in raw_text.split("\n") if line.strip()]

In [0]:
# Containers for the cleaned reviews and their integer labels.
reviews, labels = [], []

# English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# r'\w+' keeps runs of word characters only, which discards punctuation.
tokenizer = RegexpTokenizer(r'\w+')

In [0]:
# Start from empty lists so that re-running this cell rebuilds the data
# instead of appending every review and label a second time.
reviews = []
labels = []
for review in data:
    # each line holds the review text and its label, separated by a tab
    fields = review.split('\t')
    #remove punctuations and stop words from each review
    tokens = tokenizer.tokenize(fields[0].lower())
    reviews.append([w for w in tokens if w not in stop_words])
    #spliting reviews and labels
    labels.append(int(fields[1]))

In [0]:
# connect all the words of each review to create a sentence without punctuations and stop words
# str.join replaces the manual "+= word + ' '" loop and the trailing-space slice.
# Entries that are no longer token lists are passed through unchanged, so running
# this cell twice does not break already-joined sentences into single characters.
reviews = [
    ' '.join(tokens) if isinstance(tokens, list) else tokens
    for tokens in reviews
]

In [0]:
#elmo embedding
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Build the embedding graph a single time. Calling elmo(...) inside the function
# would add a fresh copy of the ops to the default graph on every call.
_elmo_sentences = tf.placeholder(tf.string, shape=[None])
_elmo_mean = tf.reduce_mean(
    elmo(_elmo_sentences, signature="default", as_dict=True)["elmo"], 1)

# Created lazily on the first call and then reused, so the ELMo variables and
# lookup tables are initialised once instead of once per review.
_elmo_session = None


def elmo_vectors(x):
  """Return the averaged ELMo features for one sentence.

  Args:
    x: a single sentence as a string; it is fed to the module as a batch of one.

  Returns:
    The result of averaging the module's "elmo" output over axis 1. Because the
    batch holds one sentence, callers take element [0] to get the 1-d vector.
  """
  global _elmo_session
  if _elmo_session is None:
    _elmo_session = tf.Session()
    _elmo_session.run(tf.global_variables_initializer())
    _elmo_session.run(tf.tables_initializer())
  # return average of ELMo features
  return _elmo_session.run(_elmo_mean, feed_dict={_elmo_sentences: [x]})

In [0]:
# Holds one embedding vector per review.
vectors = list()

In [0]:
# turn every pre-proccessed review into a vector using elmo function
# elmo_vectors embeds a batch of one sentence, so [0] picks the 1-d array vector.
# Rebuilding the list here (rather than appending to one created in another cell)
# keeps len(vectors) == len(reviews) even when this cell is run more than once.
vectors = [elmo_vectors(review_text)[0] for review_text in reviews]

In [0]:
# Fixed seed so the split, and therefore every metric reported below, is
# reproducible across runs; stratify keeps the label proportions the same in
# the train and test parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    vectors, labels, random_state=42, stratify=labels)

In [0]:
#implementing Naive Bayes classifier
# Build the model, fit it on the training split, then predict the held-out split.
# Y_pred is assigned again by the Logistic Regression cell further down.
clf_nb = GaussianNB()
clf_nb.fit(X_train, Y_train)
Y_pred = clf_nb.predict(X_test)

In [0]:
# Report accuracy, error rate, F1 and the confusion matrix for whatever
# predictions are currently stored in Y_pred.
accu = accuracy_score(Y_test, Y_pred)
print('Accuracy : {}'.format(accu))
print('Error : {}'.format(1 - accu))
print('F1_Score : {}'.format(f1_score(Y_test, Y_pred)))
print('Confusion matrix : \n{}'.format(confusion_matrix(Y_test, Y_pred)))

In [0]:
#implementing Logistic Regression classifier
# Build the model, fit it on the training split, then predict the held-out split.
# This overwrites the Y_pred produced by the Naive Bayes cell.
clf_lr = LogisticRegression()
clf_lr.fit(X_train, Y_train)
Y_pred = clf_lr.predict(X_test)

In [0]:
# Same report as for the previous classifier, computed from the current Y_pred.
accu = accuracy_score(Y_test, Y_pred)
print('Accuracy : {score}'.format(score=accu))
print('Error : {err}'.format(err=1 - accu))
print('F1_Score : {f1}'.format(f1=f1_score(Y_test, Y_pred)))
print('Confusion matrix : \n{cm}'.format(cm=confusion_matrix(Y_test, Y_pred)))

In [0]:
# In this case Naive Bayes performs better than Logistic Regression, most likely because Naive Bayes tends to work better with small training data sets;
# as the training set size grows towards infinity, Logistic Regression is expected to outperform Naive Bayes (this depends on the data set).