In [None]:
import string

import numpy as np
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import gensim
from gensim.models import Word2Vec, KeyedVectors

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Load the IMDB reviews dataset from CSV (input to the text preprocessing below)
reviews = pd.read_csv("imdbReviews.csv")

In [None]:
# Preview the first rows to check which columns were loaded
reviews.head()

Unnamed: 0,Index,URL,Text,Sentiment
0,3617,http://www.imdb.com/title/tt0210075/usercomments,Girlfight follows a project dwelling New York ...,POS
1,3671,http://www.imdb.com/title/tt0337640/usercomments,Hollywood North is an euphemism from the movie...,POS
2,3157,http://www.imdb.com/title/tt0303549/usercomments,That '70s Show is definitely the funniest show...,POS
3,660,http://www.imdb.com/title/tt0716825/usercomments,"9/10- 30 minutes of pure holiday terror. Okay,...",POS
4,265,http://www.imdb.com/title/tt0182225/usercomments,"A series of random, seemingly insignificant th...",POS


In [None]:
# Count how many reviews fall into each sentiment class (class balance check)
reviews['Sentiment'].value_counts()

POS    1000
NEG    1000
Name: Sentiment, dtype: int64

In [None]:
# Save the labels and encode them as integers for future classification/clustering.
# LabelEncoder numbers the classes in sorted order, so NEG -> 0 and POS -> 1;
# the printed sample confirms that POS rows are encoded as 1.
# (LabelEncoder is imported in the notebook's import cell.)
enc = LabelEncoder()
label = enc.fit_transform(reviews['Sentiment'])
print(label[:10])
print(reviews['Sentiment'][:10])

[1 1 1 1 1 1 1 1 1 1]
0    POS
1    POS
2    POS
3    POS
4    POS
5    POS
6    POS
7    POS
8    POS
9    POS
Name: Sentiment, dtype: object


In [None]:
# Cast the Text column to pandas' dedicated 'string' dtype
reviews = reviews.astype({'Text':'string'})

In [None]:
# Pull out the review text column; this is the input to the cleaning steps
text = reviews['Text']

In [None]:
# Clean every review: delete punctuation characters and digits, then lowercase.
# One translation table covers both character classes, so each review is
# scanned once instead of twice.
_strip_chars = str.maketrans('', '', string.punctuation + string.digits)

text1 = [raw_review.translate(_strip_chars).lower() for raw_review in text]

In [None]:
# Wrap the cleaned reviews in a Series so the following steps can use .apply
text1 = pd.Series(text1)

In [None]:
# Remove English stop words (NLTK's list) from every review
stop_words = set(stopwords.words('english'))


def _without_stop_words(doc):
    """Return `doc` with every whitespace-separated stop word dropped."""
    kept_terms = [term for term in doc.split() if term not in stop_words]
    return ' '.join(kept_terms)


text1 = text1.apply(_without_stop_words)
text1[:5]

0    girlfight follows project dwelling new york hi...
1    hollywood north euphemism movie industry went ...
2    show definitely funniest show currently tv sta...
3    minutes pure holiday terror okay scary sure fu...
4    series random seemingly insignificant thefts s...
dtype: object

In [None]:
# Reduce every word to its stem with NLTK's Porter stemmer
ps = nltk.PorterStemmer()


def _stem_terms(doc):
    """Return `doc` with each whitespace-separated term replaced by its stem."""
    return ' '.join(map(ps.stem, doc.split()))


text1 = text1.apply(_stem_terms)

In [None]:
# Pair each preprocessed review with its encoded sentiment label
reviews1 = list(zip(text1, label))

In [None]:
# Collect the (review, label) pairs into a two-column DataFrame
reviewsP = pd.DataFrame (reviews1, columns = ['Review', 'Sentiment'])

In [None]:
# Shuffle all rows (frac=1) with a fixed seed so the order is repeatable, then
# renumber them from 0. NOTE(review): reset_index() is called without drop=True,
# so the pre-shuffle index is kept as an extra 'index' column - confirm that
# column is wanted.
reviewsP1 = reviewsP.sample(frac=1, random_state=1).reset_index()

In [None]:
# Hold-out split of the shuffled data: the first 1400 rows are used for
# training and the remaining rows for testing.
n_train = 1400

review_col = reviewsP1.Review
sentiment_col = reviewsP1.Sentiment

# training portion
train_reviews, train_sentiments = review_col.iloc[:n_train], sentiment_col.iloc[:n_train]

# testing portion
test_reviews, test_sentiments = review_col.iloc[n_train:], sentiment_col.iloc[n_train:]

print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(1400,) (1400,)
(600,) (600,)


In [None]:
# Inspect a few preprocessed training reviews
train_reviews.head()

0    superb episod one best season right horror cha...
1    metamorphosi work way chill classic movi pack ...
2    spoil lame south border adventur movi someth b...
3    actual stop dont get wrong love bad monster mo...
4    intent director film quit honor histori produc...
Name: Review, dtype: object

In [None]:
# Tokenise the training reviews: split each one on whitespace into a list of words
tokenized_reviews = train_reviews.map(str.split)

In [None]:
# Learn word vectors from the tokenised training reviews with Word2Vec.
# NOTE(review): per the gensim docs, passing the corpus to the constructor
# builds the vocabulary and already trains the model (default epoch count),
# so the separate model.train(...) call in the next cell adds further epochs
# on top - confirm that is intended.
# NOTE(review): gensim documents that runs are only fully reproducible with a
# single worker thread; with workers=32 the seed alone does not guarantee
# identical vectors - confirm whether reproducibility matters here.
model = gensim.models.Word2Vec(
            tokenized_reviews,
            vector_size=100, # dimensionality of each word vector (must match embeddingsSize used later)
            window=5,        # context window size
            min_count=2,     # ignores all words with total frequency lower than 2
            sg = 1,          # 1 selects the skip-gram architecture
            hs = 0,          # hierarchical softmax disabled; negative sampling is used instead
            negative = 10,   # number of negative samples drawn per positive example
            workers= 32,     # number of worker threads used for training
            seed = 34        # seed for the random number generator
)


In [None]:
# Train the Word2Vec model for 20 epochs over the tokenised training reviews.
# NOTE(review): the constructor was already given this corpus, so these epochs
# are presumably in addition to the training done there - confirm.
model.train(tokenized_reviews, total_examples= len(train_reviews), epochs=20)

(3344882, 3767020)

In [None]:
embeddingsSize=100

def getVectors(dataset, wv=None, vectorSize=None):
  """Return one averaged word-embedding vector per item in `dataset`.

  Each item is either a raw text string (split on whitespace into words) or
  an already tokenised iterable of words. The vectors of all in-vocabulary
  words of an item are averaged; an item with no in-vocabulary word gets an
  all-zero vector.

  Parameters
  ----------
  dataset : iterable of str or iterable of iterables of str
      The documents to embed.
  wv : mapping-like, optional
      Object supporting `word in wv` and `wv[word]` (for example gensim
      KeyedVectors). Defaults to the `wv` of the notebook's trained `model`.
  vectorSize : int, optional
      Length of the embedding vectors. Defaults to `wv.vector_size` when that
      attribute exists, otherwise to `embeddingsSize`.

  Returns
  -------
  list of numpy.ndarray
      One vector of length `vectorSize` per item, in input order.
  """
  if wv is None:
    wv = model.wv
  if vectorSize is None:
    vectorSize = getattr(wv, 'vector_size', embeddingsSize)

  vectors=[]
  for dataItem in dataset:
    # Iterating a str directly yields single characters, so raw review text
    # has to be split into words before the vocabulary lookup.
    tokens = dataItem.split() if isinstance(dataItem, str) else dataItem

    # Fresh accumulator for every item; otherwise the sum of earlier items
    # would leak into later ones.
    embeddingSum = np.zeros(vectorSize)
    wordCount = 0
    for word in tokens:
      if word in wv:
        # Add the embedding vector itself, not the word's integer vocabulary index.
        embeddingSum = embeddingSum + np.asarray(wv[word], dtype=float)
        wordCount += 1

    # Only average when at least one word was found, avoiding division by zero.
    if wordCount:
      embeddingSum = embeddingSum / wordCount
    vectors.append(embeddingSum)
  return vectors

In [None]:
# Build one feature vector per review for the train and test sets with getVectors
trainReviewVectors=getVectors(train_reviews)
testReviewVectors=getVectors(test_reviews)

In [None]:
############################################
###           Decision Tree              ###
############################################
# Fit an entropy-based decision tree on the training review vectors
# (random_state fixed so the tree is repeatable).
DT=DecisionTreeClassifier(criterion ='entropy', random_state= 0)

DT=DT.fit(trainReviewVectors,train_sentiments)

# Predict the sentiment of the held-out reviews
DT_predict=DT.predict(testReviewVectors)


# The sentiments were encoded with LabelEncoder, which numbers classes in
# sorted order: NEG -> 0, POS -> 1. target_names must follow that label order,
# otherwise the report rows are labelled with the wrong class.
DT_report=classification_report(test_sentiments,DT_predict,labels=[0, 1],target_names=['Negative','Positive'])
print(confusion_matrix(test_sentiments,DT_predict), '\n')
print(DT_report)

[[163 143]
 [141 153]] 

              precision    recall  f1-score   support

    Positive       0.54      0.53      0.53       306
    Negative       0.52      0.52      0.52       294

    accuracy                           0.53       600
   macro avg       0.53      0.53      0.53       600
weighted avg       0.53      0.53      0.53       600

