# Pos Neg Classifier
##### A simple classifier that predicts whether an input sentence is positive or negative.

In [1]:
# import nltk to process language data
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# import Counter
from collections import Counter

In [3]:
# import tensorflow for obvious reasons
import tensorflow as tf

In [4]:
# import numpy for data manipulation
import numpy as np

In [5]:
# Shared configuration.
# Seed passed as `random_state=` to the shuffle and train/test split cells.
random_state = 42
# One-hot targets: the first slot marks "positive", the second "negative".
pos_one_hot, neg_one_hot = [1.0, 0.0], [0.0, 1.0]

In [6]:
# Single WordNet lemmatizer instance shared by all later cells, which call
# `lemmatizer.lemmatize` on every token.
# NOTE(review): presumably needs the NLTK WordNet corpus to be available
# locally (e.g. through nltk.download) — confirm for fresh environments.
lemmatizer = WordNetLemmatizer()

In [7]:
# Paths of the two corpus files, relative to the working directory.
# Order matters: later cells read files[0] as the positive sentences and
# files[1] as the negative sentences.
files = ['./dataset/pos.txt', './dataset/neg.txt']

In [8]:
# Gather every token from both corpus files into one flat list.
# NOTE(review): tokens keep their original casing here, whereas the
# feature-extraction cells lower-case each line first — confirm the corpus
# is already lower-case, otherwise capitalised lexicon entries never match.
words_list = []

for path in files:
    # the context manager closes the file once its lines have been read
    with open(path, 'r') as handle:
        for sentence in handle.readlines():
            # split the sentence into tokens and append them all
            words_list.extend(word_tokenize(sentence))

In [9]:
# Run every collected token through the lemmatizer, keeping the original order.
lemmatized_words_list = list(map(lemmatizer.lemmatize, words_list))

In [10]:
# Count how often each lemma occurs across both files;
# `words_freq` maps lemma -> number of occurrences.
words_freq = Counter(lemmatized_words_list)

In [11]:
# Build the lexicon: one entry per lemma, keeping only mid-frequency ones.
# Lemmas counted 20 times or fewer, or 8000 times or more, are dropped
# (both bounds are exclusive).
lexicon = [
    lemma
    for lemma, occurrences in words_freq.items()
    if 20 < occurrences < 8000
]
len(lexicon)

1073

In [12]:
# Accumulators filled by the following cells:
#   X - one bag-of-words count vector per sentence
#   y - the matching one-hot label per sentence
X, y = [], []

In [13]:
# Update the feature and label lists with the sentences from pos.txt.
# Each line becomes a count vector with one slot per lexicon entry, holding
# how often that lemma occurs in the line; the label is `pos_one_hot`.
# Re-running this cell appends the same rows again, so re-run the cell that
# resets X and y first.

# Map lemma -> position once, so every token costs a single dict lookup
# instead of two linear scans (`in` + `.index`) over the lexicon.
# `setdefault` keeps the first position of a lemma, exactly as `list.index`.
lexicon_index = {}
for position, lemma in enumerate(lexicon):
    lexicon_index.setdefault(lemma, position)

with open(files[0], 'r') as f:
    for line in f:
        words = word_tokenize(line.lower())
        words = [lemmatizer.lemmatize(i) for i in words]

        features = np.zeros(len(lexicon))

        for word in words:
            # tokens outside the lexicon are ignored
            index = lexicon_index.get(word.lower())
            if index is not None:
                features[index] += 1

        X.append(list(features))
        y.append(pos_one_hot)

In [14]:
# Update the feature and label lists with the sentences from neg.txt.
# Each line becomes a count vector with one slot per lexicon entry, holding
# how often that lemma occurs in the line; the label is `neg_one_hot`.
# Re-running this cell appends the same rows again, so re-run the cell that
# resets X and y first.

# Map lemma -> position once, so every token costs a single dict lookup
# instead of two linear scans (`in` + `.index`) over the lexicon.
# `setdefault` keeps the first position of a lemma, exactly as `list.index`.
lexicon_index = {}
for position, lemma in enumerate(lexicon):
    lexicon_index.setdefault(lemma, position)

with open(files[1], 'r') as f:
    for line in f:
        words = word_tokenize(line.lower())
        words = [lemmatizer.lemmatize(i) for i in words]

        features = np.zeros(len(lexicon))

        for word in words:
            # tokens outside the lexicon are ignored
            index = lexicon_index.get(word.lower())
            if index is not None:
                features[index] += 1

        X.append(list(features))
        y.append(neg_one_hot)

In [15]:
# Turn the accumulated Python lists into NumPy arrays for the shuffle,
# split and training cells below.
# After this cell X and y no longer support `.append`, so the feature cells
# cannot be re-run without resetting X and y first.
X, y = np.array(X), np.array(y)

In [16]:
# Shuffle features and labels together so the positive rows (appended first)
# and the negative rows (appended second) are mixed.
# Passing the fixed `random_state` keeps the resulting order reproducible.
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=random_state)

In [17]:
# Hold out 8% of the rows as X_test / y_test; the rest becomes the
# training set. The fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.08,
    random_state=random_state,
)

In [18]:
# import keras to build our model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [19]:
# Build a fully connected classifier: four 1024-unit ReLU layers (the first
# three each followed by 50% dropout) and a 2-unit softmax output that lines
# up with the one-hot labels ([1, 0] = positive, [0, 1] = negative).
model = Sequential([
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dense(2, activation='softmax'),
])

# categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for three epochs. `validation_data` is given as an (inputs, targets)
# tuple, the form documented for Model.fit; a list is not guaranteed to be
# unpacked as (x, y) by every tf.keras version.
# NOTE(review): the held-out split doubles as the validation set here, so
# there is no separate untouched test set for the reported accuracy.
model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test))

Train on 9809 samples, validate on 853 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f07d9b605f8>

##### Model accuracy: 76.08%
##### More training data would likely yield higher accuracy.

In [20]:
# Print the layer-by-layer summary (output shapes and parameter counts).
# NOTE(review): the layers above are declared without an input shape, so this
# presumably only works once fit() has built the model — keep it after training.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              1099776   
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              1049600   
__________

In [21]:
# Classify a custom sentence, using the same preprocessing as the training
# data: lower-case, tokenize, lemmatize, then count lexicon hits.
line = 'You are doing good fam! Keep it up!'
words = word_tokenize(line.lower())
words = [lemmatizer.lemmatize(i) for i in words]

_features = np.zeros(len(lexicon))

for word in words:
    word = word.lower()
    # tokens outside the lexicon contribute nothing to the vector
    if word in lexicon:
        _features[lexicon.index(word)] += 1

# The model expects a batch, so reshape to (1, n_features). The width is taken
# from the lexicon instead of a hard-coded 1073, so this cell keeps working
# when the corpus or the frequency thresholds change the lexicon size.
_features = np.reshape(_features, (1, len(lexicon)))
# output row is [P(positive), P(negative)], matching the one-hot label order
model.predict(_features)

array([[0.52989733, 0.47010267]], dtype=float32)

##### The sentence above is classified as positive with about 53% confidence (0.5299).