### IMDB sentiment classification

In [1]:
import nltk
import pandas as pd
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

##### Get the data from https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews and save the CSV as `data/imdb.csv`

In [2]:
# Load the review dataset from the CSV file under the local data/ directory.
DATA_PATH = 'data/imdb.csv'
imdb_data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the columns that were loaded.
imdb_data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Encoding the labels

In [4]:
# Encode the string sentiment labels as integers: positive -> 1, negative -> 0.
sent_dict = {'positive': 1, 'negative': 0}
# Series.map performs an explicit lookup and yields a numeric Series; replace() on an
# object column relies on implicit downcasting that newer pandas versions deprecate.
y_sentiments = imdb_data.sentiment.map(sent_dict)
# Any label missing from sent_dict becomes NaN under map; fail loudly instead of
# silently training on it.
assert y_sentiments.notna().all(), "unexpected sentiment label(s) not present in sent_dict"

In [5]:
# Hold out 20% of the reviews for testing. stratify keeps the class balance the same in
# both splits, and random_state fixes the shuffle so the split (and every score computed
# from it) is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    imdb_data.review,
    y_sentiments,
    test_size=0.2,
    stratify=y_sentiments,
    random_state=0,
)

#### Computing bag of words

In [6]:
# Bag-of-words settings: NLTK's word_tokenize splits each review into tokens,
# English stop words are dropped, and terms appearing in fewer than 5 documents are ignored.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data installed locally - confirm.
bow_options = dict(
    tokenizer=word_tokenize,
    stop_words='english',
    min_df=5,
)
CV = CountVectorizer(**bow_options)

In [7]:
# Learn the vocabulary from the training reviews only and build their count matrix.
X_train_counts = CV.fit_transform(raw_documents=x_train)

In [8]:
# Vectorize the test reviews with the vocabulary already fitted on the training set.
X_test_counts = CV.transform(raw_documents=x_test)

#### Model training

In [9]:
# Baseline classifier: logistic regression fitted with the liblinear solver.
# Other estimators kept for reference (they would need their own imports):
#   RandomForestClassifier(max_depth=25, random_state=0)
#   GaussianNB()
clf = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=0,
)

In [10]:
# Fit the classifier on the training count matrix and the encoded labels.
clf.fit(X=X_train_counts, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Predict a sentiment label for each held-out review.
predicted = clf.predict(X=X_test_counts)

In [12]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predicted)

0.8859

#### Training a neural network model using Keras

In [13]:
from keras import models, layers

Using TensorFlow backend.


In [14]:
import tensorflow as tf

In [15]:
# Feed-forward binary classifier over the bag-of-words vector:
# three ReLU hidden layers (128 -> 64 -> 32) with dropout after the first two,
# followed by a single sigmoid unit as the output.
n_features = X_train_counts.shape[1]
model = models.Sequential([
    layers.Dense(128, activation="relu", input_shape=(n_features,)),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               4780032   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 4,790,401
Trainable param

In [16]:
# Compile for binary classification with the Adam optimizer.
# The loss is given by its string identifier so that it is resolved by the same Keras
# package that built the model, rather than mixing in a function from tf.keras.
# Accuracy is tracked so the training log reports it alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [17]:
# Train for a single pass (one epoch) over the training data.
model.fit(x=X_train_counts, y=y_train, epochs=1)


Epoch 1/1


<keras.callbacks.callbacks.History at 0x156b841e4e0>

In [18]:
# Sequential.predict_classes is deprecated and no longer available in recent
# Keras/TensorFlow releases. For a single sigmoid output it is equivalent to thresholding
# the predicted probability at 0.5, which is done explicitly here.
# ravel() flattens the (n_samples, 1) network output to 1-D so it lines up with y_test.
predicted = (model.predict(X_test_counts) > 0.5).astype("int32").ravel()

In [19]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predicted)

0.8998