In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import warnings

import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

import spacy
import re

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings('ignore')

In [2]:
#https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [3]:
imdb = pd.read_csv('data/IMDB Dataset.csv')
imdb.columns = ['review', '-sentiment-']

In [4]:
imdb.head()

Unnamed: 0,review,-sentiment-
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review       50000 non-null  object
 1   -sentiment-  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
def clean_text(doc):
    # tokenize
    doc_words = word_tokenize(doc)

    # lowercase
    doc_lower = [word.lower() for word in doc_words]

    # remove stop words
    doc_nostop = [token for token in doc_lower if token not in stopwords.words('english')]

    # remove punctuation
    doc_nopunc = [word for word in doc_nostop if word.isalpha()]

    # stem the tokens
    stemmer = SnowballStemmer('english')
    doc_stem = [stemmer.stem(word) for word in doc_nopunc]

    return_doc = ' '.join(doc_stem)

    return return_doc

In [7]:
# Split into positive and negative documents
pos_doc = ''
neg_doc = ''
for index, row in imdb[:2000].iterrows():
    if row['-sentiment-'] == 'positive':
        pos_doc = pos_doc + row['review']
    else:
        neg_doc = neg_doc + row['review']

In [8]:
# Parse the reviews
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2000000
pos_doc = nlp(pos_doc)
neg_doc = nlp(neg_doc)

In [9]:
# Group into sentences
pos_sents = [[sent, "positive"] for sent in pos_doc.sents]
neg_sents = [[sent, "negative"] for sent in neg_doc.sents]

# Combine the sentences from the two docs into one DataFrame
sentences = pd.DataFrame(pos_sents[:2000] + neg_sents[:2000], columns = ["text", "-sentiment-"])

In [10]:
sentences.head()

Unnamed: 0,text,-sentiment-
0,"(One, of, the, other, reviewers, has, mentione...",positive
1,"(They, are, right, ,, as, this, is, exactly, w...",positive
2,"(/>The, first, thing, that, struck, me, about,...",positive
3,"(Trust, me, ,, this, is, not, a, show, for, th...",positive
4,"(This, show, pulls, no, punches, with, regards...",positive


In [11]:
# clean the text
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop and token.is_alpha])


In [12]:
# Analyze the BOW data
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
bow_sentences = pd.concat([bow_df, sentences[["text", "-sentiment-"]]], axis=1)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = bow_sentences['-sentiment-']
X = np.array(bow_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9490625

Test set score: 0.6875
----------------------Random Forest Scores----------------------
Training set score: 0.9871875

Test set score: 0.65625
----------------------Gradient Boosting Scores----------------------
Training set score: 0.686875

Test set score: 0.59


In [14]:
# Use TF-IDF to convert text feature
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm=u'l2', smooth_idf=True)
X = vectorizer.fit_transform(sentences["text"])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tfidf_sentences = pd.concat([bow_df, sentences[["text", "-sentiment-"]]], axis=1)

In [15]:
tfidf_sentences.head()

Unnamed: 0,aaliyah,abandon,abbot,abbott,abbreviate,abet,abhorrent,abide,abiding,ability,...,ziyi,zombie,zombies,zone,zoo,zoom,zulu,zwick,text,-sentiment-
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,reviewer mention watch Oz episode hook,positive
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,right exactly happen,positive
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,thing strike Oz brutality unflinche scene viol...,positive
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,trust faint hearted timid,positive
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,pull punch regard drug sex violence,positive


In [16]:
# Analyze the TF-IDF data
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = tfidf_sentences['-sentiment-']
X = np.array(tfidf_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9604166666666667

Test set score: 0.673125
----------------------Random Forest Scores----------------------
Training set score: 0.99

Test set score: 0.614375
----------------------Gradient Boosting Scores----------------------
Training set score: 0.7179166666666666

Test set score: 0.584375


In [17]:
# Use Word2Vec to convert text feature
import gensim

# Train word2vec on the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=0.5,
    window=12,
    sg=0,
    sample=0.001,
    size=100,
    hs=1
)

In [18]:
word2vec_arr = np.zeros((sentences.shape[0], 100))

for i, sentence in enumerate(sentences["text"]):
    word2vec_arr[i,:] = np.mean([model[lemma] for lemma in sentence], axis=0)

word2vec_arr = pd.DataFrame(word2vec_arr)
w2v_sentences = pd.concat([sentences[["-sentiment-", "text"]],word2vec_arr], axis=1)
w2v_sentences.dropna(inplace=True)

w2v_sentences.head()

Unnamed: 0,-sentiment-,text,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,positive,reviewer mention watch Oz episode hook,-0.151249,-0.353188,0.065817,-0.032253,0.008561,-0.006188,0.046282,-0.09232,...,-0.050658,0.053701,0.031562,-0.004698,0.071064,-0.069873,-0.024052,0.033327,0.036499,-0.026365
1,positive,right exactly happen,-0.163511,-0.44006,0.133467,-0.057674,-0.02522,-0.060228,0.01846,-0.098982,...,-0.022171,0.023984,0.033788,0.024046,0.081334,0.045124,-0.010001,0.040297,-0.00904,-0.035455
2,positive,thing strike Oz brutality unflinche scene viol...,-0.155503,-0.412025,0.117487,-0.055959,0.007992,-0.04363,0.038388,-0.115279,...,-0.021589,0.025454,0.032383,0.020385,0.076433,0.017668,-0.025647,0.034134,-0.011445,-0.025262
3,positive,trust faint hearted timid,-0.156114,-0.427978,0.131881,-0.045207,-0.045937,-0.075598,0.038368,-0.089455,...,-0.038107,0.005767,0.030569,0.036344,0.08923,0.052766,-0.033745,0.028816,-0.01196,-0.031888
4,positive,pull punch regard drug sex violence,-0.14644,-0.415308,0.122994,-0.074044,0.048729,-0.042702,0.039688,-0.141606,...,0.018393,0.01016,0.048882,0.02206,0.050674,0.091039,0.03409,0.018852,-0.069765,-0.042664


In [19]:
# Analyze the Word2Vec data
Y = w2v_sentences['-sentiment-']
X = np.array(w2v_sentences.drop(['text','-sentiment-'], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.5570698466780238

Test set score: 0.5549169859514687
----------------------Random Forest Scores----------------------
Training set score: 0.9948892674616695

Test set score: 0.558109833971903
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8343270868824532

Test set score: 0.5498084291187739
