In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
train_pos_files= os.listdir("train/pos/")
train_neg_files= os.listdir("train/neg")
test_pos_files= os.listdir("test/pos/")
test_neg_files= os.listdir("test/neg")

In [3]:
train_reviews=[]
for  pfile in train_pos_files:
    with open("train/pos/"+ pfile,encoding="latin1") as f:
        train_reviews.append(f.read())
for  nfile in train_neg_files:
    with open("train/neg/"+ nfile,encoding="latin1") as f:
        train_reviews.append(f.read())

In [4]:
test_reviews=[]
for  pfile in test_pos_files:
    with open("test/pos/"+ pfile,encoding="latin1") as f:
        test_reviews.append(f.read())
for  nfile in test_neg_files:
    with open("test/neg/"+ nfile,encoding="latin1") as f:
        test_reviews.append(f.read())

In [5]:
print(len(train_reviews))
print(len(test_reviews))

25000
25000


remove stopwords and remove common punctuations

In [37]:
import re
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]*")

def preprocess_reviews(reviews):
    out=[]
    reviews = [REPLACE_NO_SPACE.sub("", review.lower()) for review in reviews]
#     for review in reviews:
#         out.append(" ".join([word for word in review.split() if word not in stopWords]))
    return [" ".join([word for word in review.split() if word not in stopWords]) for review in reviews]


reviews_train_clean = preprocess_reviews(train_reviews)
reviews_test_clean = preprocess_reviews(test_reviews)

In [38]:
len(reviews_train_clean)

25000

In [39]:
reviews_train_clean[:3]

['movie gets respect sure lot memorable quotes listed gem imagine movie joe piscopo actually funny maureen stapleton scene stealer moroni character absolute scream watch alan skipper hale jr police sgt',
 'bizarre horror movie filled famous faces stolen cristina raines later tvs flamingo road pretty somewhat unstable model gummy smile slated pay attempted suicides guarding gateway hell scenes raines modeling well captured mood music perfect deborah raffin charming cristinas pal raines moves creepy brooklyn heights brownstone inhabited blind priest top floor things really start cooking neighbors including fantastically wicked burgess meredith kinky couple sylvia miles & beverly dangelo diabolical lot eli wallach great fun wily police detective movie nearly cross-pollination rosemarys baby exorcist--but combination based best-seller jeffrey konvitz sentinel entertainingly spooky full shocks brought well director michael winner mounts thoughtfully downbeat ending skill ***1/2 ****',
 'sol

In [40]:
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)

In [41]:
stemmed_reviews[:3]

['movi get respect sure lot memor quot list gem imagin movi joe piscopo actual funni maureen stapleton scene stealer moroni charact absolut scream watch alan skipper hale jr polic sgt',
 'bizarr horror movi fill famou face stolen cristina rain later tv flamingo road pretti somewhat unstabl model gummi smile slate pay attempt suicid guard gateway hell scene rain model well captur mood music perfect deborah raffin charm cristina pal rain move creepi brooklyn height brownston inhabit blind priest top floor thing realli start cook neighbor includ fantast wick burgess meredith kinki coupl sylvia mile & beverli dangelo diabol lot eli wallach great fun wili polic detect movi nearli cross-pollin rosemari babi exorcist--but combin base best-sel jeffrey konvitz sentinel entertainingli spooki full shock brought well director michael winner mount thought downbeat end skill ***1/2 ****',
 'solid unremark film matthau einstein wonder favorit part thing would make go way see wonder scene physicist pl

linear classifiers typically perform better on sparse data(lots of zeroes). we can also try SVM with linear kernel on this type of data.

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.87744
Accuracy for C=0.05: 0.88448
Accuracy for C=0.25: 0.88624
Accuracy for C=0.5: 0.88576
Accuracy for C=1: 0.88544


In the above, we used a binary count to tell if a word is present or not, next we will actually count the frequency of each word. This has increase our accuracy by 1%

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=False, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.88592
Accuracy for C=0.05: 0.8944
Accuracy for C=0.25: 0.89424
Accuracy for C=0.5: 0.89424
Accuracy for C=1: 0.89344


In [46]:
X.shape

(25000, 1922798)

for c=0.25, acc=max; .886

We will use tf-idf for feature respresentations. tf-idf aims to represent the number of times a given word appears in a document (a movie review in our case) relative to the number of documents in the corpus that the word appears in — where words that appear in many documents have a value closer to zero and words that appear in less documents have values closer to 1.

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

# Accuracy for C=0.01: 0.79632
# Accuracy for C=0.05: 0.83168
# Accuracy for C=0.25: 0.86768
# Accuracy for C=0.5: 0.8736
# Accuracy for C=1: 0.88432



Accuracy for C=0.01: 0.79888
Accuracy for C=0.05: 0.84064
Accuracy for C=0.25: 0.87696
Accuracy for C=0.5: 0.88848
Accuracy for C=1: 0.89472
Final Accuracy: 0.88388


In [49]:
final_model = LogisticRegression(C=1)
final_model.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_model.predict(X_test)))



Final Accuracy: 0.88388


In [51]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=False, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
# Accuracy for C=0.01: 0.89104
# Accuracy for C=0.05: 0.88736
# Accuracy for C=0.25: 0.8856
# Accuracy for C=0.5: 0.88608
# Accuracy for C=1: 0.88592
    
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm_ngram.predict(X_test)))



Accuracy for C=0.01: 0.89104
Accuracy for C=0.05: 0.88704
Accuracy for C=0.25: 0.88512




Accuracy for C=0.5: 0.8848
Accuracy for C=1: 0.88496
Final Accuracy: 0.88644


for us, logistic regressioon with count vectorizer binary=False with n-grams (1,2) worked best.