In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

import os
import re

In [None]:
# Read the raw reviews, one per line, stripping surrounding whitespace.
# The `with` blocks make sure both file handles are closed once the data is in
# memory; a bare `open(...)` inside a `for` statement never closes the file.
with open('./data/imdb-review/full_train.txt', 'r', encoding='utf8') as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('./data/imdb-review/full_test.txt', 'r', encoding='utf8') as test_file:
    reviews_test = [line.strip() for line in test_file]

In [None]:
# Show the raw text of the training review at index 5 as a spot check before cleaning.
reviews_train[5]

In [None]:
# Raw strings keep the regex escapes intact. Plain string literals containing
# sequences such as "\." or "\d" are invalid escapes that newer Python versions
# warn about.

# Punctuation characters and runs of digits that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]|\d+""")
# Doubled HTML line breaks, hyphens and slashes are each replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    Two passes are applied to every review, in this order:

    1. Lower-case the text and delete everything matched by
       ``REPLACE_NO_SPACE`` (listed punctuation and digit runs).
    2. Replace everything matched by ``REPLACE_WITH_SPACE`` (``<br /><br />``
       pairs, ``-`` and ``/``) with a single space.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]

    return reviews

# Run the same cleaning routine over both splits.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(split) for split in (reviews_train, reviews_test)
)

# Display the cleaned version of the training review at index 5.
reviews_train_clean[5]

# Baseline Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: each feature records only whether a token occurs in a review.
baseline_vect = CountVectorizer(binary=True)

# Learn the vocabulary from the training reviews and encode them in one step,
# then encode the test reviews with that same vocabulary.
X_baseline = baseline_vect.fit_transform(reviews_train_clean)
X_test_baseline = baseline_vect.transform(reviews_test_clean)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Labels are derived purely from position: the first 12,500 rows get 1, the
# remaining 12,500 get 0.
# NOTE(review): this assumes each data file holds 25,000 reviews with one class
# listed entirely before the other — confirm against the data files.
target = [1 if i<12500 else 0 for i in range(25000)]

# Hold out 25% of the training rows for validation. The fixed seed makes the
# split, and therefore the accuracies printed below, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_baseline, target, train_size = 0.75, random_state = 42
)

# Sweep the inverse regularisation strength and report validation accuracy for each value.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C = c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s : %s"%(c, accuracy_score(y_val, lr.predict(X_val))))

In [None]:
# Refit on every training row with C=0.05, then score the test matrix.
# `target` is reused as the test labels, so the test file is presumed to follow
# the same row layout as the training file — verify against the data.
final_model = LogisticRegression(C=0.05).fit(X_baseline, target)
test_predictions = final_model.predict(X_test_baseline)
print("Final Accuracy: %s" % accuracy_score(target, test_predictions))

In [None]:
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate `get_feature_names_out()`.
if hasattr(baseline_vect, "get_feature_names_out"):
    feature_names = baseline_vect.get_feature_names_out()
else:
    feature_names = baseline_vect.get_feature_names()

# Map every vocabulary entry to its learned coefficient. Converting to plain
# `str`/`float` keeps the printed tuples free of NumPy scalar wrappers.
feature_to_coef = {
    str(word): float(coef)
    for word, coef in zip(feature_names, final_model.coef_[0])
}

# The five features with the largest coefficients.
for best_positive in sorted(
    feature_to_coef.items(),
    key = lambda x: x[1],
    reverse = True)[:5]:
    print(best_positive)

In [None]:
# Order the (word, coefficient) pairs from lowest to highest coefficient and
# print the five lowest.
ascending_by_coef = sorted(feature_to_coef.items(), key=lambda item: item[1])
for best_negative in ascending_by_coef[:5]:
    print(best_negative)

# Remove Stop Words

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, used by `remove_stopwords` below.
# NOTE(review): presumably requires the NLTK `stopwords` corpus to be downloaded beforehand — confirm.
english_stopwords = stopwords.words('english')

def remove_stopwords(corpus, stop_words=None):
    """Drop stop words from every review in ``corpus``.

    Each review is split on whitespace, tokens found in the stop-word
    collection are discarded, and the remaining tokens are re-joined with
    single spaces.

    Args:
        corpus: Iterable of review strings.
        stop_words: Optional iterable of tokens to remove. When omitted, the
            module-level ``english_stopwords`` list is used.

    Returns:
        A list with one filtered string per input review, in input order.
    """
    # A set gives constant-time membership tests; checking against a list
    # rescans the whole stop-word list for every single token.
    stopword_set = set(english_stopwords if stop_words is None else stop_words)

    return [
        ' '.join(word for word in review.split() if word not in stopword_set)
        for review in corpus
    ]

no_stopwords_train = remove_stopwords(reviews_train_clean)
no_stopwords_test = remove_stopwords(reviews_test_clean)

# Binary bag-of-words over the stop-word-free text; the vocabulary is learned
# from the training reviews only and reused for the test reviews.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(no_stopwords_train)
X_test = cv.transform(no_stopwords_test)

# Fixed seed so the hold-out split, and the accuracies below, are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 42
)

# Report validation accuracy for each candidate regularisation strength.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s : %s"%(c, accuracy_score(y_val, lr.predict(X_val))))

# Stemming

In [None]:
def stemmed_text(corpus):
    """Stem every whitespace-separated token of each review.

    Uses NLTK's ``PorterStemmer``; one stemmer instance is shared across the
    whole corpus.

    Args:
        corpus: Iterable of review strings.

    Returns:
        A list of strings, one per review, with the stemmed tokens joined by
        single spaces.
    """
    from nltk.stem.porter import PorterStemmer

    porter = PorterStemmer()
    stemmed_reviews = []
    for review in corpus:
        stemmed_tokens = (porter.stem(token) for token in review.split())
        stemmed_reviews.append(' '.join(stemmed_tokens))
    return stemmed_reviews

stemmed_reviews_train = stemmed_text(reviews_train_clean)
stemmed_reviews_test = stemmed_text(reviews_test_clean)

# Binary bag-of-words over the stemmed text; the vocabulary comes from the
# training reviews only.
cv = CountVectorizer(binary = True)
X = cv.fit_transform(stemmed_reviews_train)
X_test = cv.transform(stemmed_reviews_test)

# Fixed seed so the hold-out split, and the accuracies below, are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Report validation accuracy for each candidate regularisation strength.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s : %s"%(c, accuracy_score(y_val, lr.predict(X_val))))


In [None]:
# Refit on the full training matrix with C=0.05 and score the test matrix.
final_stemmed = LogisticRegression(C=0.05)
final_stemmed.fit(X, target)
# The printed value is the test accuracy, so it is labelled as accuracy rather than as "C=".
print('Final Accuracy: %s'%accuracy_score(target, final_stemmed.predict(X_test)))

# Lemmatization

In [None]:
def lemmatized_text(corpus):
    """Lemmatize every whitespace-separated token of each review.

    Uses NLTK's ``WordNetLemmatizer``, calling ``lemmatize`` with its default
    arguments on each token.

    Args:
        corpus: Iterable of review strings.

    Returns:
        A list of strings, one per review, with the lemmatized tokens joined
        by single spaces.
    """
    from nltk.stem import WordNetLemmatizer

    wordnet = WordNetLemmatizer()

    def lemmatize_review(review):
        # Join the per-token lemmas back into one space-separated string.
        return ' '.join(map(wordnet.lemmatize, review.split()))

    return [lemmatize_review(review) for review in corpus]

lemmatized_reviews_train = lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = lemmatized_text(reviews_test_clean)

cv = CountVectorizer(binary= True)
X = cv.fit_transform(lemmatized_reviews_train)
# The test matrix must come from the lemmatized test reviews so that training
# and test features share the same text normalisation (not the stemmed ones).
X_test = cv.transform(lemmatized_reviews_test)

# The validation labels must be bound to `y_val`. Unpacking them into any other
# name leaves the loop below scoring against a stale `y_val` from an earlier,
# unrelated split. The fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 42
)

# Report validation accuracy for each candidate regularisation strength.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s : %s"%(c, accuracy_score(y_val, lr.predict(X_val))))

In [None]:
# Refit on the full training matrix with C=0.5 and score the test matrix.
final_lemmatized = LogisticRegression(C=0.5)
final_lemmatized.fit(X, target)
# The printed value is the test accuracy, so it is labelled as accuracy rather than as "C=".
print("Final Accuracy: %s" %accuracy_score(target, final_lemmatized.predict(X_test)))

# n-grams

In [None]:
# Binary presence features over unigrams and bigrams (ngram_range=(1, 2)); the
# vocabulary is learned from the training reviews only.
ngram_vect = CountVectorizer(binary = True, ngram_range = (1,2))
X = ngram_vect.fit_transform(reviews_train_clean)
X_test = ngram_vect.transform(reviews_test_clean)

# Fixed seed so the hold-out split, and the accuracies below, are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Report validation accuracy for each candidate regularisation strength.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C %s : %s" %(c, accuracy_score(y_val, lr.predict(X_val))))


In [None]:
# Refit on the full training matrix with C=0.25 and score the test matrix.
final_ngram = LogisticRegression(C=0.25)
final_ngram.fit(X, target)
# The printed value is the test accuracy, so it is labelled as accuracy rather than as "C=".
print("Final Accuracy: %s" %accuracy_score(target, final_ngram.predict(X_test)))

# Word Counts

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# binary=False keeps the actual per-review token counts instead of 0/1 presence flags.
wc_vect = CountVectorizer(binary = False)
X = wc_vect.fit_transform(reviews_train_clean)
X_test = wc_vect.transform(reviews_test_clean)

# Fixed seed so the hold-out split, and the accuracies below, are reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 42
)

# Report validation accuracy for each candidate regularisation strength.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C = c)
    lr.fit(X_train, y_train)

    print("Accuracy for C = %s : %s"%(c, accuracy_score(y_val, lr.predict(X_val))))

In [None]:
# Refit on the full training matrix with C=0.05 and score the test matrix.
final_wc = LogisticRegression(C=0.05)
final_wc.fit(X, target)
# The printed value is the test accuracy, so it is labelled as accuracy rather than as "C =".
print("Final Accuracy: %s"%accuracy_score(target, final_wc.predict(X_test)))