In [1]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
reviews_train = []
for line in open('../IMDB/movie_data/full_train.txt', 'r', encoding='utf-8'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('../IMDB/movie_data/full_test.txt', 'r', encoding='utf-8'):
    
    reviews_test.append(line.strip())
    
target = [1 if i < 12500 else 0 for i in range(25000)]

In [4]:
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [5]:
baseline_vectorizer = CountVectorizer(binary=True)
baseline_vectorizer.fit(reviews_train_clean)
X_baseline = baseline_vectorizer.transform(reviews_train_clean)
X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X_baseline, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.87312
Accuracy for C=0.05: 0.88432
Accuracy for C=0.25: 0.8832
Accuracy for C=0.5: 0.87936
Accuracy for C=1: 0.87872


In [6]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X_baseline, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_model.predict(X_test_baseline)))



Final Accuracy: 0.88168


In [10]:
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')

def remove_stop_words(corpus):
    remove_stop_words = []
    
    for review in corpus:
        remove_stop_words.append(' '.join([word for word in review.split() if word not in english_stop_words]))
        
    return remove_stop_words


no_stop_words_train = remove_stop_words(reviews_train_clean)
no_stop_words_test = remove_stop_words(reviews_test_clean)

In [11]:
cv = CountVectorizer(binary=True)
cv.fit(no_stop_words_train)
X = cv.transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.8752
Accuracy for C=0.05: 0.88432
Accuracy for C=0.25: 0.88064
Accuracy for C=0.5: 0.87952
Accuracy for C=1: 0.87664


In [15]:
from nltk.stem.porter import PorterStemmer
stemmmer = PorterStemmer()

def get_stemmed_text(corpus):
    return [" ".join([stemmmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews_train = get_stemmed_text(reviews_train_clean)
stemmed_reviews_test = get_stemmed_text(reviews_test_clean)

In [16]:
cv = CountVectorizer(binary=True)
cv.fit(stemmed_reviews_train)
X = cv.transform(stemmed_reviews_train)
X_test = cv.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_stemmed = LogisticRegression(C=0.05)
final_stemmed.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_stemmed.predict(X_test)))



Accuracy for C=0.01: 0.87552
Accuracy for C=0.05: 0.88336
Accuracy for C=0.25: 0.88272
Accuracy for C=0.5: 0.88048
Accuracy for C=1: 0.8784
Final Accuracy: 0.87748


In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_lemmatized_text(corpus):
    return [" ".join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(lemmatized_reviews_train)
X = cv.transform(lemmatized_reviews_train)
X_test = cv.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_lemmatized = LogisticRegression(C=0.25)
final_lemmatized.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_lemmatized.predict(X_test)))




Accuracy for C=0.01: 0.87104
Accuracy for C=0.05: 0.87984
Accuracy for C=0.25: 0.87744
Accuracy for C=0.5: 0.8784
Accuracy for C=1: 0.87584
Final Accuracy: 0.87444


In [20]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_test)))



Accuracy for C=0.01: 0.8864
Accuracy for C=0.05: 0.89328
Accuracy for C=0.25: 0.892
Accuracy for C=0.5: 0.89232
Accuracy for C=1: 0.8944
Final Accuracy: 0.898


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, 
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

final_wc = LogisticRegression(C=0.05)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_test)))



Accuracy for C=0.01: 0.88656
Accuracy for C=0.05: 0.89248
Accuracy for C=0.25: 0.88896
Accuracy for C=0.5: 0.88432
Accuracy for C=1: 0.88336
Final Accuracy: 0.88184


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.79984
Accuracy for C=0.05: 0.8264
Accuracy for C=0.25: 0.86256
Accuracy for C=0.5: 0.8736
Accuracy for C=1: 0.87984


In [23]:
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_test)))



Final Accuracy: 0.882


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm_ngram.predict(X_test)))




Accuracy for C=0.01: 0.88976
Accuracy for C=0.05: 0.8872
Accuracy for C=0.25: 0.88704




Accuracy for C=0.5: 0.88656
Accuracy for C=1: 0.8864
Final Accuracy: 0.8974


In [26]:
feature_to_coef = {word: coef for word, coef in zip(ngram_vectorizer.get_feature_names(), final_svm_ngram.coef_[0])}

for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:30]:
    print (best_positive)
    
print("\n\n")
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:30]:
    print (best_negative)

('excellent', 0.25448822493888634)
('perfect', 0.20715365799447802)
('great', 0.18700461806156607)
('enjoyable', 0.17885078194029524)
('wonderful', 0.16928992353739863)
('amazing', 0.16911807256008582)
('superb', 0.16700003571007702)
('today', 0.1458695172764692)
('better than', 0.14354979032580478)
('must see', 0.1414640141646173)
('incredible', 0.1395277934274742)
('fun', 0.13749632667356254)
('well worth', 0.13531682081457716)
('enjoyed', 0.132786800230463)
('refreshing', 0.13275543057137834)
('wonderfully', 0.13240511041748634)
('brilliant', 0.13163256667883855)
('rare', 0.13022868116571132)
('definitely worth', 0.12742810927291803)
('fantastic', 0.12658450222993936)
('loved', 0.1256983093820601)
('moving', 0.12178224335150316)
('the best', 0.12150540358367938)
('perfectly', 0.12142605621297009)
('gem', 0.12123182096910039)
('very good', 0.12095555317697015)
('liked', 0.11696299664422549)
('bit', 0.1164509270011851)
('loved this', 0.11562264053331012)
('to all', 0.11494468020372049

https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a