In [1]:
#Loading it up
reviews_train = []
reviews_test = []

for line in open('aclImdb/movie_data/full_train.txt'):
    reviews_train.append(line.strip().lower())
    
for line in open('aclImdb/movie_data/full_test.txt'):
    reviews_test.append(line.strip().lower())
    

In [2]:
engStopWords = ['a', 'the', 'in', 'of', 'at', 'is']
def removeStopWords(reviewList):
    cleanReviews = []
    for review in reviewList:
        cleanReviews.append(' '.join(word for word in review.split() if word not in engStopWords))
    
    return cleanReviews

reviews_train = removeStopWords(reviews_train)
reviews_test = removeStopWords(reviews_test)


In [3]:
# Stemming -> https://www.nltk.org/howto/stem.html
from nltk.stem import SnowballStemmer

def stemWords(reviewList):
    stemReviews = []
    for review in reviewList:
        stemReviews.append(' '.join(SnowballStemmer("porter").stem(word) for word in review.split()))
        # "porter" gives more accuracy than "english"
    
    return stemReviews

reviews_train = stemWords(reviews_train)
reviews_test = stemWords(reviews_test)

In [4]:
# Ngram Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def getVectorizer(type):
    if type == 'cv':
        return CountVectorizer(binary=True, ngram_range=(1,2))
    elif type == 'tfidf':
        return TfidfVectorizer(ngram_range=(1,2))
    elif type == 'cv-grp':
        return CountVectorizer(binary=False, ngram_range=(1,2))

vec = getVectorizer('cv')
vec.fit(reviews_train)

X = vec.transform(reviews_train)
X_test = vec.transform(reviews_test)


In [7]:
# LOGISTIC REGRESSION - Finding optimal value for c
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

target = [1 if i<12500 else 0 for i in range(25000)]

X_train, X_val, Y_train, Y_val = train_test_split(X, target, train_size=0.75)

for c in [0.01, 0.05, 0.1, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, Y_train)

    print("Accuracy score for c = %s is %s"%(c, accuracy_score(Y_val, lr.predict(X_val))))


Accuracy score for c = 0.01 is 0.87376
Accuracy score for c = 0.05 is 0.87952
Accuracy score for c = 0.1 is 0.88256
Accuracy score for c = 0.5 is 0.88224
Accuracy score for c = 1 is 0.88192


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
# Moving on with C = 0.5

lr = LogisticRegression(C=0.5)
lr.fit(X, target)

print("Final accuracy score for c = %s is %s"%(0.5, accuracy_score(target, lr.predict(X_test))))



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Final accuracy score for c = 0.5 is 0.89924


In [9]:
# SVM with linear kernel
from sklearn.svm import LinearSVC

for c in [0.01, 0.05, 0.1, 0.5, 1]:
    svc = LinearSVC(C=c)
    svc.fit(X_train, Y_train)

    print("Accuracy score for c = %s is %s"%(c, accuracy_score(Y_val, svc.predict(X_val))))

Accuracy score for c = 0.01 is 0.8824
Accuracy score for c = 0.05 is 0.87712
Accuracy score for c = 0.1 is 0.87584
Accuracy score for c = 0.5 is 0.87488
Accuracy score for c = 1 is 0.87424




In [10]:
# Moving on with C = 0.01

svc = LinearSVC(C=0.01)
svc.fit(X, target)

print("Final accuracy score for c = %s is %s"%(0.5, accuracy_score(target, svc.predict(X_test))))

Final accuracy score for c = 0.5 is 0.89876


In [11]:
import pickle

pickle.dump(lr, open('lrOpti.pkl', 'wb'))
pickle.dump(svc, open('svc.pkl', 'wb'))
