Initialize required lists, stemmers, stop words and punctuation removers.

In [2]:
import nltk
nltk.download('stopwords')
from nltk.stem import *
import string

# English Snowball stemmer applied to every token of the input comments.
ps = SnowballStemmer("english")
# NLTK's predefined English stopword list, held as a set for fast membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Translation table for str.translate() that deletes every punctuation character.
punct_remove = str.maketrans('', '', string.punctuation)

# Empty containers that later cells fill with documents and labels.
x_train, y_train = [], []
x_test, y_test = [], []
pos_list = []

x_real_test = []

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Read in training and test set.

In [0]:
import os

# Number of unlabeled test documents, stored as ./test/0.txt ... ./test/24999.txt.
N_TEST_FILES = 25000

# Open Negative Training Data and append entries to list (label 0).
# sorted() makes the corpus order reproducible; os.listdir() order is arbitrary.
for filename in sorted(os.listdir('./train/neg')):
    # The context manager closes the file even if read() raises.
    with open(os.path.join('./train/neg', filename), encoding="utf8") as file:
        x_train.append(file.read())
    y_train.append(0)

# Open Positive Training Data and append entries to list (label 1).
for filename in sorted(os.listdir('./train/pos')):
    with open(os.path.join('./train/pos', filename), encoding="utf8") as file:
        x_train.append(file.read())
    y_train.append(1)

# Read the unlabeled test documents in index order.
for i in range(N_TEST_FILES):
    with open("./test/%d.txt" % (i), encoding="utf8") as file:
        x_real_test.append(file.read())


Function to process text before pipeline.



In [0]:
from nltk.tokenize import sent_tokenize as st, word_tokenize as wt
import re

def stem_words(data, linebreak=False, notcontract=True,
              havecontract=True, punctuation=True):
    '''
    Process text before pipeline.

    Each document is cleaned, tokenized, filtered and stemmed, then re-joined
    into a single space-separated string.

    Parameters:
        data: iterable of raw document strings.
        linebreak: if True, replace every "<...>" tag with a space.
        notcontract: if True, rewrite "n't" as " not".
        havecontract: if True, rewrite "'ve" as " have".
        punctuation: if True, delete punctuation and lowercase the text.

    Returns:
        A list with one string per input document, made of the stems of the
        kept tokens joined by single spaces.

    NOTE(review): lowercasing only happens when punctuation=True, while the
    'not' and stop-word checks compare against lowercase words -- confirm
    this coupling is intended before calling with punctuation=False.
    '''

    def feature_tokens(tokens):
        '''Return stems of the tokens, dropping stop words and "not x" pairs.'''
        stemtokens = list()
        token_iter = iter(tokens)
        for token in token_iter:
            if token == 'not':
                # Drop the negated word too. The previous index loop did
                # `i += 1` inside `for i in range(...)`, which has no effect,
                # so the word following "not" was never skipped.
                next(token_iter, None)
                continue
            if token not in stop_words:
                stemmed = ps.stem(token)
                # Discard very short stems (two characters or fewer).
                if len(stemmed) > 2:
                    stemtokens.append(stemmed)
        return stemtokens

    def processText(text, linebreak=False, notcontract=True,
                    havecontract=True, punctuation=True):
        '''Apply the enabled clean-up steps to one document.'''
        # "[^>]*" keeps each match inside a single tag. The earlier greedy
        # "<.*>" swallowed all text between the first "<" and the last ">"
        # on a line.
        if linebreak: text = re.sub("<[^>]*>", ' ', text)
        if notcontract: text = re.sub("n't", ' not', text)
        if havecontract: text = re.sub("'ve", ' have', text)
        if punctuation: text = text.translate(punct_remove).lower()
        return text

    # One processed string per input document.
    new_train = list()
    for rawtext in data:
        # Clean the raw text according to the enabled options.
        text = processText(rawtext, linebreak, notcontract, havecontract,
                          punctuation)

        # Tokenize, then keep the stems of all non-stop-word tokens.
        tokens = wt(text)
        stemtokens = feature_tokens(tokens)
        new_train.append(' '.join(stemtokens))

    return new_train

Run pre-processing pipeline - model and generate predictions. Print metrics of performance on held-out validation set.

In [0]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import time

# Baseline run: stem the corpus, hold out 20% for validation, fit and report.
prestart = time.time()
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact. Overwriting y_train makes every later split of the full
# corpus fail with mismatched lengths. A fixed random_state keeps the
# held-out set identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)


pclf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LogisticRegression()),
])
preend = time.time()

fitstart = time.time()
pclf.fit(X_train, y_train_split)
fitend = time.time()

predstart = time.time()
y_pred = pclf.predict(X_test)
predend = time.time()

# Not read in this cell; kept in case a later cell reports total time.
end = time.time()

print("Pre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
    preend-prestart, fitend-fitstart, predend-predstart))
# Report once; the original printed the identical report twice.
print(metrics.classification_report(y_test, y_pred))

# Logistic Regression Tests

In this test, we use a count vectorizer, tf-idf, and L2 normalization. We also expand "n't" and "'ve" contractions (to " not" and " have") and remove punctuation, but keep line breaks, before stemming the words. Note that we additionally scan for and disregard terms (bi-grams) of the form "not x".

**We vary our values for C in our Logistic Regression**

In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Sweep the inverse regularisation strength C of the logistic regression.
for i in [0.01, 0.05, 0.25, 0.5, 0.6, 0.75, 1]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=i)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("C={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**What happens if we turn off the IDF?**

In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Same C sweep as before, with IDF weighting switched off (term frequency only).
for i in [0.01, 0.05, 0.25, 0.5, 0.6, 0.75, 1]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=i)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("C={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**What about if we use L1 Normalization when computing the TF-IDF vectors?**

In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Same C sweep, with L1-normalised TF-IDF vectors.
for i in [0.01, 0.05, 0.25, 0.5, 0.6, 0.75, 1]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer(norm='l1')),
        # The final normaliser must also be L1. The default Normalizer() is
        # L2 and rescales each L1-normalised row back to unit L2 length,
        # which makes this experiment identical to the L2 baseline.
        ('norm', Normalizer(norm='l1')),
        ('clf', LogisticRegression(C=i)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("C={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**Let's now try to vary the minimum document frequency required for terms to be considered in our Count Vectorizer**





In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Sweep min_df: a float is the minimum fraction of documents a term must
# appear in to be kept in the vocabulary.
for i in [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        ('vect', CountVectorizer(min_df=i)),
        ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=1)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("Minimum DF={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**We now vary the upper bound on the n-grams analyzed in the Count Vectorizer**

In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Sweep the largest n-gram size; all sizes from 1 up to i are extracted.
for i in [1, 2, 3]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        # ngram_range is a keyword argument taking a (min_n, max_n) tuple.
        # The original `ngram_range(1,i)` called an undefined name.
        ('vect', CountVectorizer(ngram_range=(1, i))),
        ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=1)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("N-Gram Upperbound={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**And we enforce a strict n-gram count for the Count Vectorizer:**

In [0]:
# x_train holds the raw documents read from disk; `raw_x_train` was never defined.
new_train = stem_words(x_train)

# Store the split labels under a new name so the full label list in y_train
# stays intact for other cells. A fixed random_state keeps the held-out set
# identical across runs.
X_train, X_test, y_train_split, y_test = train_test_split(
    new_train, y_train, train_size=0.8, test_size=0.2, random_state=42)

# Use n-grams of exactly size i (unigrams only, bigrams only, trigrams only).
for i in [1, 2, 3]:

    # Only pipeline construction is timed here; stemming happened once above.
    prestart = time.time()
    pclf = Pipeline([
        # ngram_range is a keyword argument taking a (min_n, max_n) tuple.
        # The original `ngram_range(i,i)` called an undefined name.
        ('vect', CountVectorizer(ngram_range=(i, i))),
        ('tfidf', TfidfTransformer()),
        ('norm', Normalizer()),
        ('clf', LogisticRegression(C=1)),
    ])
    preend = time.time()

    fitstart = time.time()
    pclf.fit(X_train, y_train_split)
    fitend = time.time()

    predstart = time.time()
    y_pred = pclf.predict(X_test)
    predend = time.time()

    # Not read in this cell; kept in case a later cell reports total time.
    end = time.time()

    print("N-Gram Strict Count={}:\nPre-Process Time: {}\nTraining Time: {}\nPrediction Time: {}".format(
        i, preend-prestart, fitend-fitstart, predend-predstart))
    # Report once per setting; the original printed the identical report twice.
    print(metrics.classification_report(y_test, y_pred))

**Let's go back and try our first configuration, but with removing Line Breaks:**