In [1]:
import re
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords # Import the stop word list

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

### Import Data

In [2]:
# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so quote characters in the files are read as ordinary text.
train = pd.read_csv(
    'labeledTrainData.tsv',
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Number of reviews in each split.
num_train_reviews = train['review'].shape[0]
num_test_reviews = test['review'].shape[0]

In [3]:
# Distinct sentiment labels and how many training rows carry each one.
np.unique(train["sentiment"].values, return_counts=True)

(array([0, 1], dtype=int64), array([12500, 12500], dtype=int64))

### Word Preprocessing

In [4]:
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile("[^a-zA-Z]")

# Cache of stop-word sets keyed by language, filled lazily on first use.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language):
    """Return the NLTK stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def review_to_words(data):
    """Convert one raw review into a cleaned, space-separated string of words.

    Processing steps, in order:
      1. Extract the text content with BeautifulSoup (drops markup).
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop words found in NLTK's English stop-word list.

    Parameters
    ----------
    data : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none remain).
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can vary by machine.
    soup = BeautifulSoup(data, "html.parser")
    letters_only = _NON_LETTERS.sub(" ", soup.get_text())
    words = letters_only.lower().split()
    # Set membership is O(1); the set itself is built once and reused across calls
    # instead of re-reading the stop-word corpus for every review.
    stops = _stopword_set("english")
    return " ".join(w for w in words if w not in stops)

### Feature Construction

In [5]:
# Clean every training review once; the result is reused by the vectorizers below.
clean_train_reviews = [
    review_to_words(train["review"][row])
    for row in range(num_train_reviews)
]



In [6]:
# Word-level bag of words limited to 5000 features. analyzer="word" and the
# None values for tokenizer / preprocessor / stop_words are CountVectorizer's
# defaults, so only max_features needs to be passed.
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary from the cleaned training reviews and build the count matrix.
processed_features = vectorizer.fit_transform(clean_train_reviews)

### Model Training

In [7]:
# Random forest with 1000 trees, fitted on the bag-of-words counts.
# fit() returns the estimator itself, so construction and training can be chained.
forest = RandomForestClassifier(n_estimators=1000).fit(
    processed_features,
    train["sentiment"],
)

### Inference

In [36]:
def get_feature_vector(reviews, vectorizer):
    """Clean raw reviews and turn them into a dense feature matrix.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts (for example a list or a pandas Series).
    vectorizer : fitted vectorizer
        Object exposing ``transform``; it must already have been fitted, because
        this function only calls ``transform`` on it and never ``fit``.

    Returns
    -------
    array
        ``vectorizer.transform(...)`` of the cleaned reviews converted with
        ``.toarray()``, one row per review in input order.
    """
    # Iterate over the values directly rather than looking up reviews[i]:
    # on a pandas Series, reviews[i] is a label lookup and raises KeyError (or
    # returns the wrong row) when the index is not exactly 0..n-1.
    clean_reviews = [review_to_words(review) for review in reviews]

    features = vectorizer.transform(clean_reviews)
    return features.toarray()

In [9]:
# Vectorize the test reviews with the bag-of-words vocabulary and predict.
test_data_features = get_feature_vector(test["review"], vectorizer)
result = forest.predict(test_data_features)

# One row per test review: its id and the predicted sentiment.
output = pd.DataFrame({"id": test['id'], "sentiment": result})
output.to_csv(
    "results/Bag_of_Words_model.csv",
    index=False,
    quoting=3,  # csv.QUOTE_NONE
)



## TF-IDF

In [10]:
# TF-IDF weighting over at most 2500 features, with NLTK's English stop words removed.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_features=2500,
)

# Fit on the cleaned training reviews, then convert the result to a dense array.
tfidf_features = tfidf_vectorizer.fit_transform(clean_train_reviews)
tfidf_features = tfidf_features.toarray()

### Model Training

In [11]:
# Random forest with 1000 trees, fitted on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
tfidf_forest = RandomForestClassifier(n_estimators=1000).fit(
    tfidf_features,
    train["sentiment"],
)

### Inference

In [12]:
# Vectorize the test reviews with the TF-IDF vocabulary learned on the training set.
tfidf_test_features = get_feature_vector(test["review"], tfidf_vectorizer)

# Predict with the forest trained on the TF-IDF features. The bag-of-words
# `forest` was fitted on a different matrix (5000 count features versus the
# 2500 TF-IDF features here), so it must not be used for this prediction.
tfidf_result = tfidf_forest.predict(tfidf_test_features)

# One row per test review: its id and the predicted sentiment.
tfidf_output = pd.DataFrame(data={"id": test['id'],
                                  "sentiment": tfidf_result})
# NOTE(review): unlike the other exports in this notebook, this file is written
# to the working directory rather than "results/" - confirm that is intended.
tfidf_output.to_csv("Bag_of_Words_model_tf_idf.csv", index=False, quoting=3)



## Bigrams

In [33]:
# Count vectorizer over bigrams only (ngram_range=(2, 2)), capped at 5000 features.
bigram_vectorizer = CountVectorizer(
    max_features=5000,
    ngram_range=(2, 2),
)
bigram_features = bigram_vectorizer.fit_transform(clean_train_reviews)

In [34]:
# Random forest with 100 trees, fitted on the bigram counts.
# fit() returns the estimator itself, so construction and training can be chained.
bigram_forest = RandomForestClassifier(n_estimators=100).fit(
    bigram_features,
    train["sentiment"],
)

In [38]:
# Vectorize the test reviews with the bigram vocabulary and predict.
bigram_test_features = get_feature_vector(test["review"], bigram_vectorizer)
bigram_result = bigram_forest.predict(bigram_test_features)

# One row per test review: its id and the predicted sentiment.
bigram_output = pd.DataFrame(
    {
        "id": test['id'],
        "sentiment": bigram_result,
    }
)
bigram_output.to_csv(
    "results/Bag_of_Words_model_bigram.csv",
    index=False,
    quoting=3,  # csv.QUOTE_NONE
)

### Bigram TF-IDF

In [42]:
# TF-IDF vectorizer over bigrams only (ngram_range=(2, 2)), capped at 5000 features.
bigram_tf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(2, 2),
)
bigram_tf_features = bigram_tf_vectorizer.fit_transform(clean_train_reviews)

In [43]:
# Random forest with 100 trees for the bigram TF-IDF representation.
bigram_tf_forest = RandomForestClassifier(n_estimators=100)
# Train on the TF-IDF bigram matrix produced by bigram_tf_vectorizer. Fitting on
# the raw-count `bigram_features` would train this model on a different
# representation from the one it is later asked to score.
bigram_tf_forest = bigram_tf_forest.fit(bigram_tf_features, train["sentiment"])

In [44]:
# Vectorize the test reviews with the bigram TF-IDF vocabulary and predict.
bigram_tf_test_features = get_feature_vector(test["review"], bigram_tf_vectorizer)
bigram_tf_result = bigram_tf_forest.predict(bigram_tf_test_features)

# One row per test review: its id and the predicted sentiment.
bigram_tf_output = pd.DataFrame(
    {
        "id": test['id'],
        "sentiment": bigram_tf_result,
    }
)
bigram_tf_output.to_csv(
    "results/Bag_of_Words_model_bigram_tfidf.csv",
    index=False,
    quoting=3,  # csv.QUOTE_NONE
)

