In [1]:
#import libraries
import nltk
import pickle
import re
import sys
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords # Import the stop word list
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import json

In [20]:
def load_data_from_file(filename):
    with open(filename) as infile:
        features_pos = []
        features_neg = []
        features = []
        labels = []
        num_data = 0
        num_pos_data = 0
        num_neg_data = 0
        data_size = 50000
        half_size = data_size / 2
        for line in infile:

            review = json.loads(line)
            if int(review['stars']) > 3:    # Positive
                if num_pos_data < half_size:
                    labels.append('1')
                    features.append(review['text'])
                    num_pos_data += 1
                    num_data += 1

            if int(review['stars']) < 3:    # Negative
                if num_neg_data < half_size:
                    labels.append('-1')
                    features.append(review['text'])
                    num_neg_data += 1
                    num_data += 1

            if num_pos_data + num_neg_data == data_size:
                # features = features_pos[0:1000] + features_neg[0:1000]
                print (len(features), len(labels), num_pos_data, num_neg_data)
                return features, labels

    return features, labels


In [3]:
def review_to_words( raw_review ):
   
    # 1. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             

    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
 
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   

    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words )) 


In [4]:
def buildModelAndTest(vectorizer, forest, X_train, y_train, X_test, y_test):
    #preprocess the training reviews
    X_train = [review_to_words(review) for review in X_train]
    
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    X_train = vectorizer.fit_transform(X_train).toarray()
    
    X_test = [review_to_words(review) for review in X_test]
    X_test = vectorizer.transform(X_test).toarray()
    forest = forest.fit( X_train, y_train )

    y_pred = forest.predict(X_test)
    print(classification_report(y_test,y_pred, target_names=['-1','1']))


In [21]:
X, y = load_data_from_file("yelp_academic_dataset_review.json")
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state=42)

50000 50000 25000 25000


# EXPERIMENTS

Experiment1 : Countvectoriser with maximum features = 5000 and number of trees = 100

In [26]:
countvectorizer1 = CountVectorizer(analyzer = "word", max_features = 5000) 

In [27]:
# Initialize a Random Forest classifier with 100 trees
forest1 = RandomForestClassifier(n_estimators = 100) 

In [28]:
buildModelAndTest(countvectorizer1,forest1, X_train, y_train, X_test, y_test)

             precision    recall  f1-score   support

         -1       0.88      0.89      0.89      7421
          1       0.90      0.89      0.89      7579

avg / total       0.89      0.89      0.89     15000



Experiment2 : Countvectoriser with maximum features = 2000 and number of trees = 50

In [33]:
countvectorizer2 = CountVectorizer(analyzer = "word", max_features = 2000) 
forest2 = RandomForestClassifier(n_estimators = 50) 
buildModelAndTest(countvectorizer2,forest2, X_train, y_train, X_test, y_test)

             precision    recall  f1-score   support

         -1       0.88      0.89      0.88      7421
          1       0.89      0.88      0.88      7579

avg / total       0.88      0.88      0.88     15000



Experiment3 : Countvectoriser with maximum features = 5000, number of trees =100 and using bigrams and trigrams

In [36]:
countvectorizer3 = CountVectorizer(analyzer = "word", max_features = 5000, ngram_range = (1,3)) 
forest3 = RandomForestClassifier(n_estimators = 100) 
buildModelAndTest(countvectorizer3,forest3, X_train, y_train, X_test, y_test)


             precision    recall  f1-score   support

         -1       0.89      0.90      0.89      7421
          1       0.90      0.89      0.89      7579

avg / total       0.89      0.89      0.89     15000



Experiment4 : TFIDF vectorizer with number of trees = 100

In [38]:
tfidfvectorizer1 = TfidfVectorizer(min_df=5, sublinear_tf=True, use_idf=True)
forest4 = RandomForestClassifier(n_estimators = 100) 
buildModelAndTest(tfidfvectorizer1,forest4, X_train, y_train, X_test, y_test)


             precision    recall  f1-score   support

         -1       0.88      0.90      0.89      7421
          1       0.90      0.88      0.89      7579

avg / total       0.89      0.89      0.89     15000



Experiment5 : TFIDF vectorizer with number of trees = 50

In [24]:
tfidfvectorizer2 = TfidfVectorizer(min_df=5, sublinear_tf=True, use_idf=True)
forest5 = RandomForestClassifier(n_estimators = 50) 
buildModelAndTest(tfidfvectorizer2,forest5, X_train, y_train, X_test, y_test)

             precision    recall  f1-score   support

         -1       0.87      0.90      0.88      7421
          1       0.90      0.87      0.88      7579

avg / total       0.88      0.88      0.88     15000



Experiment6 : TFIDF vectorizer with number of trees = 50 and maximum depth of each tree = 5

In [27]:
tfidfvectorizer2 = TfidfVectorizer(min_df=5, sublinear_tf=True, use_idf=True)

forest6 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=True)
buildModelAndTest(tfidfvectorizer2,forest6, X_train, y_train, X_test, y_test)

             precision    recall  f1-score   support

         -1       0.87      0.79      0.83      7421
          1       0.81      0.88      0.84      7579

avg / total       0.84      0.84      0.84     15000



Experiment7 : TFIDF vectorizer with number of trees = 100 and maximum depth of each tree = 15

In [34]:

forest7 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, 
            min_samples_split=None, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
buildModelAndTest(tfidfvectorizer2,forest6, X_train, y_train, X_test, y_test)

  warn("Warm-start fitting without increasing n_estimators does not "


             precision    recall  f1-score   support

         -1       0.87      0.79      0.83      7421
          1       0.81      0.88      0.84      7579

avg / total       0.84      0.84      0.84     15000

