In [1]:
import pandas as pd
import numpy as np
from lxml import etree
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [9]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
#     review_text = BeautifulSoup(raw_review).get_text() 
    review_text = ''.join(etree.HTML(raw_review).xpath('//text()'))
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(nltk.corpus.stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [4]:
train = pd.read_csv('NLP/labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

# clean_train_reviews = train.review.map(review_to_words)

In [6]:
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer="word",
                             # tokenizer=None,
                             # preprocessor=None,
                             # stop_words=None,
                             stop_words=nltk.corpus.stopwords.words("english"),
                             max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
# train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = vectorizer.fit_transform(train.review)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()


Creating the bag of words...



In [14]:
vectorizer_tmp = vectorizer
train_data_features_tmp = train_data_features

In [20]:
vectorizer_tmp == vectorizer

False

In [7]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print(vocab[:3])

['00', '000', '10']


In [43]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features_tmp, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab[:5], dist[:5]):
    print(count, tag)

187 abandoned
125 abc
108 abilities
454 ability
1259 able


# Random Forest




In [8]:
# train random forest tree
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, train['sentiment'])


In [40]:
test = pd.read_csv('testData.tsv', quoting=3, delimiter='\t')
clean_test_reviews = test.review.map(review_to_words)
test_data_feature = vectorizer.transform(clean_test_reviews).toarray()
#use random forest
result = forest.predict(test_data_feature)
# output = pd.DataFrame({'id': test.id, 'sentiment': result})
# output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)

ValueError: Lengths must match to compare

In [46]:
# # 个人尝试优化  使用全部train data，5折交叉验证
# val_data_feature = vectorizer.transform(clean_val_reviews).toarray()
# #use random forest
# result = forest.predict(val_data_feature)
# val_acc = (result == val_set.sentiment).mean()

# forest.score(val_data_feature, val_set['sentiment'])

parameters = {
    'n_estimators': (100, 500, 1000),
    'max_depth': (None, 24, 16),
    'min_samples_split': (2, 4, 8),
    'min_samples_leaf': (16, 4, 12)
}

clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5, n_jobs=8)
clf.fit(val_data_feature, val_set.sen)
clf.best_score_, clf.best_params_

aGrid = aML_GS.GridSearchCV(
    aClassifierOBJECT,
    param_grid=aGrid_of_parameters,
    cv=cv, n_jobs=n_JobsOnMultiCpuCores,
    verbose=5
)


(0.84379999999999999,
 {'max_depth': None,
  'min_samples_leaf': 4,
  'min_samples_split': 2,
  'n_estimators': 500})

In [58]:
# 个人尝试优化
val_set = train[20000:]
train_set = train[:20000]
clean_val_reviews = clean_train_reviews[20000:]
clean_train_set_reviews = clean_train_reviews[:20000]

In [59]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

train_set_data_features = vectorizer.fit_transform(clean_train_set_reviews)
train_set_data_features = train_set_data_features.toarray()

Creating the bag of words...



In [61]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_set_data_features, train_set['sentiment'])
forest.score(val_data_feature, val_set.sentiment)

0.84299999999999997

In [63]:
forest = RandomForestClassifier(max_depth= None, min_samples_leaf=4, min_samples_split=2, n_estimators=500)
forest = forest.fit(train_set_data_features, train_set['sentiment'])
forest.score(val_data_feature, val_set.sentiment)

0.84519999999999995

In [None]:
parameters = {
    'n_estimators': (100, 500, 1000),
    'max_depth': (None, 24, 16),
    'min_samples_split': (2, 4, 8),
    'min_samples_leaf': (16, 4, 12)
}

clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5, n_jobs=8)
clf.fit(val_data_feature, val_set.sen)
clf.best_score_, clf.best_params_

# The Rotten Tomatoes movie review dataset

In [2]:
train = pd.read_csv('train.tsv', header=0, delimiter='\t', quoting=0)

In [None]:
clean_train_reviews = train.review.map(review_to_words)

In [3]:
vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
                             stop_words = nltk.corpus.stopwords.words("english"),   \
                             max_features = 5000) 
train_data_features = vectorizer.fit_transform(train.Phrase)
train_data_features = train_data_features.toarray()

In [9]:
parameters = {
     'n_estimators':(100, 200, 400),
     'max_depth':(None, 24, 16),
     'min_samples_split': (2, 4, 8),
     'min_samples_leaf': (16, 4, 12)
}

clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5, n_jobs=8)
clf.fit(train_data_features, train.Sentiment)
clf.best_score_, clf.best_params_

AttributeError: 'DataFrame' object has no attribute 'Sentiment'

In [None]:
test = pd.read_csv('test.tsv', header=0, delimiter='\t', quoting=0)
test_data_features = vectorizer.transform(test)

In [8]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_data_features, train.Sentiment)

MemoryError: 

In [53]:
counts = np.sum(train_data_features, axis=0)

In [59]:
frq = pd.Series(index=counts, data=vectorizer.get_feature_names())

In [14]:
train.loc[np.random.choice(train.index, 5)]

Unnamed: 0,id,sentiment,review
11043,"""9930_8""",1,"""I'm totally surprised by some of the comments..."
11072,"""6359_10""",1,"""If you went to this movie to see some huge ac..."
8256,"""11816_8""",1,"""That was great fun! I never read those Cheste..."
22462,"""7407_10""",1,"""This movie took me by complete surprise. I wa..."
3113,"""2190_2""",0,"""**Possible Spoilers Ahead**<br /><br /> J..."
