In [1]:
import pandas as pd
import re
import nltk
import numpy as np

In [2]:
# Using all pickles of the users
non_elite_reviews_pkl = pd.read_pickle('non_elite_reviews.pkl')
real_elite_reviews_pkl= pd.read_pickle('real_elite_reviews.pkl')
slack_elite_reviews_pkl=pd.read_pickle('slack_elite_reviews.pkl')
potential_elite_reviews_pkl=pd.read_pickle('potential_elite_reviews.pkl')

In [3]:
# Check first 10 rows of the potential users
potential_elite_reviews_pkl.head(1)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id,is_elite,by_future_elite,category
31,4P-vTvE6cncJyUyLh73pxw,0,2014-03-29,0,6-hKBi-6RC3g7Mft0c-6qw,4,This place is a area staple! Been around for y...,review,0,PmgqNO0-5Y3e3UoR61TD7w,False,True,Potential-elite


In [3]:
# create a sample dataset picking to 100 rows of all the clusters
sample_data = potential_elite_reviews_pkl.head(100)
sample_data = sample_data.append(real_elite_reviews_pkl.head(100))
sample_data = sample_data.append(slack_elite_reviews_pkl.head(100))
sample_data = sample_data.append(non_elite_reviews_pkl.head(100))

sample_data.shape

(400, 13)

In [4]:
# Takng only the review text column from the dataset
sample_text_subset=sample_data['text']
sample_text_subset.head(3)

31    This place is a area staple! Been around for y...
34    I believe in awarding stars bearing in mind th...
38    Okay bar food. Nice bar setting but food is av...
Name: text, dtype: object

In [23]:
# Use regular expressions to do a find-and-replace and change the text to lowercase
def filter_alphabets(input_string):
    return re.sub("[^a-zA-Z]", " ", input_string).lower()

result_sample=sample_text_subset.apply(filter_alphabets)
result_sample.head(2)

31    this place is a area staple  been around for y...
34    i believe in awarding stars bearing in mind th...
Name: text, dtype: object

In [6]:
# split the text into words
def split_text(input_text):
    return input_text.split()

In [24]:
result_sample_1 = result_sample.apply(split_text)

In [25]:
result_sample_1.head()

31    [this, place, is, a, area, staple, been, aroun...
34    [i, believe, in, awarding, stars, bearing, in,...
38    [okay, bar, food, nice, bar, setting, but, foo...
39    [was, boozin, and, cruisin, with, my, canine, ...
59    [i, love, the, decor, and, vibe, of, khotan, a...
Name: text, dtype: object

In [12]:
nltk.download()



KeyboardInterrupt: 

In [9]:
# import library of stopwords and remove all the stop words from the reviews
from nltk.corpus import stopwords
def remove_stopwords(words):
    return [w for w in words if not w in stopwords.words("english")]


In [26]:
sample_removed_stopwords=result_sample_1.apply(remove_stopwords)
sample_removed_stopwords.head(3)

31    [place, area, staple, around, years, much, cha...
34    [believe, awarding, stars, bearing, mind, type...
38    [okay, bar, food, nice, bar, setting, food, av...
Name: text, dtype: object

In [14]:
#Join the words back into one string separated by space, 
    # and return the result.
def join_words(words):
    return( " ".join(words))


In [27]:
sample_removed_stopwords_joined=sample_removed_stopwords.apply(join_words)
sample_removed_stopwords_joined.head(3)

31    place area staple around years much changed se...
34    believe awarding stars bearing mind type resta...
38    okay bar food nice bar setting food average se...
Name: text, dtype: object

In [28]:
# create feature matrix
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
train_data_features_df = vectorizer.fit_transform(sample_removed_stopwords_joined)
train_data_features = train_data_features_df.toarray()
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
train_data_features.shape

(400L, 5000L)

In [18]:
vectorizer.get_feature_names()

[u'aback',
 u'abandon',
 u'aber',
 u'abercrombie',
 u'abide',
 u'able',
 u'abord',
 u'abound',
 u'absent',
 u'absinthe',
 u'absolutely',
 u'absolutly',
 u'absorbed',
 u'absurd',
 u'abuse',
 u'abv',
 u'acc',
 u'accept',
 u'acceptable',
 u'access',
 u'accessories',
 u'accident',
 u'accidentally',
 u'acclaimed',
 u'accommodate',
 u'accomodate',
 u'accompanied',
 u'according',
 u'accordingly',
 u'account',
 u'accounting',
 u'accurate',
 u'accurately',
 u'accustomed',
 u'aches',
 u'acknowledged',
 u'acknowledgment',
 u'acquaintance',
 u'acquired',
 u'across',
 u'act',
 u'acted',
 u'acting',
 u'action',
 u'activity',
 u'acts',
 u'actual',
 u'actuality',
 u'actually',
 u'add',
 u'added',
 u'addiction',
 u'adding',
 u'addition',
 u'additional',
 u'additionally',
 u'adds',
 u'ade',
 u'adequate',
 u'adhere',
 u'adjust',
 u'adjustments',
 u'admire',
 u'admit',
 u'adopted',
 u'adore',
 u'adult',
 u'adults',
 u'advantage',
 u'advertisement',
 u'advice',
 u'advise',
 u'advised',
 u'affects',
 u'affi

In [19]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, sample_data["category"] )

Training the random forest...


In [20]:
import numpy as np
test_reviews = non_elite_reviews_pkl.tail(100)['text']
test_data_features = vectorizer.transform(test_reviews)
np.asarray(test_data_features)
print "Predicting test labels...\n"
result = forest.predict(test_data_features)
result



Predicting test labels...



array(['Real-elite', 'Potential-elite', 'Non-elite', 'Non-elite',
       'Slack-elite', 'Non-elite', 'Non-elite', 'Non-elite', 'Non-elite',
       'Non-elite', 'Non-elite', 'Potential-elite', 'Slack-elite',
       'Non-elite', 'Slack-elite', 'Slack-elite', 'Non-elite',
       'Slack-elite', 'Real-elite', 'Non-elite', 'Non-elite', 'Non-elite',
       'Non-elite', 'Non-elite', 'Non-elite', 'Non-elite', 'Non-elite',
       'Slack-elite', 'Slack-elite', 'Potential-elite', 'Potential-elite',
       'Non-elite', 'Non-elite', 'Slack-elite', 'Non-elite', 'Non-elite',
       'Non-elite', 'Potential-elite', 'Slack-elite', 'Potential-elite',
       'Potential-elite', 'Potential-elite', 'Slack-elite', 'Non-elite',
       'Potential-elite', 'Slack-elite', 'Potential-elite',
       'Potential-elite', 'Slack-elite', 'Potential-elite', 'Non-elite',
       'Non-elite', 'Non-elite', 'Slack-elite', 'Non-elite', 'Non-elite',
       'Slack-elite', 'Non-elite', 'Non-elite', 'Potential-elite',
       'Potent