## A Bag of Words

In [20]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from pandas_confusion import ConfusionMatrix
import matplotlib.pyplot as plt

### Reading Data

In [21]:
data = pd.read_csv("/Users/George/Downloads/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [22]:
data.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [23]:
data.shape

(25000, 3)

In [24]:
print(data["review"][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

### Data Cleaning

In [25]:
remove_html = lambda x: BeautifulSoup(x,"lxml").get_text()

remove_numbers = lambda x: re.sub("[^a-zA-Z]", " ", x) 

to_lower = lambda x: x.lower().split()

remove_stop_words = lambda x: [w for w in x if not w in stops] 

join = lambda x: " ".join(x) 


In [26]:
stops = set(stopwords.words("english"))                  
data['clean_review'] = data['review'].apply(remove_html).apply(remove_numbers).apply(to_lower).apply(remove_stop_words).apply(join)

In [27]:
print(data['review'][0])
print("-------------------------------------------------------------")
print(data['clean_review'][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

### Create Features

In [28]:
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 5000) 

In [29]:
data_features = vectorizer.fit_transform(list(data['clean_review']))

In [30]:
data_features = pd.DataFrame(data_features.toarray())
#print(data_features)

In [31]:
vocab = vectorizer.get_feature_names()
#print(vocab)

In [32]:
data = data.merge(data_features, how='inner', left_index=True, right_index=True)
data.shape

(25000, 5004)

In [33]:
data.head()

Unnamed: 0,id,sentiment,review,clean_review,0,1,2,3,4,5,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment mj started listening music ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy wondrously unpretentious explo...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Split Data

In [34]:
train, test = train_test_split(data, test_size=0.33, random_state=42)

### Random Forest

In [35]:
# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
forest = forest.fit( train.iloc[:,4:], train["sentiment"] )

### Evaluate 

In [36]:
test['prediction'] = forest.predict(test.iloc[:,4:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [37]:
final = test.loc[:,('id','sentiment','prediction')]
final.head()

Unnamed: 0,id,sentiment,prediction
6868,"""2570_3""",0,0
24016,"""4897_8""",1,0
9668,"""8485_3""",0,0
13640,"""9029_10""",1,1
14018,"""11220_1""",0,0


In [38]:
confusion_matrix = ConfusionMatrix(final['sentiment'], final['prediction'])
print("Confusion matrix:\n%s" % confusion_matrix)

Confusion matrix:
Predicted     0     1  __all__
Actual                        
0          3506   599     4105
1           687  3458     4145
__all__    4193  4057     8250
