In [1]:
#importing libraries
import pandas as pd
import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
def reviewWords(review):
    data_train_Exclude_tags = re.sub(r'<[^<>]+>', " ", review)      # Excluding the html tags
    data_train_num = re.sub(r'[0-9]+', 'number', data_train_Exclude_tags)  # Converting numbers to "NUMBER"
    data_train_lower = data_train_num.lower()              # Converting to lower case.
    data_train_split = data_train_lower.split()            # Splitting into individual words.
    stopWords = set(stopwords.words("english") )
    ps=PorterStemmer()

    meaningful_words = [ps.stem(word) for word in data_train_split if not word in stopWords]     # Removing stop words and Stemming
    
    return( " ".join( meaningful_words ))  

In [3]:
#reading training dataset
data_train=pd.read_csv(r'C:\Users\goswa\Desktop\NLP\labeledTrainData.tsv',delimiter='\t')

In [4]:
data_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
data_train.shape

(25000, 3)

In [6]:
# Reviewing 
cleanWords = []
for i in range(data_train['review'].size):
    cleanWords.append( reviewWords( data_train["review"][i] ))
print("---Review Processing Done!---\n")


---Review Processing Done!---



In [13]:
#Creating features of bag of models for training set
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
data_train_features = vectorizer.fit_transform(cleanWords)
print("Features Created!!!\n")

Features Created!!!



In [15]:
# Training
print("Training the classifier\n")
forest = RandomForestClassifier(n_estimators = 100,random_state = 0)
forest = forest.fit(data_train_features, data_train["sentiment"])
score = forest.score(data_train_features, data_train["sentiment"])
print("Mean Accuracy of the Random forest is: %f" %(score))

Training the classifier

Mean Accuracy of the Random forest is: 1.000000


In [16]:
#Reading test dataset
data_test=pd.read_csv(r'C:\Users\goswa\Desktop\NLP\testData.tsv',delimiter='\t')

In [17]:
# Reviewing for test
testcleanWords = []
for i in range(data_test['review'].size):
    testcleanWords.append( reviewWords( data_test["review"][i] ))
print("---Review Processing Done!---\n")

---Review Processing Done!---



In [18]:
#Creating features of bag of models for testser
data_test_features = vectorizer.transform(testcleanWords)
print("Test Features Created!!!\n")

Test Features Created!!!



In [19]:
# Predicting Test Results.
result=forest.predict(data_test_features)

In [20]:
# Copy the predicted values to pandas dataframe with an id column, and a sentiment column.
output = pd.DataFrame(data = {"id": data_test["id"], "sentiment": result} )
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1
