# Movie Review Sentiment Classification

In [1]:
import pandas as pd   
from bs4 import BeautifulSoup   
import re 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Loading Training And Testing Dataset

In [2]:
# Directory that holds the two TSV files. Edit this single constant to point
# the notebook at a different copy of the data.
DATA_DIR = "/Users/garvitchaudhary/Downloads/Movie_reviewer-master"

# Options shared by both files: first row is the header, fields are
# tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps quote characters as
# ordinary text instead of treating them as field delimiters.
TSV_READ_OPTIONS = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv(DATA_DIR + "/labeledTrainData.tsv", **TSV_READ_OPTIONS)
test = pd.read_csv(DATA_DIR + "/testData.tsv", **TSV_READ_OPTIONS)

In [3]:
# Display the loaded training frame (columns: id, sentiment, review).
train

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int..."
24996,"""5064_1""",0,"""I don't believe they made this film. Complete..."
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui..."
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the..."


# Text Cleaning Function
The function strips HTML tags and non-letter characters (punctuation, digits) from the text, lowercases it, and splits it into words. It then removes stop words, which do not help learning, and returns the remaining words joined into a single string.

In [4]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review):
    """Clean one raw review and return it as a space-separated string.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lowercase, split on whitespace, and drop English stop words.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML tags.

    Returns
    -------
    str
        The surviving words joined by single spaces (an empty string when
        no word survives).
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so bs4
    #    does not warn that no parser was specified.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()
    # 4. Remove stop words; the set is cached because this function is called
    #    once per review and the list never changes between calls
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]
    # 5. Join the remaining words back into one string
    return " ".join(meaningful_words)

# Clean every review in the training set

In [5]:
# Run every training review through review_to_words, collecting the cleaned
# strings in order.
train_reviews = train["review"]
num_reviews = train_reviews.size
clean_train_reviews = []
for row in range(num_reviews):
    # Report progress as a percentage once every 1000 reviews.
    if row % 1000 == 0:
        print(row/num_reviews * 100, '%')
    cleaned = review_to_words(train_reviews[row])
    clean_train_reviews.append(cleaned)


0.0 %
4.0 %
8.0 %
12.0 %
16.0 %
20.0 %
24.0 %
28.000000000000004 %
32.0 %
36.0 %
40.0 %
44.0 %
48.0 %
52.0 %
56.00000000000001 %
60.0 %
64.0 %
68.0 %
72.0 %
76.0 %
80.0 %
84.0 %
88.0 %
92.0 %
96.0 %


# Build bag-of-words feature vectors from the training data

In [6]:
# Bag-of-words vectorizer limited to 20000 features. The reviews are already
# cleaned, so no tokenizer, preprocessor or stop-word list is supplied here.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=20000,
)
# Learn the vocabulary from the cleaned training reviews, count the words of
# each review, and convert the resulting matrix to an array.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

# Using RandomForestClassifier as Model

In [7]:

# Initialise a random forest with 110 trees. random_state pins the forest's
# randomness so that Restart & Run All reproduces the same model and
# predictions; n_jobs=-1 trains the trees in parallel on all available cores.
forest = RandomForestClassifier(n_estimators=110, random_state=42, n_jobs=-1)
# Fit the forest on the vectorised training reviews and their sentiment labels.
forest = forest.fit(train_data_features, train["sentiment"])

# Outputting the result
Let's clean up the test data and get the feature vectors from the resulting list of words. Next, we use the trained model to get the answer.

In [None]:
# Clean every test review with the same review_to_words function that was
# applied to the training data.
test_reviews = test["review"]
num_reviews = len(test_reviews)
clean_test_reviews = [review_to_words(test_reviews[row]) for row in range(num_reviews)]

# Vectorise the cleaned test reviews with the already-fitted vectorizer
# (transform only, no re-fitting) and convert the result to an array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for each test review with the trained forest.
result = forest.predict(test_data_features)

# Write the id/sentiment pairs to a CSV file. quoting=3 is csv.QUOTE_NONE, so
# pandas adds no quote characters of its own.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3)