In [46]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# Install a blanket "ignore" filter, silencing every warning category from here on.
# NOTE(review): this also hides deprecation notices raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

import os
# List the files available in the input directory.
print(os.listdir("../input"))

['testData.tsv', 'sampleSubmission.csv', 'labeledTrainData.tsv', 'unlabeledTrainData.tsv']


In [47]:
# Both files are tab-separated and carry their column names in the first row.
train = pd.read_csv("../input/labeledTrainData.tsv", sep="\t", header=0)
test = pd.read_csv("../input/testData.tsv", sep="\t", header=0)

In [48]:
# Report (rows, columns) of both frames.
print("Train set: ", train.shape, "Test set: ", test.shape)

Train set:  (25000, 3) Test set:  (25000, 2)


In [49]:
# Character count of each raw review, added as a column for a quick look at review sizes.
train['length'] = train['review'].map(len)
train.head()

Unnamed: 0,id,sentiment,review,length
0,5814_8,1,With all this stuff going down at the moment w...,2302
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",946
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,2449
3,3630_4,0,It must be assumed that those who praised this...,2245
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,2231


## Sentiment labels
1 = positive, 0 = negative

In [50]:
# Per-sentiment summary statistics (count, mean, std, min, quartiles, max).
train.groupby('sentiment').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,12500.0,1305.72192,959.142634,52.0,711.0,978.0,1569.25,8999.0
1,12500.0,1349.6992,1048.890394,70.0,695.0,984.0,1653.0,13708.0


## No strong relationship between sentiment and review length (the per-class length statistics are nearly identical)

In [51]:
# The length column was only needed for the comparison above; remove it before modelling.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError once the column is gone.
train = train.drop(columns=['length'], errors='ignore')

In [52]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [53]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [54]:
# Walk through the cleaning steps on the first training review.
# Parse the raw text so HTML markup can be stripped.
# NOTE(review): no parser is named, so bs4 uses whichever one is installed — consider passing one explicitly.
bs_review1 = BeautifulSoup(train["review"][0])

# Replace every non-alphabetic character with a space
alph_only = re.sub("[^a-zA-Z]", " ", bs_review1.get_text())
words = alph_only.lower().split()

# Build the stop-word set once: calling stopwords.words() inside the comprehension
# rebuilt the list and scanned it linearly for every single token.
english_stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in english_stop_words]  # Remove "stop words"
print(words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

In [55]:
# Built once when this cell runs; a frozenset gives constant-time membership tests.
# Previously stopwords.words("english") was called again for every token of every review.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))


def review_to_words(reviews):
    """Clean a single raw review and return it as one space-separated string.

    Steps: strip HTML markup, replace every non-alphabetic character with a
    space, lower-case, split on whitespace, and drop English stop words.

    Parameters
    ----------
    reviews : str
        Raw text of one review (may contain HTML).

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    # Remove HTML
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed.
    text = BeautifulSoup(reviews).get_text()

    # Remove non-alphabetical characters
    alpha_only = re.sub("[^a-zA-Z]", " ", text)

    # Convert to lower case and split into individual words
    words = alpha_only.lower().split()

    # Remove "stop words"
    kept = [w for w in words if w not in _ENGLISH_STOP_WORDS]

    # Join words separated by spaces
    return ' '.join(kept)

In [56]:
# Sanity check: clean the first training review and show the result.
first_raw_review = train["review"][0]
review = review_to_words(first_raw_review)
print(review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [57]:
# Run every training review through review_to_words, overwriting the raw text column.
train['review'] = train['review'].map(review_to_words)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words count matrix capped at 5000 vocabulary terms.
# Every other CountVectorizer option is left at its default
# (word analyzer, no custom tokenizer/preprocessor, no stop-word list).
vectorizer = CountVectorizer(max_features=5000)
train_features = vectorizer.fit_transform(train['review'])

In [59]:
# Convert the sparse count matrix into a dense NumPy array.
# NOTE(review): the name is rebound to an ndarray, which has no .toarray(), so re-running this
# cell alone fails; the dense 25000 x 5000 copy is also large — confirm a dense array is needed.
train_features = train_features.toarray()

In [60]:
# One row per training review, one column per vocabulary term.
train_features.shape

(25000, 5000)

## Modelling

In [92]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 30% of the labelled reviews for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [82]:
# Multinomial naive Bayes with default hyperparameters, fitted on the training split.
model = MultinomialNB().fit(X_train, y_train)

In [109]:
# Predicted sentiment labels for the validation split.
pre = model.predict(X_val)

In [110]:
# classification_report takes (y_true, y_pred). With the arguments reversed,
# precision and recall are swapped per class and "support" counts the
# predictions instead of the true labels.
print(classification_report(y_val, pre))

             precision    recall  f1-score   support

          0       0.85      0.85      0.85      3759
          1       0.85      0.85      0.85      3741

avg / total       0.85      0.85      0.85      7500



## Final Prediction

In [87]:
# Preview the test set once more before preparing the final predictions.
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [88]:
# Preparing final input data: clean the test reviews exactly like the training reviews.
test_reviews = test['review'].apply(review_to_words)

# Use transform(), not fit_transform(): the vocabulary learned from the training
# reviews must be reused. Re-fitting on the test set builds a different vocabulary,
# so column i would stand for a different word than the one the model was trained on.
test_reviews = vectorizer.transform(test_reviews)
test_reviews = test_reviews.toarray()

result = model.predict(test_reviews)

ids = test['id']

# NOTE(review): the column is written as 'sentiments' while the training data calls it
# 'sentiment' — confirm which header the submission format expects.
result_df = pd.DataFrame({'id':ids, 'sentiments':result})

In [91]:
# Write the predictions without the index; quoting=3 is csv.QUOTE_NONE, so no field is quoted.
result_df.to_csv( "result.csv", index=False, quoting=3 )