In [81]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

np.random.seed= 0

In [20]:
#Loading the data

training_data= pd.read_csv('/Users/harikrishnanagarajan/Desktop/Kaggle/SA_using_Word2vec/labeledTrainData.tsv', header= 0, delimiter= '\t')

training_data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [21]:
training_data.review[0]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [32]:
#Checking for null values and balance of data

print(training_data.isna().sum())
print('\n')
print(training_data.sentiment.value_counts())

id           0
sentiment    0
review       0
dtype: int64


0    12500
1    12500
Name: sentiment, dtype: int64


### Cleaning the training data

In [27]:
def clean_raw_text(review):
    
    ps= PorterStemmer()
    
    text= BeautifulSoup(review).get_text()
    
    letters_only= re.sub("[^A-Za-z]", " ", text)
    
    words= letters_only.lower().split()
    
    stop_words= set(stopwords.words('english'))
    
    meaningful_words= [x for x in words if x not in stop_words]
    
    stemmed_words= [ps.stem(w) for w in meaningful_words]
    
    return " ".join(stemmed_words)

In [29]:
clean_reviews= []

for review in training_data.review:
    
    clean_reviews.append(clean_raw_text(review))
    

In [34]:
clean_reviews[0]

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

### Creating the model using CountVectorizer features

### Tokenizing and Vectorizing the reviews ---> Feature Vectors


In [86]:
#Tokenizing and creating the vector features of our text data

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vectorizer= CountVectorizer(analyzer= 'word', tokenizer= None, max_features= 2000)

train_data_features= count_vectorizer.fit_transform(clean_reviews)

np.asarray(train_data_features)

array(<25000x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 1802204 stored elements in Compressed Sparse Row format>,
      dtype=object)

### Splitting the reviews for training and validation

In [87]:
#Splitting into training and validation sets

from sklearn.model_selection import train_test_split

X_train, x_valid, y_train, y_valid= train_test_split(train_data_features, training_data.sentiment, test_size= 0.2)


In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

model= RandomForestClassifier(n_estimators= 256)

history= model.fit(X_train, y_train)

In [89]:
results= model.predict(x_valid)

print(classification_report(results, y_valid))


              precision    recall  f1-score   support

           0       0.84      0.84      0.84      2474
           1       0.85      0.85      0.85      2526

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.85      0.84      0.85      5000



### Creating the model using the TfIdfVectorizer features

### Tokenizing and Vectorizing the reviews ---> Feature Vectors


In [100]:
tfidf_vectorizer= TfidfVectorizer(analyzer= 'word', tokenizer= None, max_features= 750)

train_data_features= tfidf_vectorizer.fit_transform(clean_reviews)

np.asarray(train_data_features)

array(<25000x750 sparse matrix of type '<class 'numpy.float64'>'
	with 1380581 stored elements in Compressed Sparse Row format>,
      dtype=object)

### Splitting the reviews for training and validation

In [101]:
#Splitting into training and validation sets

from sklearn.model_selection import train_test_split

X_train, x_valid, y_train, y_valid= train_test_split(train_data_features, training_data.sentiment, test_size= 0.2)


In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

model_2= RandomForestClassifier(n_estimators= 256)

history_2= model_2.fit(X_train, y_train)

In [103]:
results= model_2.predict(x_valid)

print(classification_report(results, y_valid))


              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2500
           1       0.83      0.81      0.82      2500

    accuracy                           0.82      5000
   macro avg       0.82      0.82      0.82      5000
weighted avg       0.82      0.82      0.82      5000



### Importing the test data

In [104]:
testing_data= pd.read_csv('/Users/harikrishnanagarajan/Desktop/Kaggle/SA_using_Word2vec/testData.tsv', delimiter= '\t')

testing_data.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [105]:
#Checking test data for null values

print(testing_data.isna().sum())

id        0
review    0
dtype: int64


### Cleaning the test data ---> Predicting the sentiment for test data

In [109]:
cleaned_test_reviews= [clean_raw_text(review) for review in testing_data.review]

cleaned_test_reviews[0]

In [110]:
# test_data_features= tfidf_vectorizer.transform(cleaned_test_reviews)
# np.asarray(test_data_features)

# results= model_2.predict(test_data_features)

# output = pd.DataFrame( data={"id":testing_data["id"], "sentiment":results} )
# output.to_csv('submission.csv')