# Natural Language Processing

NLP for predicting positive/negative movie review

In [1]:
import nltk
from nltk.corpus import movie_reviews 
from nltk import NaiveBayesClassifier
from nltk.stem import WordNetLemmatizer
import random




In [2]:
# Function to fetch data from movie_reviews

documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [3]:
# Shuffel the data
random.shuffle(documents)

In [4]:
lemmatizer=WordNetLemmatizer()

In [5]:
# Function to return part of speech in form understandable by wirdnet lemmatizer
from nltk.corpus import wordnet 

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [6]:
# import stopwords and add punctuations to the list
from nltk.corpus import stopwords
import string

stops=stopwords.words('english')
punctuations=string.punctuation
stops+=list(punctuations)
stops=set(stops)

In [7]:
#Function to remove stopwords and perform lemmatization
from nltk import pos_tag

def clean_review(words):
    output_words=[]
    for w in words:
        if w not in stops:
            pos=pos_tag([w]) 
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [8]:
# apply clean function on dataset
documents=[(clean_review(document),category) for document,category in documents ]

In [9]:
#split data into testing and trainig dataset
training_document=documents[:1500]
testing_document=documents[1500:]

In [10]:
# generating feature

all_words=[]
for document in documents:
    all_words+=document[0]

freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
features=[i[0] for i in common]

In [11]:
# function to convert data into desired format

def get_feature_dict(words):
    current_features={}
    words_set=set(words)
    for w in features:
        current_features[w]=w in words_set
    return current_features

In [12]:
# converting data into desired format

training_data=[(get_feature_dict(doc),category) for doc,category in training_document]
testing_data=[(get_feature_dict(doc),category) for doc,category in testing_document]

In [13]:
# using naive byes classifier to train the model and make predictions

clf=NaiveBayesClassifier.train(training_data)
nltk.classify.accuracy(clf,testing_data)

0.804

In [14]:
# using sklearn classifier on npl data

from sklearn.svm import SVC
from nltk import SklearnClassifier

In [15]:
svc=SVC()
classifier_sklearn=SklearnClassifier(svc)

In [16]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [17]:
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.854