# Opinion Mining Using POS Tagging and Grammar Association

## Imports

In [1]:
import nltk
from xml.dom import minidom

## Parse an XML File and extract data

In [2]:
# Load the review dump into a DOM tree.
doc = minidom.parse('reviews.review')

reviews = doc.getElementsByTagName("review")

# Keep one string per <review>: the text of its first <review_text> child,
# with every newline character stripped out.
review_data = [
    review.getElementsByTagName("review_text")[0].firstChild.data.replace("\n", "")
    for review in reviews
]


## Exploratory Analysis

In [3]:
# Number of review texts extracted from the XML file.
len(review_data)

986

In [4]:
# Spot-check a single extracted review to see what the cleaned text looks like.
review_data[45]

u'Fabulous product --- now I can store over 700 photos in the 8 mega pixel range.    Happy I purchased this one -- very trouble free'

## Download Dependencies

In [None]:
# Fetch only the NLTK resources this notebook relies on. Calling
# nltk.download() with no arguments launches the interactive downloader,
# which stalls an unattended "Restart Kernel -> Run All".
#   punkt                      -> tokenizer model used by nltk.word_tokenize
#   averaged_perceptron_tagger -> model used by nltk.pos_tag
#   tagsets                    -> data behind nltk.help.upenn_tagset()
# NOTE(review): these are the classic resource ids; recent NLTK releases may
# expect 'punkt_tab' / 'averaged_perceptron_tagger_eng' instead -- confirm
# against the installed NLTK version.
for resource in ("punkt", "averaged_perceptron_tagger", "tagsets"):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


## Word Tokenize

In [None]:
# Demo on a fixed sentence: split it into tokens, then tag each token.
sample_tokens = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(sample_tokens)

<table border="2" width="100%" cellspacing="2">
	<tbody>
		<tr>
			<th>POS Tag</th>
			<th>Description</th>
			<th>Example</th>
		</tr>
		<tr>
			<td>CC</td>
			<td>coordinating conjunction</td>
			<td>and</td>
		</tr>
		<tr>
			<td>CD</td>
			<td>cardinal number</td>
			<td>1, third</td>
		</tr>
		<tr>
			<td>DT</td>
			<td>determiner</td>
			<td>the</td>
		</tr>
		<tr>
			<td>EX</td>
			<td>existential there</td>
			<td>
				<em>there</em> is</td>
		</tr>
		<tr>
			<td>FW</td>
			<td>foreign word</td>
			<td>hors d&#8217;oeuvre</td>
		</tr>
        <tr>
			<td>IN</td>
			<td>preposition/subordinating conjunction</td>
			<td>in, of, like</td>
		</tr>
		<tr>
			<td>JJ</td>
			<td>adjective</td>
			<td>big</td>
		</tr>
		<tr>
			<td>JJR</td>
			<td>adjective, comparative</td>
			<td>bigger</td>
		</tr>
		<tr>
			<td>JJS</td>
			<td>adjective, superlative</td>
			<td>biggest</td>
		</tr>
		<tr>
			<td>LS</td>
			<td>list marker</td>
			<td>1)</td>
		</tr>
		<tr>
			<td>MD</td>
			<td>modal</td>
			<td>could, will</td>
		</tr>
        <tr>
			<td>NN</td>
			<td>noun, singular or mass</td>
			<td>door</td>
		</tr>
		<tr>
			<td>NNS</td>
			<td>noun plural</td>
			<td>doors</td>
		</tr>
		<tr>
			<td>NNP</td>
			<td>proper noun, singular</td>
			<td>John</td>
		</tr>
		<tr>
			<td>NNPS</td>
			<td>proper noun, plural</td>
			<td>Vikings</td>
		</tr>
		<tr>
			<td>PDT</td>
			<td>predeterminer</td>
			<td>
				<em>both</em> the boys</td>
		</tr>
		<tr>
			<td>POS</td>
			<td>possessive ending</td>
			<td>friend<em>&#8217;s</em>
			</td>
		</tr>
		<tr>
			<td>PRP</td>
			<td>personal pronoun</td>
			<td>I, he, it</td>
		</tr>
		<tr>
			<td>PRP$</td>
			<td>possessive pronoun</td>
			<td>my, his</td>
		</tr>
		<tr>
			<td>RB</td>
			<td>adverb</td>
			<td>however, usually, naturally, here, good</td>
		</tr>
		<tr>
			<td>RBR</td>
			<td>adverb, comparative</td>
			<td>better</td>
		</tr>
		<tr>
			<td>RBS</td>
			<td>adverb, superlative</td>
			<td>best</td>
		</tr>
        <tr>
			<td>RP</td>
			<td>particle</td>
			<td>give <em> up </em>
			</td>
		</tr>
		<tr>
			<td>TO</td>
			<td>to</td>
			<td>
				<em>to</em> go, <em>to</em> him</td>
		</tr>
		<tr>
			<td>UH</td>
			<td>interjection</td>
			<td>uhhuhhuhh</td>
		</tr>
		<tr>
			<td>VB</td>
			<td>verb, base form</td>
			<td>take</td>
		</tr>
		<tr>
			<td>VBD</td>
			<td>verb, past tense</td>
			<td>took</td>
		</tr>
		<tr>
			<td>VBG</td>
			<td>verb, gerund/present participle</td>
			<td>taking</td>
		</tr>
		<tr>
			<td>VBN</td>
			<td>verb, past participle</td>
			<td>taken</td>
		</tr>
		<tr>
			<td>VBP</td>
			<td>verb, non-3rd person sing. present</td>
			<td>take</td>
		</tr>
		<tr>
			<td>VBZ</td>
			<td>verb, 3rd person sing. present</td>
			<td>takes</td>
		</tr>
		<tr>
			<td>WDT</td>
			<td>wh-determiner</td>
			<td>which</td>
		</tr>
		<tr>
			<td>WP</td>
			<td>wh-pronoun</td>
			<td>who, what</td>
		</tr>
		<tr>
			<td>WP$</td>
			<td>possessive wh-pronoun</td>
			<td>whose</td>
		</tr>
		<tr>
			<td>WRB</td>
			<td>wh-adverb</td>
			<td>where, when</td>
		</tr>
	</tbody>
</table>

## POS Tagging of Words

In [None]:
# Tokenize and POS-tag the first ten reviews only; each entry of
# tagged_reviews is the tagger output for one review.
tagged_reviews = [
    nltk.pos_tag(nltk.word_tokenize(review_text))
    for review_text in review_data[:10]
]
tagged_reviews[0]

## Opinion Mining/Association

In [None]:
# Chunk grammar written over POS tags (nltk.help.upenn_tagset() lists them).
# One "NP" chunk is, in order:
#   an optional determiner / number,
#   any number of adjectives,
#   one or more nouns / pronouns / prepositions,
#   then optional verbs, optional adjectives or adverbs,
#   and optional trailing nouns / pronouns.
# Raw string so the backslash in PRP\$ reaches the chunk parser untouched
# instead of being treated as a (invalid) Python string escape.
# The stray empty alternative in the adjective group ("JJ||JJR") was removed.
# NOTE(review): <PP> is not among the Penn Treebank tags used elsewhere in
# this notebook; PRP\$ may have been intended -- left as-is to keep the
# chunking unchanged; confirm.
grammar = r"NP: {<DT|PP|CD>?<JJ|JJR|JJS>*<NN|NNS|PRP|NNP|IN|PRP\$>+<VBD|VBZ|VBN|VBP|IN>*<JJ|RB>*<PRP|NN|NNS>*}"
cp = nltk.RegexpParser(grammar)
# Chunk the tagged review at index 9.
results = cp.parse(tagged_reviews[9])

## Plot the parse tree

In [None]:
# Render the chunk tree produced by the parser.
# NOTE(review): Tree.draw() presumably opens a separate GUI window rather than
# rendering inline -- confirm it does not block an unattended "Run All".
results.draw()

In [None]:
# Number of tagged tokens in the review at index 9 of tagged_reviews.
len(tagged_reviews[9])

## Explore results

In [14]:
# Walk the top-level children of the parse result. Only Tree children are
# chunks; anything else is skipped.
for chunk in results:
    # isinstance (rather than an exact type comparison) also accepts Tree
    # subclasses.
    if isinstance(chunk, nltk.tree.Tree):
        # First element of every leaf in the chunk (the word of each tagged token).
        assoc = [leaf[0] for leaf in chunk]
        # Only show chunks of at least three words.
        if len(assoc) > 2:
            # Function-call form: valid on Python 3, and prints the same
            # single list on Python 2.
            print(assoc)

[u'this', u'CF', u'card', u'for', u'my', u'Canon', u'Digital', u'Rebel']
[u'it', u'has', u'worked', u'fine']
[u'though', u'I', u'do', u"n't"]
[u'an', u'expert', u'digital', u'photographer']
[u'the', u'card', u'is', u'empty']
[u'127', u'shots', u'available']
[u'though', u'it', u'takes']
[u'longer', u'than', u'I', u'was', u'used']
[u'with', u'my', u'point-and-shoot']
[u'its', u'SmartMedia', u'card']


In [13]:
# Raw text of the review at index 9, for comparison with the chunks
# extracted from tagged_reviews[9].
review_data[9]

u"After going through the reviews, I bought this CF card for my Canon Digital Rebel. So far it has worked fine, though I don't pretend to be an expert digital photographer. When the card is empty, it shows 127 shots available. It seems to read reasonably fast, though it takes a bit longer than I was used to with my point-and-shoot Olympus and its SmartMedia card."