# Creating an NLP Pipeline
#### -- Load input file and read reviews
#### -- Tokenize
#### -- Remove Stopwords
#### -- Perform Stemming
#### -- Write clean data to output file

In [1]:
# Raw movie review (still containing HTML <br /> tags) used as a quick input for the cleaning step.
sample_text='''I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language.
'''

## NLTK

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [3]:
# Shared NLP helpers used by the cleaning functions.
# Tokenizer that keeps runs of word characters (\w+), so punctuation is dropped.
tokenizer=RegexpTokenizer(r'\w+')
# English stopwords held in a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
en_stopwords=set(stopwords.words('english'))
# Porter stemmer used to reduce each token to its stem.
ps=PorterStemmer()

In [4]:
def getStemmedReview(review):
    """Clean a raw review and return it as a space-joined string of stems.

    Steps: lowercase the text, replace HTML ``<br />`` tags with spaces,
    tokenize with the notebook-level ``tokenizer``, drop tokens present in
    ``en_stopwords``, and stem what remains with ``ps``.

    Args:
        review: Raw review text (str).

    Returns:
        str: Stemmed tokens joined by single spaces (empty string if no
        token survives the filtering).
    """
    # Replace every "<br />", not only doubled ones; a lone tag would
    # otherwise be tokenized into a stray "br" token.
    review=review.lower().replace("<br />"," ")

    tokens=tokenizer.tokenize(review)
    # Filter stopwords and stem in a single pass.
    stemmed_tokens=[ps.stem(token) for token in tokens if token not in en_stopwords]
    return ' '.join(stemmed_tokens)
    
    
    
    

In [5]:
# Quick check of the cleaning function on the sample review.
getStemmedReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

In [6]:
def getStemmedDoc(inputFile,outputFile):
    """Clean every line of ``inputFile`` and write the results to ``outputFile``.

    Each input line is treated as one review, passed through
    ``getStemmedReview``, and written as one line of output. Both files are
    read/written as UTF-8.

    Args:
        inputFile: Path of the text file holding one raw review per line.
        outputFile: Path of the file to write; it is overwritten if it exists.
    """
    # Context managers close both handles even if cleaning a review raises;
    # iterating the input handle streams lines instead of loading the whole file.
    with open(inputFile,encoding="utf8") as src, \
         open(outputFile,'w',encoding="utf8") as out:
        for review in src:
            print(getStemmedReview(review),file=out)


        


## Getting input and output file

In [7]:
# Clean the reviews in imdb_toy_x.txt (one per line) and write them to imdb_toy_clean.txt.
getStemmedDoc("imdb_toy_x.txt","imdb_toy_clean.txt")

## Multinomial Event Model

In [8]:
# Toy training set: seven short reviews and their sentiment labels.
x=["This was an awesome movie",
  "Great movie, i liked it a lot",
   "Happy ending , awesome acting by the hero",
   "I loved it, truly great one",
   "bad not upto the mark",
   "could have been better",
   "A disappointing one"
  ]
y=[1,1,1,1,0,0,0] # 1 = positive review, 0 = negative review

In [9]:
# Unlabelled reviews to classify with the trained model.
test_x=["I was happy and i loved the acting in the movie",
        "This movie i saw was readlly bad and not upto the mark"]

## Clean the data

In [10]:
# Run every training review through the cleaning function, then display the result.
x_clean=[getStemmedReview(i) for i in x]
x_clean

['awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great one',
 'bad upto mark',
 'could better',
 'disappoint one']

In [11]:
# Test reviews get exactly the same cleaning as the training reviews.
x_test_clean=[getStemmedReview(i) for i in test_x]
x_test_clean

['happi love act movi', 'movi saw readlli bad upto mark']

## Sklearn Multinomial NB

#### Vectorization on Training set

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned training reviews and count word
# occurrences; unigrams by default (ngram_range can add bigrams/trigrams).
cv=CountVectorizer()
x_vec=cv.fit_transform(x_clean).toarray()
print(x_vec, x_vec.shape, sep="\n")
print("----------------------------------------")
# Test reviews reuse the training vocabulary; words not seen in training are ignored.
xt_vec=cv.transform(x_test_clean).toarray()
print(xt_vec, xt_vec.shape, sep="\n")

[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0]
 [1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0]]
(7, 18)
----------------------------------------
[[1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]]
(2, 18)


## Multinomial Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

In [14]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training count vectors and labels.
mnb=MultinomialNB()
mnb.fit(x_vec,y)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Predicted class label for each test review.
mnb.predict(xt_vec)

array([1, 0])

## Posterior probability

In [16]:
# Per-class probabilities for each test review; columns follow mnb.classes_ (here label 0, then label 1).
mnb.predict_proba(xt_vec)

array([[0.09175896, 0.90824104],
       [0.97894894, 0.02105106]])

In [18]:
from sklearn.metrics import confusion_matrix

# confusion_matrix(y_true, y_pred): rows are true labels, columns are predicted labels.
# NOTE: this is evaluated on the training data itself, so the result is an optimistic estimate.
cnf_matrix=confusion_matrix(y,mnb.predict(x_vec))
print(cnf_matrix)

[[3 0]
 [0 4]]
