## NLP pipeline
1. load input file and read reviews
2. tokenize
3. remove stopwords
4. perform stemming
5. write cleaned data to output file

In [1]:
# Sample review used to try out the cleaning steps; it contains HTML <br /> tags.
# The stray leading double-quote (left over from a four-quote opener) is removed.
sample_text = """I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""

## NLTK

In [3]:
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [114]:
# Shared NLP helpers: a tokenizer that keeps runs of word characters,
# a Porter stemmer, and the English stopword set.
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

# 'not' is taken out of the stopword set so it survives stopword filtering.
en_stopwords = set(stopwords.words('english'))
en_stopwords.discard('not')

In [115]:
# Inspect the stopword set after 'not' has been removed from it.
print(en_stopwords)

{'why', 'myself', 'my', 'if', 'and', 'our', 'there', 'that', 'before', 'above', 'here', "she's", "you're", "doesn't", "shan't", "weren't", 'each', 'the', 'has', "hasn't", 'any', 'just', 'wouldn', 'o', 'who', 'can', 'mustn', 'at', 'through', 'from', "couldn't", 'same', 'a', 'few', 'haven', 'by', 'these', 'but', 'm', 'too', 'y', 'once', 'about', "should've", 'own', 'had', 'this', 'over', 'all', 'to', "don't", 'up', 'so', "that'll", 'am', 'again', 'off', 'needn', 'very', 'i', 'yourself', 'her', 'herself', 'more', 'doing', 'into', 'have', "shouldn't", 'wasn', 'were', 'then', "haven't", 's', 'because', 'such', 'weren', 'been', 'themselves', 'down', 'being', 'when', 'hers', 'other', 'shouldn', 'ain', 'an', 'hasn', 'itself', 'during', 'which', 'their', 'them', 'hadn', 'will', "won't", 'what', 're', 'as', "didn't", 'didn', 'mightn', 'does', "you'll", 'while', 'out', 'is', 'with', 'be', 'you', "isn't", 'shan', 'or', "hadn't", 'no', 'having', 'll', 'now', 'further', 'below', 'did', 'don', 'was',

In [116]:
def get_clean_text(reviews):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Steps: lowercase the text, replace HTML line-break tags with spaces,
    tokenize with the module-level ``tokenizer``, drop every token found in
    ``en_stopwords``, stem the survivors with ``ps`` and join them with
    single spaces.

    Parameters
    ----------
    reviews : str
        Raw text of a single review; may contain ``<br>``-style tags.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces (an empty string when
        no token survives the stopword filter).
    """
    text = reviews.lower()
    # Strip every break-tag variant (<br>, <br/>, <br />), not just the doubled
    # "<br /><br />": a lone tag would otherwise be tokenized into a "br" token.
    text = re.sub(r"<br\s*/?>", " ", text)
    tokens = tokenizer.tokenize(text)
    kept = [tok for tok in tokens if tok not in en_stopwords]
    return ' '.join(ps.stem(tok) for tok in kept)
    

In [117]:
# Run the cleaning function on the sample review to eyeball its output.
get_clean_text(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

## multinomial

In [118]:
# Toy training set: seven short reviews in x with one binary label each in y.
# NOTE(review): labels look like 1 = positive, 0 = negative — confirm.
x = [
    "This was an awesome,good & awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not good upto the mark",
    "could have better",
    "Surely a Disappointing movie",
]

y = [1, 1, 1, 1, 0, 0, 0]

In [119]:
# Push every training review through the cleaning function, then display the result.
clean_reviews = list(map(get_clean_text, x))
clean_reviews

['awesom good awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad not good upto mark',
 'could better',
 'sure disappoint movi']

## vectorization

In [90]:
from sklearn.feature_extraction.text import CountVectorizer

In [91]:
# Bag-of-words vectorizer counting both unigrams and bigrams (ngram_range=(1,2)).
cv = CountVectorizer(ngram_range=(1,2))

In [92]:
# Learn the vocabulary from the cleaned training reviews and convert the
# resulting count matrix to a dense array (one row per review).
vc = cv.fit_transform(clean_reviews).toarray()
print(vc)

[[0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0
  0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1
  1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0
  0 1 1]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
  0 0 0]]


In [93]:
# Number of distinct n-gram features the vectorizer learned.
len(cv.vocabulary_)

39

In [94]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

In [95]:
# Multinomial Naive Bayes classifier with its default parameters.
mnb = MultinomialNB()

In [96]:
# Train on the n-gram count matrix vc with the labels y.
mnb.fit(vc,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [97]:
# Two unseen reviews, cleaned with the same function as the training data.
# The loop variable is named `review` so it does not shadow the training list `x`.
test_x = [
    "I was happy happy and I loved the acting in the movie",
    "The movie I saw was not good",
]
xt_clean = [get_clean_text(review) for review in test_x]
xt_clean

['happi happi love act movi', 'movi saw not good']

In [98]:
# Vectorize the test reviews with the already-fitted vectorizer
# (transform, not fit_transform) and convert to a dense array.
xt_vec = cv.transform(xt_clean).toarray()

In [99]:
# Sanity check: (number of test reviews, number of features).
xt_vec.shape

(2, 39)

In [100]:
# Predicted class label for each test review.
mnb.predict(xt_vec)

array([1, 0])

In [101]:
# Mean accuracy on (vc, y), the same data the model was fitted on —
# this is training accuracy, not a held-out estimate.
mnb.score(vc,y)

1.0

In [105]:
# Per-class probability estimates for each test review (one row per review).
mnb.predict_proba(xt_vec)

array([[0.07115831, 0.92884169],
       [0.80384651, 0.19615349]])

## bernoulli

In [102]:
# Bernoulli Naive Bayes; binarize=0.0 thresholds the count features at 0,
# so the model sees presence/absence of each n-gram rather than its count.
bnb = BernoulliNB(binarize=0.0)

In [103]:
# Train on the n-gram count matrix vc with the labels y.
bnb.fit(vc,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [104]:
# Predicted class label for each test review.
bnb.predict(xt_vec)

array([1, 0])

In [106]:
# Per-class probability estimates for each test review (one row per review).
bnb.predict_proba(xt_vec)

array([[0.04579296, 0.95420704],
       [0.85047341, 0.14952659]])