# Part 7 - Natural language processing

- Deals with extracting information from text

In [10]:
# import the libraries that will be used
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [2]:
# import dataset
dir1 = '/disk1/sousae/Classes/udemy_machineLearning/Machine_Learning_A-Z/Part7_Natural_Language_Processing/'
dataset = pd.read_csv(dir1+'Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
help(re.sub)

Help on function sub in module re:

sub(pattern, repl, string, count=0, flags=0)
    Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the match object and must return
    a replacement string to be used.



In [4]:
# Cleaning the texts
corpus = []

for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # only keep the letters, remove numbers and punctuation
    review = review.lower() # lower case
    review = review.split() # split the string into a list of strings
    ps = PorterStemmer() # all the words that have the same stem/root
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
print(corpus)

[u'wow love place', 'crust good', u'tasti textur nasti', u'stop late may bank holiday rick steve recommend love', u'select menu great price', u'get angri want damn pho', u'honeslti tast fresh', u'potato like rubber could tell made ahead time kept warmer', u'fri great', 'great touch', u'servic prompt', 'would go back', u'cashier care ever say still end wayyy overpr', u'tri cape cod ravoli chicken cranberri mmmm', u'disgust pretti sure human hair', u'shock sign indic cash', u'highli recommend', u'waitress littl slow servic', u'place worth time let alon vega', 'like', u'burritto blah', u'food amaz', u'servic also cute', u'could care less interior beauti', u'perform', 'right red velvet cake ohhh stuff good', u'never brought salad ask', u'hole wall great mexican street taco friendli staff', u'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', u'also combo like burger fri beer decent deal', 'like final blow', u'found place accid cou

In [8]:
# Creating a Bag of Words model
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(sklearn.base.BaseEstimator, VectorizerMixin)
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetc

In [15]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [17]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

In [18]:
print(cm)

[[55 42]
 [12 91]]
