# Natural Language Processing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Dataset

In [2]:
# Read the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text rather than parsed.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Clean data

In [3]:
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word set once: it is identical for every review, so
# rebuilding it per word (as a list -> set conversion) is wasted work.
# NOTE(review): the recorded output turns "Crust is not good." into
# 'crust good', i.e. the negation is dropped along with the other stop
# words -- consider keeping "not" if sentiment polarity matters.
english_stopwords = set(stopwords.words('english'))

# One cleaned, stemmed, space-joined string per review, in dataset order.
corpus = []
for text in dataset['Review']:
    # Replace everything that is not an ASCII letter with a space,
    # then lower-case and tokenize on whitespace.
    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    review = ' '.join(stems)
    corpus.append(review)
corpus[0:5]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\e150003\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

## Bag-of-Words Model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep at most the 1500 most frequent terms of the cleaned corpus.
cv = CountVectorizer(max_features=1500)

# Feature matrix: per-review term counts, converted from sparse to a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: the last column of the dataset.
y = dataset.iloc[:, -1].values

## Naive Bayes

In [5]:
# Split data
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fit model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Confusion matrix (rows: true labels, columns: predicted labels)
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm



array([[54, 43],
       [16, 87]], dtype=int64)

## Visualization

In [7]:
from matplotlib.colors import ListedColormap

X_set = X_train
y_set = y_train

# Grid over the first two feature columns, padded by 1 on each side and
# sampled every 0.01.
first_feature_axis = np.arange(
    start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01
)
second_feature_axis = np.arange(
    start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01
)
X1, X2 = np.meshgrid(first_feature_axis, second_feature_axis)

# Display the (scaled) training matrix.
X_set

# NOTE(review): the decision-region plot below is disabled. As written it
# passes only two columns to `classifier.predict`, while the classifier was
# fitted on the full bag-of-words matrix, and its 'Age' / 'Estimated Salary'
# axis labels do not correspond to anything in this dataset.
# plt.contourf(
#     X1, 
#     X2, 
#     classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#     alpha=0.75,
#     cmap=ListedColormap(('red', 'green'))
# )

# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# for i,j in enumerate(np.unique(y_set)):
#     plt.scatter(
#         X_set[y_set==j, 0],
#         X_set[y_set==j, 1],
#         c=ListedColormap(('red', 'green'))(i),
#         label=j
#     )

# plt.title('Naive Bayes (Training set)')
# plt.xlabel('Age')
# plt.ylabel('Estimated Salary')
# plt.legend()
# plt.show()

array([[-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495],
       [-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495],
       [-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495],
       ..., 
       [-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495],
       [-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495],
       [-0.08692914, -0.03537746, -0.03537746, ...,  0.        ,
        -0.07088812, -0.06696495]])