# Supervised learning - Python - natural language processing

In [1]:
import pandas as pd

# quoting=3 (csv.QUOTE_NONE): double quotes inside the reviews are kept as plain text
dataset = pd.read_csv('data/restaurant_reviews.tsv', delimiter='\t', quoting=3)
# shuffle the rows; the fixed seed keeps the row order identical on every run,
# so everything derived from it later in the notebook is reproducible
dataset = dataset.sample(frac=1, random_state=0).reset_index(drop=True)
dataset.head()

Unnamed: 0,Review,Liked
0,I had a pretty satifying experience.,1
1,The steaks are all well trimmed and also perfe...,1
2,I will never go back to this place and will ne...,0
3,"The food is about on par with Denny's, which i...",0
4,I think this restaurant suffers from not tryin...,0


In [2]:
# Cleaning the texts
import re
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
from nltk.corpus import stopwords

# corpus is a collection of texts of the same type (common name in NLP);
# here it holds one cleaned string per review, in the same order as the dataset rows

# the stop-word set and the stemmer do not change between reviews,
# so build them once instead of once per word / per review
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

# iterate over every review present instead of assuming a fixed number of rows
for raw_review in dataset['Review']:
    # keep only the letters from the review, change everything to lower case
    # and split all words on whitespace
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()

    # the goal is to not have a sparse matrix too big (one word = one column)
    # remove useless words like "the", "this", ... -> everything not relevant in text
    # also apply stemming -> keep only the root of the words (loving, loved, will love -> love)
    review = [ps.stem(word) for word in words if word not in stop_words]

    # recreate the review as a single line and append it to the list of reviews
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gillouche/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Creating the Bag of Words model
# take unique words and create one column for each word, one row = one review
# this creates a matrix with 1000 rows (one per review); each column holds the count of one word in that review
# we want to reduce the sparsity of the matrix as much as possible for ML
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer can take care of stop words, lower case, ... but it is better to keep control over that
# max_features=1500 keeps only the 1500 most frequent words of the corpus as columns
cv = CountVectorizer(max_features=1500)
# one row per review, one column per kept word; converted to a dense array
X = cv.fit_transform(corpus).toarray()
# select the label by column name (like 'Review' in the cleaning step) rather than by position,
# so an extra or reordered column cannot silently change the target
y = dataset['Liked'].values

In [4]:
# peek at the first five rows of the bag-of-words matrix
X[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# dimensions of the bag-of-words matrix: (number of reviews, number of kept words)
X.shape

(1000, 1500)

In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Fitting Naive Bayes to the Training set
# other classification models should also be tried for comparison, e.g.:
# - CART
# - C5.0
# - Maximum entropy
from sklearn.naive_bayes import GaussianNB

# fit() hands back the estimator itself, so construction and training can be chained;
# the bare name on the last line displays the fitted estimator
classifier = GaussianNB().fit(X_train, y_train)
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
# Predicting the Test set results
# the fitted classifier returns one predicted label per row of X_test
y_pred = classifier.predict(X_test)
# display the predictions
y_pred

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1])

In [9]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

# compare the true test labels with the predictions;
# per scikit-learn's convention rows are the actual classes and columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
# display the matrix
cm

array([[45, 35],
       [23, 97]])