## Reading required packages:
#### sklearn: package which has functions used to perform machine learning 
#### nltk: package used to perform natural language processing
#### pandas: package used to read and write files

In [191]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import confusion_matrix

### Copying stop words into stop object

In [192]:
#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

### Defining a stemmer object to extract root words

In [205]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [206]:
import os
os.getcwd()

'C:\\Users\\dhruva.gupta\\Downloads'

In [243]:
# read yelp.csv into a DataFrame

yelp = pd.read_csv("yelp.csv",encoding = "ISO-8859-1")
print(yelp.shape)
yelp.head()


#X.apply(lambda x: [item for item in x.split() if item not in stop])


(10000, 11)
(4086, 11)
My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.

Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I've ever had.  I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.

While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I've ever had.

Anyway, I can't wait to go back!


In [None]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
print(yelp_best_worst.shape)
# define X and y
X = yelp_best_worst.text
print(X[0])

# Stop words:
Some examples of stop words are: "a", "and", "but", "how", "or", and "what." While the majority of all Internet search engines utilize stop words, they do not prevent a user from using them, but they are ignored.

### Stemming:
A stemmer for English, for example, should identify the string "cats" (and possibly "catlike", "catty" etc.) as based on the root "cat", and "stems", "stemmer", "stemming", "stemmed" as based on "stem". A stemming algorithm reduces the words "fishing", "fished", and "fisher" to the root word, "fish"

In [244]:
X =X.apply(lambda x: ' '.join([item for item in x.split() if item not in stop])) #removing stopwords

In [245]:
X[0] #after removing stopwords

'My wife took birthday breakfast excellent. The weather perfect made sitting outside overlooking grounds absolute pleasure. Our waitress excellent food arrived quickly semi-busy Saturday morning. It looked like place fills pretty quickly earlier get better. Do favor get Bloody Mary. It phenomenal simply best I\'ve ever had. I\'m pretty sure use ingredients garden blend fresh order it. It amazing. While EVERYTHING menu looks excellent, I white truffle scrambled eggs vegetable skillet tasty delicious. It came 2 pieces griddled bread amazing absolutely made meal complete. It best "toast" I\'ve ever had. Anyway, I can\'t wait go back!'

In [246]:
def stem1(text1):
    return ps.stem(text1)

X=X.apply(stem1)

y = yelp_best_worst.stars

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [247]:
X[0]

'my wife took birthday breakfast excellent. the weather perfect made sitting outside overlooking grounds absolute pleasure. our waitress excellent food arrived quickly semi-busy saturday morning. it looked like place fills pretty quickly earlier get better. do favor get bloody mary. it phenomenal simply best i\'ve ever had. i\'m pretty sure use ingredients garden blend fresh order it. it amazing. while everything menu looks excellent, i white truffle scrambled eggs vegetable skillet tasty delicious. it came 2 pieces griddled bread amazing absolutely made meal complete. it best "toast" i\'ve ever had. anyway, i can\'t wait go back!'

In [248]:
cveg = CountVectorizer()

x_example = cveg.fit_transform(["how are you", "We are are fine", "are we fine", "yes we are fine","who is she where is the car"])
x_example.toarray()

array([[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 2, 1, 1, 0, 1, 1, 0, 0]], dtype=int64)

In [249]:
cveg.get_feature_names()

['are',
 'car',
 'fine',
 'how',
 'is',
 'she',
 'the',
 'we',
 'where',
 'who',
 'yes',
 'you']

### Vectorizer transformation example for one hot encoding

|are | fine | how | we | yes | you |        
| :- |: | :| :|: | : 
|1| 0  | 1 | 0 | 0 | 1 
|2| 1  | 0 | 1 | 0 | 0 
|1| 1  | 0 | 1 | 0 | 0 
|1| 1  | 0 | 1 | 1 | 0 


In [250]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [251]:
X_train_dtm.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### Apply logistic regression for classification

In [252]:
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))

0.9246575342465754


In [253]:
y_pred_class.shape

(1022,)

In [254]:
import numpy as np
ii = np.where(y_pred_class == 5)[0]

In [255]:
ii[9]

12

### Print confusion matrix

In [256]:
confusion_matrix(y_test, y_pred_class)

array([[150,  34],
       [ 43, 795]], dtype=int64)

In [257]:
cm= confusion_matrix(y_test,y_pred_class )



In [258]:
print(cm)

[[150  34]
 [ 43 795]]
