### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Importing dataset




In [2]:
dataset = pd.read_csv('a1_RestaurantReviews_HistoricDump.tsv', delimiter = '\t', quoting = 3)

In [3]:
dataset.shape

(900, 2)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Data cleaning

In [5]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
def remove_url(txt):
    # Creating a sublist of lower case words for each tweet
    txt = txt.lower()
    # Removing URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    txt = url_pattern.sub(r'', txt)
    
    txt = re.sub(r'\@\w+|\#',"", txt)
    
    punctuation = re.compile(r'[!"#$%&\'()*+,-./:;<=>?@\^_`{|}~|0-9]')
    txt = punctuation.sub("",txt)
    
    txt_tokens = word_tokenize(txt)
    # Removing stop words from each tweet list of word
    txt = [ word  for word in txt_tokens if word not in all_stopwords]
    
    #words = ["u", "yr", "nop", "nope", "yes", "yup", "yeah"]
    #txt = [ word  for word in txt if word not in words]
    # Steming
    pst = PorterStemmer()
    txt_stem = [pst.stem(word) for word in txt]
    
    return " ".join(txt_stem)


In [8]:
tweet_list = []

In [9]:
for tweet in dataset['Review'] :
    tweet = remove_url(tweet)
    tweet_list.append(tweet)

In [10]:
corpus = tweet_list 

In [11]:
"""corpus=[]

for i in range(0, 900):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)"""

"corpus=[]\n\nfor i in range(0, 900):\n  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])\n  review = review.lower()\n  review = review.split()\n  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]\n  review = ' '.join(review)\n  corpus.append(review)"

In [12]:
#corpus

### Data transformation

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1420)

In [14]:
X = cv.fit_transform(corpus).toarray()

In [15]:
X_names = cv.get_feature_names()

In [16]:
X = pd.DataFrame(X,columns=X_names)

In [17]:
X.shape

(900, 1420)

In [18]:
y = dataset["Liked"]

In [19]:
#X = cv.fit_transform(corpus).toarray()
#y = dataset.iloc[:, -1].values

In [20]:
# Saving BoW dictionary to later use in prediction
#import pickle 
#bow_path = 'C:/Users/dell/c1_BoW_Sentiment_Model.pkl'
#pickle.dump(cv, open(bow_path, "wb"))

### Dividing dataset into training and test set

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Model fitting (Naive Bayes)

### Training the MultinomialNB Model

In [22]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [23]:
from sklearn.naive_bayes import MultinomialNB
# instantiate the model
clf = MultinomialNB()
# Fit the classifier to the data
# fit it to the training data
clf.fit(X_train,y_train)

MultinomialNB()

In [24]:
# Predict the labels for the training data X
# make predictions on the test data
y_pred = clf.predict(X_test)

In [25]:
# Exporting NB Classifier to later use in prediction
#import joblib
#joblib.dump(classifier, 'C:/Users/dell/c2_Classifier_Sentiment_Model') 

### Model performance

In [26]:
#y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[55 23]
 [18 84]]


0.7722222222222223

In [27]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = 'latin-1')

In [28]:
df.columns = ["target","ids","date","flag","user",'Review']

In [29]:
dataset2 = df.iloc[795000:805000,]

In [30]:
#dataset2 = pd.read_csv('a2_RestaurantReviews_FreshDump.tsv', delimiter = '\t', quoting = 3)
dataset2.head()

Unnamed: 0,target,ids,date,flag,user,Review
795000,0,2327193206,Thu Jun 25 08:02:16 PDT 2009,NO_QUERY,djcampos,Blah 5am still up daang I got deep problems
795001,0,2327193455,Thu Jun 25 08:02:17 PDT 2009,NO_QUERY,RKF,@jenspeedy I would suggest avoiding 360 Living...
795002,0,2327193641,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,AnaHertz,@alexbroun I didn't convince myself I was fat ...
795003,0,2327193806,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,yenafer,"@spotzle @jstarrh check on sunscreen, snacks, ..."
795004,0,2327193864,Thu Jun 25 08:02:19 PDT 2009,NO_QUERY,eppoponotumus,im sitting alone at TTE myself without my two ...


In [31]:
#dataset2 = dataset2.iloc[0:10,]
dataset2.shape

(10000, 6)

In [32]:
tweet_list = []

In [33]:
for tweet in dataset2['Review'] :
    tweet = remove_url(tweet)
    tweet_list.append(tweet)

In [34]:
corpus = tweet_list 

In [35]:
X_fresh = cv.transform(corpus).toarray()
X_fresh.shape

(10000, 1420)

In [36]:
X_fresh_names = cv.get_feature_names()

In [37]:
X_f = pd.DataFrame(X_fresh,columns=X_fresh_names)

In [38]:
y_pred1 = classifier.predict(X_f)
type(y_pred1)

numpy.ndarray

In [39]:
dataset2['predicted_label'] = y_pred1.tolist()
dataset2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset2['predicted_label'] = y_pred1.tolist()


Unnamed: 0,target,ids,date,flag,user,Review,predicted_label
795000,0,2327193206,Thu Jun 25 08:02:16 PDT 2009,NO_QUERY,djcampos,Blah 5am still up daang I got deep problems,0
795001,0,2327193455,Thu Jun 25 08:02:17 PDT 2009,NO_QUERY,RKF,@jenspeedy I would suggest avoiding 360 Living...,0
795002,0,2327193641,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,AnaHertz,@alexbroun I didn't convince myself I was fat ...,0
795003,0,2327193806,Thu Jun 25 08:02:18 PDT 2009,NO_QUERY,yenafer,"@spotzle @jstarrh check on sunscreen, snacks, ...",0
795004,0,2327193864,Thu Jun 25 08:02:19 PDT 2009,NO_QUERY,eppoponotumus,im sitting alone at TTE myself without my two ...,0


In [40]:
a = pd.DataFrame(y_pred1)
type(a)

pandas.core.frame.DataFrame

In [41]:
b = dataset2["target"].replace(4,1)

In [42]:
type(b)

pandas.core.series.Series

In [43]:
cm = confusion_matrix(b, a)
print(cm)

accuracy_score(b,a)

[[3950 1049]
 [3373 1628]]


0.5578