In [1]:
# Set up imports and libraries
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import  TfidfTransformer
from sklearn.pipeline import Pipeline
import string
import pandas as pd
from nltk.corpus import stopwords


import nltk
#nltk.download('punkt')

In [2]:
#import the review data set
import numpy as np
import pandas as pd
yelp_reviews = []
r_dtypes = {"stars": np.float16, 
            "useful": np.int32, 
            "funny": np.int32,
            "cool": np.int32,
           }
with open("yelp_academic_dataset_review.json", "r", encoding='utf8') as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['review_id', 'user_id'])\
                             .query("`date` >= '2022-01-01'")
        yelp_reviews.append(reduced_chunk)
    
yelp_reviews = pd.concat(yelp_reviews, ignore_index=True)
#yelp_reviews.describe #80k rows for 12/1/21; 31665  for 2022-01-01

In [None]:
# import business dataset
import pandas as pd
yelp_busn = []
r_dtypes = {"stars": np.float16, 
            "latitude": np.int32, 
            "longitude": np.int32,
           }
with open("yelp_academic_dataset_business.json", "r", encoding='utf8') as f:
    reader = pd.read_json(f, orient="records", lines=True, 
                          dtype=r_dtypes, chunksize=1000)
        
    for chunk in reader:
        reduced_chunk = chunk.drop(columns=['is_open', 'longitude','latitude','hours'])\
                             .query("`city` >= 'philadelphia'")
        yelp_busn.append(reduced_chunk)
    
yelp_busn = pd.concat(yelp_busn, ignore_index=True)
#yelp_busn.describe

In [3]:
# append text length of review
yelp_reviews.describe
yelp_reviews['text length'] = yelp_reviews['text'].apply(len)
yelp_reviews.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date,text length
0,drTZrkbpSoJgwKETlFbc3w,1.0,0,0,0,I bought a Fender 1966 Telecaster that the sal...,2022-01-01 15:47:07,641
1,jyxHti29yWdYR00Itt1A2w,5.0,0,0,0,This is our go to for take out when I visit my...,2022-01-02 03:49:01,208
2,Jo4ei-c-5H53IxZxAVf1jQ,5.0,0,0,0,Danielle did a great job! She listened and cu...,2022-01-03 03:17:03,278
3,YT5CjacTllBtvMaMJS3IbA,1.0,0,0,0,We saw a lot of roaches in the bathroom when w...,2022-01-05 15:55:59,514
4,9MHe5jAym2d8VhT_NbCRyw,2.0,0,0,0,We Ordered pork fried rice and beef chow mei ...,2022-01-06 03:59:21,148


In [4]:
# Data Pre Processing
# 1. Remove Stop Words
# 2. Stem
# 3. Tokenize
# 4. Counts
# 5. Replace smileies?

#does not filter to restaurants - need business dataset

#createa dataset without 3 stars since those are neutral
yelp_data = yelp_reviews[(yelp_reviews['stars'] > 3) | (yelp_reviews['stars'] < 3)]
X = pd.DataFrame(yelp_data['text'])
Y = pd.DataFrame(yelp_data['stars'])

In [5]:
#1 Remove stop words - Done
def process_text(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
X['text'] = X['text'].apply(process_text)

In [6]:
# 2 Stem - - for X instead of test set (takes long time)
from nltk.stem.snowball import SnowballStemmer
snowBallStemmer = SnowballStemmer("english")
X['stemmed'] = X['text'].apply(lambda x: [snowBallStemmer.stem(y) for y in x]) # Stem every word.
X = X.drop(columns=['text']) # Get rid of the unstemmed column.

In [7]:
# 3. Tokenize data to turn words into integers for modeling
cv = CountVectorizer() #stop_words=my_stop_words, ngram_range=(2,2
X['stemmed']=[" ".join(review) for review in X['stemmed'].values] #convert from list to text
X_cv = cv.fit_transform(X['stemmed'])

In [None]:
bigram_df = pd.DataFrame(X_cv.toarray(), columns=cv.get_feature_names_out())
bigram_frequency = pd.DataFrame(bigram_df.sum(axis=0)).reset_index()
bigram_frequency.columns = ['bigram', 'frequency']
bigram_frequency = bigram_frequency.sort_values(by='frequency', ascending=False).head(10)
plt.bar(bigram_frequency['bigram'],bigram_frequency['frequency'])

In [8]:
#X_cv
print('Shape of Sparse Matrix: ', X_cv.shape)
print('Amount of Non-Zero occurrences: ', X_cv.nnz)
# Percentage of non-zero values
density = (100.0 * X_cv.nnz / (X_cv.shape[0] * X_cv.shape[1]))
print("Density: {}".format((density)))

#graphics
#wordcloud failed
#reviews over time
#dist of star ratings
#length fo text for ratings
#avg monthly rtating
#Xhead('text'.most_common(20),columns=['Word', 'Frequency'])

Shape of Sparse Matrix:  (29482, 33942)
Amount of Non-Zero occurrences:  1162659
Density: 0.11618712002039289


In [10]:
# set up nb model and run predictions
X_train, X_test, y_train, y_test = train_test_split(X_cv, Y,test_size=0.3,random_state=101)
nb = MultinomialNB()
nb.fit(X_train,y_train)

predictions = nb.predict(X_test)
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[1649   90  100  116]
 [ 268   44  172   96]
 [  78    5  453  755]
 [ 334    5  322 4358]]


              precision    recall  f1-score   support

         1.0       0.71      0.84      0.77      1955
         2.0       0.31      0.08      0.12       580
         4.0       0.43      0.35      0.39      1291
         5.0       0.82      0.87      0.84      5019

    accuracy                           0.74      8845
   macro avg       0.57      0.53      0.53      8845
weighted avg       0.70      0.74      0.71      8845



  y = column_or_1d(y, warn=True)


In [11]:
#nltk.download('vader_lexicon')
# Load SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Instantiate new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores
sentiment_scores = X['stemmed'].apply(sid.polarity_scores)
sentiment = sentiment_scores.apply(lambda x: x['compound']) #get compound score
sentiment
#could graph over time

0       -0.0516
1        0.6369
2        0.8625
3        0.7269
4        0.5106
          ...  
31660    0.0083
31661    0.8750
31662    0.9300
31663    0.7003
31664    0.9978
Name: stemmed, Length: 29482, dtype: float64