In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords



## 1. About the Dataset

### 1.1 Load the Dataset 

In [4]:
# Load the Yelp review data; lines=True reads the file as one JSON object per line.
df = pd.read_json('yelp_training_set_review.json', lines=True)
df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


### 1.2 Dataset Glossary

**votes**: This column represents the aggregated counts of votes from users on the usefulness, humor, or coolness of a review.

**user_id**: This is a unique identifier for each user who provides reviews on Yelp.

**review_id**: This is a unique identifier for each review.

**stars**: Ratings range from 1 to 5 stars, with 1 being the lowest rating and 5 being the highest. Users assign stars to indicate their overall satisfaction with the business or service being reviewed.

**date**: This column records the date on which the user submitted the review on Yelp.

**text**: This column contains the comments or written reviews provided by users.

**type**: This column indicates the type of content, which is a review in this case.

**business_id**: This is a unique identifier for each business. Each review is associated with a specific business identified by this ID.

## 2. Data Preprocessing

In [5]:
# Check the number of rows and columns in the dataset.

df.shape

(229907, 8)

There are 229907 rows and 8 columns in this dataset.

In [6]:
# List the names of all columns.
df.columns

Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
       'business_id'],
      dtype='object')

In [7]:
# Check each column's dtype and non-null count to spot any missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 229907 entries, 0 to 229906
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   votes        229907 non-null  object        
 1   user_id      229907 non-null  object        
 2   review_id    229907 non-null  object        
 3   stars        229907 non-null  int64         
 4   date         229907 non-null  datetime64[ns]
 5   text         229907 non-null  object        
 6   type         229907 non-null  object        
 7   business_id  229907 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 15.8+ MB


There are no null values in this dataset.

In [8]:
# Normalize the dict-valued 'votes' column into separate columns, one per dict key.
votes_normalize = pd.json_normalize(df['votes'])
votes_normalize

Unnamed: 0,funny,useful,cool
0,0,5,2
1,0,0,0
2,0,1,0
3,0,2,1
4,0,0,0
...,...,...,...
229902,0,0,0
229903,0,2,0
229904,0,0,0
229905,1,2,0


In [9]:
# Replace the original dict-valued 'votes' column with the normalized
# vote columns: append them side by side, then drop the original.
df_normalized = (
    pd.concat([df, votes_normalize], axis=1)
    .drop(columns='votes')
)
df_normalized

Unnamed: 0,user_id,review_id,stars,date,text,type,business_id,funny,useful,cool
0,rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,0,5,2
1,0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,0,0,0
2,0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,0,1,0
3,uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,0,2,1
4,vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,0,0,0
...,...,...,...,...,...,...,...,...,...,...
229902,6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,0,0,0
229903,dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,0,2,0
229904,M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,0,0,0
229905,jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,1,2,0


In [10]:
# Optional (disabled): uncomment to shrink the data to a random 10-row sample, e.g. for quick test runs.
# df_normalized = df_normalized.sample(10)

In [60]:
import string

In [61]:
# Map every ASCII punctuation character to a single space (used to build a str.translate table).
punct_to_space = dict.fromkeys(string.punctuation, ' ')

In [62]:
# Text-cleaning helpers.

# Lazily-built cache of the NLTK English stopword list. The original code called
# stopwords.words('english') once per token, re-evaluating the list for every
# word of every review; it is now loaded once and reused.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(text):
    """Clean a piece of text and count the tokens that remain.

    Processing steps, in order:
      1. replace each punctuation character with a space (using the
         module-level ``punct_to_space`` mapping) and lowercase the text;
      2. delete every digit character;
      3. split on whitespace and drop NLTK English stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    tuple
        ``(n_tokens, cleaned_text)`` where ``cleaned_text`` is the remaining
        tokens joined by single spaces and ``n_tokens`` is how many there are.
    """
    # Punctuation becomes a space (not deleted) so "good,bad" splits into two tokens.
    no_punct_text = text.translate(str.maketrans(punct_to_space)).lower()
    # Digits are removed without inserting a space, so "2nd" becomes "nd".
    no_digit_text = ''.join(ch for ch in no_punct_text if not ch.isdigit())
    stop_words = _english_stopwords()
    cleaned_tokens = [tok for tok in no_digit_text.split() if tok not in stop_words]

    return len(cleaned_tokens), ' '.join(cleaned_tokens)

In [63]:
# Quick sanity check of text_process on a small example.
# The variable is named sample_text (not `string`) so it does not shadow the
# imported `string` module that the punct_to_space cell relies on.
sample_text = 'hi, how 2 djkfjd '
text_process(sample_text)

(2, 'hi djkfjd')

In [64]:
# Apply the text_process function to each row's 'text'; result_type='expand'
# spreads the returned (length, cleaned_text) pair across two columns.
df_cleaned = df_normalized.apply(
    lambda row: text_process(row['text']),
    axis=1,
    result_type='expand',
)

In [65]:
# Name the two columns returned by text_process: token count and cleaned text.
df_cleaned.columns = ['text_length','text_transformed'] 

In [48]:
# Preview the first rows of the cleaned text features.
df_cleaned.head()

Unnamed: 0,text_length,text_transformed
168634,41,nordstrom cafe awesome else get healthy presen...
204166,138,ok last time lot fun nothing free sincerely en...
180702,21,must back chicago clark lake enjoying deliciou...
182441,37,good special diets vegan gluten free luck pizz...
225926,19,hung pool cabana last weekend day lovely degre...


In [66]:
# Keep only the lowest (1-star) and highest (5-star) reviews for binary classification.
df_normalized = df_normalized.query('stars == 1 or stars == 5')

# join='inner' is required: df_cleaned still holds every review, and the default
# outer join would bring all 2-4 star rows back with NaN in 'stars' (and turn
# the column into floats), so the exported file would not be limited to the
# two classes.
df_transformed = pd.concat((df_normalized, df_cleaned), axis=1, join='inner')[['stars','text_transformed','text_length']]

# Save the label, cleaned text and token count for the modelling step.
df_transformed.to_csv('yelp_training_set_review_transformed.csv', encoding='utf-8', index=False)