In [47]:
import json
import csv
import string
import pandas as pd
import sklearn.feature_extraction.text as sk_text
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
#Open business.json file, create tsv file with business_id, business name, categories, and review count to be used as features 
#and stars as label

outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count'])
with open('yelp_academic_dataset_business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['categories'], row['stars'],row['review_count'] ])

outfile.close()

business_df= pd.read_csv('business.tsv', delimiter ="\t", nrows=10000)

In [9]:
#Open review.json file, create tsv file with business_id, text to be used as features 
#and stars as label

outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

outfile.close()

review_df= pd.read_csv('review_stars.tsv', delimiter ="\t", nrows=10000)

In [13]:
# Group all reviews by business_id
review_agg_df = review_df.groupby('business_id')['text'].sum()


In [14]:
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})

In [15]:
#Merge the resulting review aggregate dataframe with business dataframe
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')

In [19]:
merge_df

Unnamed: 0,business_id,categories,stars,review_count,all_reviews
0,DR30lzIHVTF6xhyMI-3IlQ,"Thrift Stores, Shopping, Used, Vintage & Consi...",3.5,17,"b""If this place was on on fire i wouldn't reac..."
1,YIez_A3WOt9J2SXN7OMa2Q,"Caribbean, Food, Bakeries, Restaurants",4.0,105,b'Love the jerk chicken sandwich and jerk chic...
2,Gc8R7b3I3CTwAiWv7MjtSg,"Body Shops, Auto Repair, Automotive",4.5,24,b'My experience was excellent. They expedited ...
3,pIzuXtFdkj8fHuzJfYiwqw,"Restaurants, Event Planning & Services, Italia...",4.5,3,"b""I'm visiting Calgary from Toronto for a few ..."
4,5T6kFKFycym_GkhgOiysIw,"Poutineries, Restaurants, Diners",4.0,1565,"b'This place is amazing. I mean, you really ca..."
5,OyJDaAAMr220qkZsovCARQ,"Food, Coffee & Tea",3.0,49,"b""My favorite Starbucks. Extremely friendly st..."
6,YkAIlxYZ1guSqbbowU9X4g,"Restaurants, Chinese, Dim Sum, Breakfast & Brunch",3.5,171,b'Came here for a lovely dinner with husband ...
7,ZQ-7uFQk21NHoOzJfhEjBw,"Coffee & Tea, Food",3.0,59,"b""Wish I had positive things to say. I ordered..."
8,2ktKjN5z8EcqmUv6EDiDgA,"Fashion, Department Stores, Automotive, Shoppi...",3.5,121,b'Got $1000 worth of tires today. They told me...
9,ohYgabP6PqkNsF0vnZUxeg,"Arts & Entertainment, Coffee & Tea, Bars, Food...",4.5,149,"b""Just. Yes.\n\nCoffee: 9.2/10\nBeer: 10/10 \n..."


In [20]:
print(list(merge_df.columns))

['business_id', 'categories', 'stars', 'review_count', 'all_reviews']


In [62]:
#convert into lower case all words in category
merge_df['categories'] = merge_df['categories'].str.lower()

In [57]:
#Remove punctuations from categories
merge_df['categories'] = merge_df['categories'].str.replace('[^\w\s]','')

In [59]:
#Remove stop words
stop = stopwords.words('english')
merge_df['categories'] = merge_df['categories'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


In [60]:
# one hot coding of categories
pd.get_dummies(merge_df['categories'])


Unnamed: 0,acai bowls food ice cream frozen yogurt restaurants glutenfree juice bars smoothies,active life gyms martial arts trainers fitness instruction,active life trainers fitness instruction gyms,acupuncture health medical massage therapy massage beauty spas skin care,american new breakfast brunch food sandwiches event planning services party event planning coffee tea restaurants caterers,american new caterers southern restaurants barbeque event planning services,american new restaurants canadian new breakfast brunch cafes,american new sandwiches bars nightlife restaurants lounges,american traditional breakfast brunch restaurants diners,american traditional chicken shop comfort food soul food chicken wings burgers canadian new fast food waffles restaurants breakfast brunch,...,translation services nightlife lounges bars hotels airports event planning services airport lounges travel services hotels travel professional services,transportation hotels travel public transportation,turkish restaurants middle eastern,venues event spaces hotels travel event planning services hotels,venues event spaces restaurants vegetarian breakfast brunch american new comfort food canadian new event planning services,veterinarians pets,veterinarians pets pet services,vietnamese restaurants chinese,water delivery local services ice delivery,wine bars nightlife bars cocktail bars seafood restaurants
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
#convert into lower case all words in category
merge_df['all_reviews'] = merge_df['all_reviews'].str.lower()

In [63]:
#Remove punctuations from reviews
merge_df['all_reviews'] = merge_df['all_reviews'].str.replace('[^\w\s]','')

In [65]:
#Remove stop words
stop = stopwords.words('english')
merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

0      bIf place fire wouldnt reach phone call help T...
1      bLove jerk chicken sandwich jerk chicken dinne...
2      bMy experience excellent They expedited repair...
3      bIm visiting Calgary Toronto days discovered C...
4      bThis place amazing I mean really cant go wron...
5      bMy favorite Starbucks Extremely friendly staf...
6      bCame lovely dinner husband weeks ago great di...
7      bWish I positive things say I ordered mocha la...
8      bGot 1000 worth tires today They told would lo...
9      bJust YesnnCoffee 9210nBeer 1010 nWine 1210 te...
10     bThis great place taste authentic Japanese Iza...
11     bI given store many chances I live close conve...
12     bWe locals decided try Jayde since love M Hote...
13     bA Squirrel Hill destination easy access Forbe...
14     bThis place rocks A quaint little diner makes ...
15     bReally good place go bunch friends Its sort b...
16     bA less pretentious place find Valley Good ass...
17     bI choice place Yours Tr

[4.41303184 5.44265126 6.35894199 ... 5.66579481 6.35894199 6.35894199]


(1, 6573)
[[0. 0. 0. ... 0. 0. 0.]]


In [73]:
#Common words display to decide for removal
freq = pd.Series(' '.join(merge_df['all_reviews']).split()).value_counts()[:10]
freq

it      529
that    426
with    394
my      361
but     352
on      349
they    347
you     326
The     324
have    311
dtype: int64

In [74]:
#remove common words
freq = list(freq.index)
merge_df['all_reviews'] = merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
merge_df['all_reviews'].head()

0    bIf this place fire i wouldnt reach phone call...
1    bLove jerk chicken sandwich jerk chicken dinne...
2    bMy experience excellent They expedited repair...
3    bIm visiting Calgary from Toronto few days dis...
4    bThis place amazing mean really cant go wrong ...
Name: all_reviews, dtype: object

In [75]:
#Rare words display to decide for removal
freq = pd.Series(' '.join(merge_df['all_reviews']).split()).value_counts()[-10:]
freq

cab             1
YIKES           1
carpetthe       1
nnConvenient    1
bc              1
liver           1
blood           1
Additionally    1
Laura           1
paired          1
dtype: int64

In [76]:
# Rare words removal

freq = list(freq.index)
merge_df['all_reviews'] = merge_df['all_reviews'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
merge_df['all_reviews'].head()

0    bIf this place fire i wouldnt reach phone call...
1    bLove jerk chicken sandwich jerk chicken dinne...
2    bMy experience excellent They expedited repair...
3    bIm visiting Calgary from Toronto few days dis...
4    bThis place amazing mean really cant go wrong ...
Name: all_reviews, dtype: object

In [77]:
#TF-IDF calculation
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

#Learn vocabulary and idf, return term-document matrix.
train_vect = tfidf.fit_transform(merge_df['all_reviews'])



In [82]:
print(train_vect)

  (0, 70)	0.26393457857224006
  (0, 642)	0.10449764255446632
  (0, 986)	0.24057180521630242
  (0, 632)	0.21232136937175883
  (0, 393)	0.2180705393509186
  (0, 11)	0.25094432353822355
  (0, 807)	0.24549831720156845
  (0, 624)	0.15832961524590627
  (0, 604)	0.21511606364413244
  (0, 983)	0.27190251204235855
  (0, 268)	0.25094432353822355
  (0, 817)	0.1413494854584172
  (0, 385)	0.2570324406524025
  (0, 241)	0.14717980027821803
  (0, 947)	0.1751504559924734
  (0, 854)	0.25094432353822355
  (0, 943)	0.2212041807583114
  (0, 243)	0.2180705393509186
  (0, 515)	0.17384179741338626
  (0, 300)	0.18871186880334231
  (0, 481)	0.12596508890082286
  (0, 239)	0.24057180521630242
  (1, 78)	0.3313571069727136
  (1, 139)	0.6752615169350954
  (1, 746)	0.26655921737914556
  :	:
  (422, 267)	0.09751291922970463
  (422, 248)	0.10114327891297929
  (422, 651)	0.10806380162137155
  (422, 409)	0.1741820167800166
  (422, 815)	0.08511404220818888
  (422, 611)	0.23655550514106427
  (422, 722)	0.09168310211765054


In [None]:
#TF-IDF calculation

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(merge_df['all_reviews'])

# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

In [None]:
# encode document
vector = vectorizer.transform([merge_df['all_reviews'][0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())