In [None]:
# all imports

import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from scipy import sparse
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
import nltk
#nltk.download()
from nltk.corpus import stopwords

In [None]:
# Create review_stars.tsv with selected attributes of the review JSON file
# (business_id, stars, text), then load it back as a DataFrame.
#
# The input is treated as JSON Lines: every line is parsed on its own with
# json.loads. Both files are opened in a single `with` statement so they are
# closed even when a line fails to parse, and both use an explicit UTF-8
# encoding instead of the platform default (the input is expected to be UTF-8).
# newline='' is the mode the csv module asks for on files handed to csv.writer.

with open('yelp_dataset/yelp_academic_dataset_review.json', encoding='utf-8') as f, \
        open("review_stars.tsv", 'w', encoding='utf-8', newline='') as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'stars', 'text'])
    for line in f:
        row = json.loads(line)
        # Write the text as str: the file object already encodes to UTF-8.
        # Passing bytes to csv.writer under Python 3 would store the literal
        # "b'...'" representation in the TSV instead of the review text.
        sfile.writerow([row['business_id'], row['stars'], row['text']])

review_df = pd.read_csv('review_stars.tsv', delimiter="\t", encoding="utf-8")

In [None]:
# Create business.tsv with selected attributes of the business JSON file
# (business_id, categories, stars, review_count), then load it back as a
# DataFrame.
#
# The input is treated as JSON Lines: every line is parsed on its own with
# json.loads. Both files are opened in a single `with` statement so they are
# closed even when a line fails to parse, and both use an explicit UTF-8
# encoding instead of the platform default (the input is expected to be UTF-8).
# newline='' is the mode the csv module asks for on files handed to csv.writer.

with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f, \
        open("business.tsv", 'w', encoding='utf-8', newline='') as outfile:
    sfile = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    sfile.writerow(['business_id', 'categories', 'stars', 'review_count'])
    for line in f:
        row = json.loads(line)
        # Values are written as-is; no per-field encoding is needed because
        # the output file handles the UTF-8 encoding.
        sfile.writerow([row['business_id'], row['categories'], row['stars'], row['review_count']])

business_df = pd.read_csv('business.tsv', delimiter="\t", encoding="utf-8")

In [None]:
# Group by business id: build one document per business from all of its reviews.
#
# Reviews are joined with a single space. Concatenating them with no separator
# (what summing strings does) glues the last word of one review to the first
# word of the next, which creates tokens that never appeared in any review.
# Missing texts are replaced by '' first so that str.join only ever sees strings.

review_texts = review_df['text'].fillna('').astype(str)
review_agg_df = review_texts.groupby(review_df['business_id']).agg(' '.join)

# For sklearn: a two-column frame (business_id, all_reviews), one row per business.

df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})

In [None]:
# Join the business attributes with the per-business review documents.
# This is an inner join on business_id, so only businesses present in both
# frames are kept.

merge_df = business_df.merge(df_ready_for_sklearn, how='inner', on='business_id')

In [None]:
# Normalization: replace review_count with its z-score (scipy.stats.zscore).
# The statistics are computed over every row of merge_df, and the column is
# overwritten in place.

review_count_z = zscore(merge_df['review_count'])
merge_df['review_count'] = review_count_z

In [None]:
# IT-IDF Vectorizer

vectorizer = sk_text.TfidfVectorizer(stop_words='english', min_df=5)
matrix = vectorizer.fit_transform(merge_df['all_reviews'])
matrix = matrix.toarray()       # Compressed Sparse Row matrix 

In [None]:
# X Data

review_array = np.vstack(merge_df['review_count'])

features_matrix = np.concatenate((matrix,review_array),axis=1)

# Y Data

from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

target_array = label_encoder.fit_transform(merge_df['stars'])

In [None]:
# Splitting the data into training and testing sets: 20% of the rows are held
# out for testing, and random_state fixes the shuffle so the split is repeatable.

x_train, x_test, y_train, y_test = train_test_split(
    features_matrix,
    target_array,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Train/test split stats: print the shape of each of the four arrays.

for shape_label, split_part in (
    ('X training set shape ', x_train),
    ('y training set shape ', y_train),
    ('X test set shape ', x_test),
    ('y test set shape ', y_test),
):
    print(shape_label, split_part.shape)

In [None]:
# Linear Regression

from sklearn.linear_model import LinearRegression

lin_reg_clf = LinearRegression()

# Fit on the training split, then predict the held-out rows.
lin_reg_clf.fit(x_train, y_train)
y_pred = lin_reg_clf.predict(x_test)

# Print each prediction next to its target value, one pair per line.
for predicted, expected in zip(y_pred, y_test):
    print(predicted, expected)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

Log_reg_clf = LogisticRegression()

# Fit on the training split, then predict the held-out rows.
Log_reg_clf.fit(x_train, y_train)
y_pred = Log_reg_clf.predict(x_test)

# Print each prediction next to its target value, one pair per line.
for predicted, expected in zip(y_pred, y_test):
    print(predicted, expected)

In [None]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

mnb_clf = MultinomialNB()

# MultinomialNB only accepts non-negative features and raises ValueError
# otherwise. The last column of the feature matrix is the z-scored
# review_count, which is negative for every business below the mean, so this
# model is trained on the TF-IDF columns only. The column slice works for both
# dense arrays and scipy CSR matrices.
x_train_nb = x_train[:, :-1]
x_test_nb = x_test[:, :-1]

# Fit on the training split, then predict the held-out rows.
mnb_clf.fit(x_train_nb, y_train)
y_pred = mnb_clf.predict(x_test_nb)

# Print each prediction next to its target value, one pair per line.
for predicted, expected in zip(y_pred, y_test):
    print(predicted, expected)

In [None]:
# SVM

from sklearn.svm import SVC

svm_clf = SVC()

# Fit on the training split.
svm_clf.fit(x_train, y_train)

# Predict with svm_clf, the classifier fitted in this cell.
y_pred = svm_clf.predict(x_test)

# Print each prediction next to its target value, one pair per line.
for predicted, expected in zip(y_pred, y_test):
    print(predicted, expected)

In [None]:
# KNN (each test row takes the label of its single nearest training row)

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=1)

# Fit on the training split, then predict the held-out rows.
knn_clf.fit(x_train, y_train)
y_pred = knn_clf.predict(x_test)

# Print each prediction next to its target value, one pair per line.
for predicted, expected in zip(y_pred, y_test):
    print(predicted, expected)