In [None]:
!pip install -q kaggle
from google.colab import files
import json
import pandas as pd
import gc
import matplotlib.pyplot as plt
from IPython.display import HTML, display
import time
!pip install transformers
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
!pip install datasets
from datasets import load_metric
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import re
import tensorflow as tf
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense

# Kaggle API credentials.
# Replace both "_" placeholders with a real username and API key so the
# download cell below can authenticate automatically.
kaggle_credentials = {"username": "_", "key": "_"}
json_object = json.dumps(kaggle_credentials, indent=4)

# Persist the serialized credentials as kaggle.json in the working directory.
with open("kaggle.json", "w") as credentials_file:
    credentials_file.write(json_object)

In [None]:
# -p keeps the cell re-runnable: plain mkdir errors once the directory exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d yelp-dataset/yelp-dataset
! mkdir -p train
! unzip -o yelp-dataset.zip -d train
! rm yelp-dataset.zip

In [None]:
# Upper bound of the progress bar. progress() reads this module-level name at
# call time, so the loading cells reassign it before they start.
# NOTE(review): this shadows the builtin max(); renaming it means touching
# every cell that assigns it.
max = 0
def progress(value):
    """Build an HTML <progress> element showing `value` out of the global `max`.

    Args:
        value: current position of the bar; also used as the fallback text
            inside the element.

    Returns:
        An IPython ``HTML`` object that can be passed to ``display`` or to a
        display handle's ``update``.
    """
    # The original markup had a stray comma after the max attribute, which is
    # not valid HTML attribute syntax.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 90%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [None]:
# Read in and process data
# `max` is the progress bar's upper bound (progress() reads it as a global);
# presumably the number of rows in the review file -- TODO confirm.
max = 6290515
chunk = 100000
chunks = pd.read_json('train/yelp_academic_dataset_review.json', lines=True, chunksize=chunk)

out = display(progress(0), display_id=True)
i = chunk

# Collect the chunks and concatenate once at the end. Concatenating inside the
# loop copies everything read so far on every iteration (quadratic work).
review_parts = []
for c in chunks:
    review_parts.append(c)
    out.update(progress(i))
    i += chunk

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(review_parts) if review_parts else pd.DataFrame()
del review_parts

In [None]:
# Print the maximum and then the mean vote count for each vote type,
# in the order funny, useful, cool.
for voteType in ("funny", "useful", "cool"):
  voteCounts = df[voteType]
  print(voteCounts.max())
  print(voteCounts.mean())

def setHistLabels(title):
  """Label the current matplotlib axes for a vote-count histogram.

  Args:
    title: prefix for the plot title; ' Score Distribution' is appended.
  """
  plt.ylabel('Number of Reviews')
  plt.xlabel('Number of Votes')
  plt.title('{} Score Distribution'.format(title))

# Plot every vote distribution twice: first with the zero-vote bucket, then
# without it so the non-zero buckets are readable. Each df.hist call opens a
# new figure, which setHistLabels then labels.
voteColumns = ["funny", "useful", "cool"]
binSets = [
    (list(range(0, 11)), ""),             # edges 0..10
    (list(range(1, 11)), " without 0"),   # edges 1..10
]
for bins, titleSuffix in binSets:
  for voteColumn in voteColumns:
    df.hist(bins=bins, column=[voteColumn])
    setHistLabels(voteColumn.capitalize() + titleSuffix)

In [None]:
def findCorrelations(column, colTitle):
  """Print the correlation of `column` with each review vote column of `df`.

  Args:
    column: a pandas Series; its ``corr`` method is called against the
      funny, useful and cool columns of the global `df`.
    colTitle: human-readable name of `column` used in the printed lines.
  """
  # The last line carries an extra "\n" argument so that a blank gap follows
  # each group of three results.
  report_plan = (
      ("Funny", 'funny', ()),
      ("Useful", 'useful', ()),
      ("Cool", 'cool', ("\n",)),
  )
  for label, vote_column, trailer in report_plan:
    print("Correlation between " + colTitle + " and " + label + " scores",
          column.corr(df[vote_column]), *trailer)

def getAverageWordLength(n):
  """Return the mean length of the whitespace-separated words in `n`.

  Args:
    n: the text to measure (a string).

  Returns:
    The average word length as a float, or NaN when `n` contains no words
    (empty or whitespace-only text). NaN is used rather than 0 so that pandas
    leaves such rows out of correlation computations instead of treating them
    as real measurements.
  """
  words = n.split()
  if not words:
    # Previously this divided by zero.
    return float("nan")
  return sum(len(word) for word in words) / len(words)

findCorrelations(df["stars"], "Star Rating")
findCorrelations(df["text"].apply(lambda n: len(n.split())), "Word Count")
findCorrelations(df["text"].apply(getAverageWordLength), "Average Word Length")
# Correlate numeric columns only: df also holds string columns (e.g. text and
# the id columns), and DataFrame.corr raises on those in pandas >= 2.0.
df.select_dtypes(include="number").corr()

In [None]:
import seaborn as sns

plt.figure(figsize=(16, 6))
# Correlate numeric columns only: DataFrame.corr raises on the string columns
# of df in pandas >= 2.0.
heatmap = sns.heatmap(df.select_dtypes(include="number").corr(), vmin=-1, vmax=1, annot=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
def getCategoryResults(df, useful, funny, cool):
    """Select the rows of `df` that match a useful/funny/cool vote pattern.

    For each vote column, a truthy flag keeps rows with MORE THAN ONE vote and
    a falsy flag keeps rows with exactly zero votes. Rows with exactly one
    vote in any of the three columns therefore never match.

    Args:
        df: DataFrame with 'useful', 'funny' and 'cool' columns.
        useful, funny, cool: flags choosing the condition for each column.

    Returns:
        The rows of `df` that satisfy all three conditions.
    """
    def _vote_condition(column_name, wanted):
        votes = df[column_name]
        if wanted:
            return votes > 1
        return votes == 0

    keep = _vote_condition('useful', useful)
    keep = keep & _vote_condition('funny', funny)
    keep = keep & _vote_condition('cool', cool)
    return df.loc[keep, :]

In [None]:
# Need a dataframe of a sample size with text, star rating, user stats
df.drop(columns=['business_id', 'date'], inplace=True)
gc.collect()

# Min number of rows out of these categories
dividedNum = 1000
# Fixed seed so a fresh run draws the same sample and the same shuffle.
sampleSeed = 42

# (useful, funny, cool) flag patterns passed to getCategoryResults: every
# combination of "more than one vote" / "zero votes" across the three columns.
categoryFlags = [
    (True, False, False),
    (False, True, False),
    (False, False, True),
    (True, True, False),
    (True, False, True),
    (False, True, True),
    (True, True, True),
    (False, False, False),
]

# Draw dividedNum rows per category and concatenate once.
# .sample raises ValueError if a category has fewer than dividedNum rows.
data = pd.concat([
    getCategoryResults(df, useful, funny, cool).sample(dividedNum, random_state=sampleSeed)
    for useful, funny, cool in categoryFlags
])

# Shuffle so the categories are interleaved.
data = data.sample(frac=1, random_state=sampleSeed)
del df
gc.collect()

In [None]:
chunk = 100000
# `max` is the progress bar's upper bound (progress() reads it as a global);
# presumably the number of rows in the user file -- TODO confirm.
max = 1987896
chunks = pd.read_json('train/yelp_academic_dataset_user.json', lines=True, chunksize=chunk)

out = display(progress(0), display_id=True)
i = chunk

# Collect the chunks and concatenate once at the end. Concatenating inside the
# loop copies everything read so far on every iteration (quadratic work).
user_parts = []
for c in chunks:
    user_parts.append(c)
    out.update(progress(i))
    i += chunk

# pd.concat raises on an empty list, so fall back to an empty frame.
users = pd.concat(user_parts) if user_parts else pd.DataFrame()
del user_parts

In [None]:
# The review and user tables both have useful/funny/cool columns. Rename the
# user-side totals before merging so they cannot clash with the review votes.
voteColumns = ["useful", "funny", "cool"]
totalColumns = ["total_" + voteColumn for voteColumn in voteColumns]
userTotals = users[["user_id"] + voteColumns].rename(columns=dict(zip(voteColumns, totalColumns)))
data = pd.merge(data, userTotals, on=["user_id"])

# Express each user total as a share of that user's combined votes. A user
# with no votes gives 0/0 = NaN, which is replaced by an even 1/3 split.
userScoreSums = data[totalColumns].sum(axis=1)
for voteColumn, totalColumn in zip(voteColumns, totalColumns):
    data["user_" + voteColumn] = (data[totalColumn] / userScoreSums).fillna(1/3)

# The raw totals and the join key are not needed past this point.
data = data.drop(columns=totalColumns + ['user_id'])

In [None]:
# Drop the reference to the user table and run a garbage collection pass so
# its memory can be reclaimed before the next steps.
del users
gc.collect()

In [None]:
# 80/20 split of the sampled reviews. Only the frame itself is split (the
# targets are columns of `data`), so no separate y array is passed. The fixed
# random_state makes the split reproducible across runs.
X_train, X_test = train_test_split(data, train_size=0.8, random_state=42)

del data
gc.collect()

In [None]:
# -c resumes or skips an already-downloaded archive; -o overwrites extracted
# files without the interactive prompt that would hang the cell on a re-run.
!wget -c https://nlp.stanford.edu/data/glove.6B.zip
!unzip -q -o glove.6B.zip

In [None]:
# Map each word to its vector, parsed from lines of the form
# "<word> <coef> <coef> ...".
embeddings_index = {}
# Explicit encoding: without it, open() uses the platform default, which can
# fail on the non-ASCII tokens in the vocabulary.
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
def getWordEmbedding(text, num_tokens=256, embedding_dim=100, embeddings=None):
  """Turn a token sequence into a fixed-size matrix of word vectors.

  Args:
    text: iterable of word tokens. Only the first `num_tokens` are used;
      anything beyond that is ignored (previously it raised IndexError).
    num_tokens: number of rows in the result.
    embedding_dim: length of each word vector.
    embeddings: mapping from word to vector. Defaults to the module-level
      `embeddings_index`.

  Returns:
    A (num_tokens, embedding_dim) array. Row i holds the vector of token i.
    Rows for tokens missing from `embeddings`, and rows past the end of
    `text`, stay all-zero.
  """
  if embeddings is None:
    embeddings = embeddings_index

  # Prepare embedding matrix
  embedding_matrix = np.zeros((num_tokens, embedding_dim))
  # zip with range caps the loop at num_tokens for any iterable.
  for i, word in zip(range(num_tokens), text):
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
  return embedding_matrix

In [None]:
def processX(x):
  """Build the model inputs and targets from a frame of sampled reviews.

  Each review's text is lower-cased, stripped of non-alphanumeric characters,
  split into at most 256 tokens, embedded with getWordEmbedding and flattened.
  The stars, user_useful, user_funny and user_cool columns are appended to
  that flattened embedding.

  Fixes over the previous version:
    * user_cool is now part of the features; the old `row[1:4]` slice took
      only three of the four metadata columns.
    * The caller's frame is no longer modified (its "text" column used to be
      overwritten with arrays).
    * The features are stacked directly as float32 instead of going through an
      object array with np.apply_along_axis.

  Args:
    x: DataFrame with the columns text, stars, user_useful, user_funny,
      user_cool, useful, funny and cool.

  Returns:
    (X, y): X is a float32 array with one row per review, holding the
    flattened embedding followed by the four metadata columns. y holds the
    useful, funny and cool columns, one row per review.
  """
  token_lists = x["text"].apply(
      lambda text: re.sub('[^A-Za-z0-9]+', ' ', text.lower()).split()[0:256])

  # Cast each flattened matrix to float32 right away to limit peak memory.
  text_features = np.stack(
      [getWordEmbedding(tokens).ravel().astype(np.float32) for tokens in token_lists])

  meta_features = x[["stars", "user_useful", "user_funny", "user_cool"]].to_numpy(dtype=np.float32)

  X = np.hstack([text_features, meta_features])
  y = x[["useful", "funny", "cool"]].values
  return X, y

In [None]:
# Convert each split into feature/target arrays, dropping the source frame as
# soon as it has been processed to keep peak memory down.
x_train, y_train = processX(X_train)
del X_train
gc.collect()
x_test, y_test = processX(X_test)
del X_test
gc.collect()
# The embedding lookup table is no longer referenced after both splits have
# been processed, so release it as well.
del embeddings_index
gc.collect()

In [None]:
# https://machinelearningmastery.com/deep-learning-models-for-multi-output-regression/
# mlp for multi-output regression

# get the model
def get_model(n_inputs, n_outputs):
    """Build and compile the fully connected regression network.

    The network is six 20-unit ReLU layers followed by a linear output layer.
    The first hidden layer uses he_uniform initialisation. The model is
    compiled with mean absolute error loss and the Adam optimizer.

    Args:
        n_inputs: number of input features.
        n_outputs: number of units in the output layer.

    Returns:
        The compiled Sequential model.
    """
    hidden_units = 20
    extra_hidden_layers = 5

    network = Sequential()
    network.add(Dense(hidden_units, input_dim=n_inputs,
                      kernel_initializer='he_uniform', activation='relu'))
    for _ in range(extra_hidden_layers):
        network.add(Dense(hidden_units, activation='relu'))
    network.add(Dense(n_outputs))
    network.compile(loss='mae', optimizer='adam')
    return network

# train one model on the training split and score it on the held-out test split
def evaluate_model(ratingIndex, epochs=100):
    """Fit a fresh model for one vote type and report its test-set MAE.

    Uses the module-level x_train / y_train / x_test / y_test arrays. This is
    a single train/test evaluation, not k-fold cross-validation.

    Args:
        ratingIndex: column of y_train / y_test to predict. The evaluation
            cells pass 0, 1 and 2 for useful, funny and cool.
        epochs: number of training epochs.

    Returns:
        A one-element list holding the test-set MAE.
    """
    results = list()
    n_inputs = x_train.shape[1]
    # One scalar target per review. This used to be
    # len(y_train[:, ratingIndex]), i.e. the number of training rows, which
    # created one output unit per training example.
    n_outputs = 1
    # define model
    model = get_model(n_inputs, n_outputs)
    # fit model
    model.fit(x_train, y_train[:, ratingIndex], verbose=1, epochs=epochs)
    # evaluate model on test set
    mae = model.evaluate(x_test, y_test[:, ratingIndex], verbose=1)
    # store result
    print('>%.3f' % mae)
    results.append(mae)
    return results

In [None]:
# Train and score the model for the "useful" votes (target column 0).
results = evaluate_model(0)
# Report the mean absolute error.
useful_mae = mean(results)
print('Useful - MAE: %.3f' % useful_mae)

In [None]:
# Train and score the model for the "funny" votes (target column 1).
results = evaluate_model(1)
# Report the mean absolute error and its standard deviation.
funny_mae, funny_spread = mean(results), std(results)
print('Funny - MAE: %.3f (%.3f)' % (funny_mae, funny_spread))

In [None]:
# Train and score the model for the "cool" votes (target column 2).
results = evaluate_model(2)
# Report the mean absolute error and its standard deviation.
cool_mae, cool_spread = mean(results), std(results)
print('Cool - MAE: %.3f (%.3f)' % (cool_mae, cool_spread))