# CS105 Survey
Members: John Huh, Emily Mai, Calvin Ng

# Data Pre-processing

In [10]:
# import necessary libraries
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

In [11]:
# load the data
def load_data(start=0, end=10000, path="data/yelp_academic_dataset_review.json"):
    """Load a slice of a JSON-lines file into a DataFrame.

    Each line of the file is parsed as one JSON document; lines whose
    zero-based index ``i`` satisfies ``start <= i < end`` are kept.

    Parameters
    ----------
    start : int
        Zero-based index of the first line to keep (inclusive).
    end : int
        Zero-based index at which to stop (exclusive).
    path : str
        Path of the JSON-lines file to read (opened as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per parsed line; empty if the range selects no lines.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a selected line is not valid JSON.
    """
    data = []
    # the context manager closes the file even if a line fails to parse
    with open(path, encoding='utf-8') as datafile:
        for i, line in tqdm(enumerate(datafile)):
            # stop as soon as the requested window is exhausted
            if i >= end:
                break
            # plain comparisons instead of rebuilding list(range(...)) per line
            if i >= start:
                data.append(json.loads(line))
    return pd.DataFrame(data)

# Text Embedding

In [12]:
# encoders already loaded in this session, keyed by model URL, so that
# repeated calls (one per batch) reuse the model instead of reloading it
_ENCODER_CACHE = {}


# function to embed the reviews
def load_embeddings(dataframe, column,
                    model_url="https://tfhub.dev/google/universal-sentence-encoder/4"):
    """Embed every entry of ``dataframe[column]`` with a TF-Hub encoder.

    Parameters
    ----------
    dataframe : mapping-like (e.g. pandas.DataFrame)
        Object indexable by ``column``.
    column : str
        Name of the column holding the texts to embed.
    model_url : str, optional
        TF-Hub handle passed to ``hub.load``. Defaults to the handle this
        function always used.

    Returns
    -------
    tuple
        ``(messages, embeddings)`` where ``messages`` is the column as a
        NumPy array and ``embeddings`` is a list with one embedding
        (itself a list) per message, in the same order.
    """
    embed = _ENCODER_CACHE.get(model_url)
    if embed is None:
        # load once per URL; later calls reuse the loaded model
        embed = hub.load(model_url)
        _ENCODER_CACHE[model_url] = embed
    messages = np.asarray(dataframe[column])
    tensor_list = tf.convert_to_tensor(messages)
    # convert the model output to plain nested lists so it can be stored
    # cell-by-cell in a DataFrame column
    embeddings = np.array(embed(tensor_list)).tolist()
    # return the reviews followed by their respective embeddings
    return messages, embeddings

In [13]:
# function to preview the embeddings
def show_embeddings(n, reviews, embeddings):
    """Print the first ``n`` reviews alongside a preview of their embeddings.

    For each of the first ``n`` entries of ``embeddings`` this prints the
    review at the same position in ``reviews``, the length of the
    embedding, and its first three components.
    """
    leading = embeddings[:n]
    for position, vector in enumerate(leading):
        review_text = reviews[position]
        print("Message: {}".format(review_text))
        dimension = len(vector)
        print("Embedding size: {}".format(dimension))
        # only the first three components are shown
        head_components = map(str, vector[:3])
        preview = ", ".join(head_components)
        print("Embedding: [{}, ...]\n".format(preview))

# Batch Processing

In [14]:
# function to process batches of the data
def batch_process(step=10000, stop=50000, out_path="data/data.pkl"):
    """Load and embed reviews in batches, then pickle the combined frame.

    Lines ``0 .. stop`` of the review file are processed ``step`` lines at
    a time: each batch is loaded with ``load_data``, its ``"text"`` column
    is embedded with ``load_embeddings``, and the embeddings are stored in
    a new ``"embedding"`` column.

    Parameters
    ----------
    step : int
        Number of lines per batch.
    stop : int
        Exclusive upper bound on the line index to process.
    out_path : str, optional
        Where the combined frame is pickled. Defaults to the path this
        function always wrote to.

    Returns
    -------
    pandas.DataFrame
        All batches concatenated with a fresh 0..n-1 index; an empty
        frame if the range yields no batches.
    """
    # generate ranges (e.g. [(0, 10000), (10000, 20000), ...])
    batches = [(n, min(n + step, stop)) for n in range(0, stop, step)]
    frames = []
    for s, e in batches:
        print("Processing {}:{}...".format(s, e))
        # load data and embeddings
        df = load_data(s, e)
        _, embeddings = load_embeddings(df, "text")
        # store embeddings in dataframe
        df["embedding"] = embeddings
        frames.append(df)

    # concatenate once at the end instead of re-copying the growing frame
    # on every iteration; pd.concat rejects an empty list, so guard it
    if frames:
        review_df = pd.concat(frames, ignore_index=True, sort=False)
    else:
        review_df = pd.DataFrame()

    # save to pickle file to avoid reprocessing of embeddings
    review_df.to_pickle(out_path)
    print(review_df)
    return review_df


# run this once to build the embeddings and write data/data.pkl (slow):
# batch_process()

# if you already have the pkl file, just load it instead.
# NOTE: unpickling can execute arbitrary code -- only load a file you created yourself.
review_df = pd.read_pickle("data/data.pkl")

181it [00:00, 1765.55it/s]

Processing 0:10000...


10001it [00:05, 1951.46it/s]








265it [00:00, 2634.06it/s]

Processing 10000:20000...


20001it [00:09, 2011.41it/s]












159it [00:00, 1577.62it/s]

Processing 20000:30000...


30001it [00:15, 1953.83it/s]












140it [00:00, 1393.54it/s]

Processing 30000:40000...


40001it [00:22, 1798.80it/s]












83it [00:00, 816.15it/s]

Processing 40000:50000...


50001it [00:34, 1444.86it/s]














                    review_id                 user_id             business_id  \
0      lWC-xP3rd6obsecCYsGZRg  ak0TdVmGKo4pwqdJSTLwWw  buF9druCkbuXLX526sGELQ   
1      8bFej1QE5LXp4O05qjGqXA  YoVfDbnISlW0f7abNQACIg  RA4V8pr014UyUbDvI-LW2A   
2      NDhkzczKjLshODbqDoNLSg  eC5evKn1TWDyHCyQAwguUw  _sS2LBIGNT5NQb6PD1Vtjw   
3      T5fAqjjFooT4V0OeZyuk1w  SFQ1jcnGguO0LYWnbbftAA  0AzLzHfOJgL7ROwhdww2ew   
4      sjm_uUcQVxab_EeLCqsYLg  0kA0PAJ8QFMeveQWHFqz2A  8zehGz9jnxPqXtOc7KaJxA   
...                       ...                     ...                     ...   
49995  eLkFmRh40KVwtc93cVZDJw  B0yN2gt_A56qzdX1_U-yHw  owPn7z9gvHp_UMWetZ67Yg   
49996  eNIn4FkfY_5zL9zW7aNs9w  8KQhzS8aEjKfxsd41BB3ag  GKPLJQW8FqHAWQbllymsog   
49997  0gcsSmvDOHBBF2TNbTd1KA  ySqKXdWrWSsMk9hHAORlSA  cPyQFwnQNW4HLAGt17hxYA   
49998  5cTLlmyxjPxPfrsaGF-eyA  DykEgF6WshRmoc2nk61zTQ  cxgffDJkf88suSwy7bAPTg   
49999  uP2lNP3autkVrxYD_fHXbQ  i-zCvzBn5NBWmZtHyZAv5g  or-Kyw7kmNin1pWbu2RLJw   

       stars  useful  funny