In [37]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


# Import Data

In [38]:
import pandas as pd
import numpy as np

# Convert Data from JSON to Pandas
reviews = pd.read_json('yelp_academic_dataset_review.json', lines=True, nrows=15000)
reviews = reviews.rename(columns={'stars': 'review_rating'})


businesses = pd.read_json('yelp_academic_dataset_business.json', lines=True, nrows=15000)
businesses = businesses.rename(columns={'stars': 'business_rating'})


# Keep all business with more than 20 lines & merge the reviews

In [39]:

# Remove all business with reivew count < 20
print("removed businesses: ", businesses[businesses.review_count < 20].shape[0])
businesses =  businesses[businesses.review_count > 20]


all_reviews = pd.merge(businesses,reviews, on='business_id') 

removed businesses:  8741


# Get all reviews for one business

In [40]:
# print("Individual Reviews (after join): ", all_reviews.shape)

all_reviews = all_reviews[['business_id', 'text', 'business_rating']]
# print(all_reviews.head())

reviews_agg = all_reviews.groupby('business_id')['text'].sum()
rating_avg = all_reviews.groupby('business_id')['business_rating'].mean()

df_reviews_agg = pd.DataFrame({'business_id': reviews_agg.index, 'all_reviews': reviews_agg.values})
df_rating_avg = pd.DataFrame({'business_id': rating_avg.index, 'business_rating': rating_avg.values})

df_agg = pd.merge(df_reviews_agg, df_rating_avg, on='business_id')
df_agg.head()
# print(df_reviews_agg)


Unnamed: 0,business_id,all_reviews,business_rating
0,--ZVrH2X2QXBFdCilbirsw,This place is sadly perm closed. I was hoping ...,4.5
1,-02xFuruu85XmDn2xiynJw,Dr. Curtis Dechant has an excellent chair-side...,4.5
2,-1MhPXk1FglglUAmuPLIGg,Great food and drink. The staff are always fri...,4.0
3,-1ueCbvIpUPi8KT95ETTKw,I was very happy here.there is a shower with a...,4.0
4,-2Axhv9AZ_n7qjQefECpVw,"The service was excellent, extremely friendly....",3.5


In [41]:
# df_reviews_agg[df_reviews_agg.isnull().any(axis=1)].shape
df_agg['all_reviews'].shape


(3623,)

# TF_IDF Vectorizer

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english',max_features=5000,min_df=1)
# Fit and transform the aggregated reviews into a TF-IDF matrix
tfidf_wm = vectorizer.fit_transform(df_agg['all_reviews'])

# Retrieve the feature names (i.e., the vocabulary)
tfidf_tokens = vectorizer.get_feature_names_out()

# Create a DataFrame from the TF-IDF matrix for easier viewing
df_tfidfvect = pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens)

df_tfidfvect


Unnamed: 0,00,000,00pm,10,100,1000,10am,10pm,11,11am,...,young,younger,yuck,yum,yummy,zero,zone,zoo,zucchini,étouffée
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.092386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3618,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3619,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.093902,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3620,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3621,0.0,0.0,0.0,0.030686,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create one table with all information

In [43]:
# business_id, rating concatonated onto the df_idfvect
df_business = pd.concat([df_agg[['business_id', 'business_rating']], df_tfidfvect], axis=1)



Unnamed: 0,business_id,business_rating,00,000,00pm,10,100,1000,10am,10pm,...,young,younger,yuck,yum,yummy,zero,zone,zoo,zucchini,étouffée
0,--ZVrH2X2QXBFdCilbirsw,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-02xFuruu85XmDn2xiynJw,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1MhPXk1FglglUAmuPLIGg,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1ueCbvIpUPi8KT95ETTKw,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.092386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2Axhv9AZ_n7qjQefECpVw,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Test/Train Split