In [1]:
import tensorflow as tf

In [2]:
tf.__version__

'2.16.1'

In [59]:
import os
import json

# Define the path to the JSON file
business_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_business.json'
checkin_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_checkin.json'
review_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_review.json'
tip_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_tip.json'
user_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_user.json'

paths = [review_path, checkin_path, tip_path]

# Open the business file and read it line by line
data = []
with open(business_path, 'r') as f:
    for line in f:
        # Parse each JSON object and append it to the data list
        entry = json.loads(line)
        data.append(entry)
print(f'Length of unfiltered business data: {len(data)} lines')

# Define a function to filter entries with less than 20 total reviews
def filter_reviews(entry):
    return entry['review_count'] >= 20

# Filter the entries for businesses with more than 20 reviews
filtered_data = list(filter(filter_reviews, data))

# Specify the path for the new JSON file
output_file = 'filtered_yelp_academic_dataset_business.json'

# Save the filtered business dataset to a JSON file
with open(output_file, 'w') as f:
    json.dump(filtered_data, f, indent=4)

# Read the entire file and parse it as a JSON array
with open(output_file, 'r') as f:
    data = json.load(f)

# Now, the data list contains all the parsed JSON objects from the file
print(f'Length of filtered business data: {len(data)} lines')

# Extract business IDs from the filtered data
filtered_business_ids = [entry['business_id'] for entry in data]
print(f'Number of business id in business data: {len(data)} lines')

# Define a function to filter entries based on business IDs
def filter_entries(entry):
    return entry['business_id'] in filtered_business_ids

# Iterate through each path
for path in paths:
    # Define the output file name for the filtered data
    output_file = f'filtered_{os.path.basename(path)[:-5]}.json'

    # Open the file and read it line by line
    filtered_data = []
    with open(path, 'r') as f:
        for line in f:
            # Parse each JSON object and filter entries based on business IDs
            entry = json.loads(line)
            if filter_entries(entry):
                filtered_data.append(entry)

    # Save the filtered dataset to a JSON file
    with open(output_file, 'w') as f:
        json.dump(filtered_data, f, indent=4)
    print(f'Length of filtered path data: {len(filtered_data)} lines')
    print(f'Processed {path}')
print('Finished!')


Length of unfiltered business data: 150346 lines
Length of filtered business data: 61919 lines
Number of business id in business data: 61919 lines
Length of filtered path data: 60448 lines
Processed /Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_checkin.json
Length of filtered path data: 799452 lines
Processed /Users/davidcastrejon/Documents/CSC180/hw/yelp/yelp_dataset/yelp_academic_dataset_tip.json
Finished!


In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Define the path to the JSON file
filtered_business_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/filtered_yelp_academic_dataset_business.json'
filtered_checkin_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/filtered_yelp_academic_dataset_checkin.json'
filtered_review_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/filtered_yelp_academic_dataset_review.json'
filtered_tip_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/filtered_yelp_academic_dataset_tip.json'
filtered_user_path = '/Users/davidcastrejon/Documents/CSC180/hw/yelp/filtered_yelp_academic_dataset_user.json'

# Load data from the reviews JSON file
print("Loading data...")
with open(filtered_review_path, 'r') as f:
    reviews_data = json.load(f)
    
# Extract text reviews and star ratings
print("Extracting text reviews and star ratings...")
texts = [review['text'] for review in reviews_data]
stars = [review['stars'] for review in reviews_data]

print(len(texts))
print(len(stars))

# TF-IDF Vectorization with min_df and max_df
print("Performing TF-IDF Vectorization...")
vectorizer = TfidfVectorizer(min_df=.4, max_df=0.6)  # Adjust min_df and max_df values as needed
X = vectorizer.fit_transform(texts)
X = X.toarray()
print("Finished!")

# Train-test split
print("Performing Train-Test Split...")
X_train, X_test, y_train, y_test = train_test_split(X, stars, test_size=0.2, random_state=42)

Loading data...
Extracting text reviews and star ratings...
6114269
6114269
Performing TF-IDF Vectorization...
Finished!
Performing Train-Test Split...


In [9]:
import numpy as np

# Check the shape of X_train
print("Shape of X_train:", X_train.shape)

# Check the shape of y_train
print("Shape of y_train:", y_train.shape)

# Check the shape of X_test
print("Shape of X_test:", X_test.shape)

# Check the shape of y_test
print("Shape of y_test:", y_test.shape)

Shape of X_train: (4891415, 11)


AttributeError: 'list' object has no attribute 'shape'

In [10]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn import metrics

model = Sequential()

# Build the model
print("Building the model...")
model = Sequential()
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer with a single neuron (for regression)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="dnn/best_weights.keras", verbose=0, save_best_only=True) # save best model

model.fit(X_train, y_train ,callbacks=[monitor,checkpointer],verbose=2,epochs=1000)

model.load_weights('dnn/best_weights.keras') # load weights from best model

Building the model...


ValueError: Unrecognized data type: x=[[0.         0.         0.         ... 0.60526911 0.         0.        ]
 [0.23194477 0.         0.41139921 ... 0.24214848 0.4483258  0.23116176]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.74903734]
 [0.55521324 0.47752286 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]] (of type <class 'numpy.ndarray'>)

In [None]:
# Define early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define model checkpoint callback to save the best weights
checkpoint_path = "best.weights.h5"
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True, monitor='val_loss', mode='min')

# Build the model
print("Building the model...")
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer with a single neuron (for regression)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
print("Training the model...")
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping, model_checkpoint])

# Load the best weights
model.load_weights(checkpoint_path)

# Evaluate the model
print("Evaluating the model...")
mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)

# Prediction
print("Performing prediction on a new review...")
new_review = ["This gym is amazing! The instructors are knowledgeable and motivating."]
X_new = vectorizer.transform(new_review).toarray()
predicted_star_rating = model.predict(X_new)
print("Predicted Star Rating:", predicted_star_rating)

In [None]:
pred = model.predict(X_test)
pred

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate regression metrics
mae = mean_absolute_error(y_true, pred)
mse = mean_squared_error(y_true, pred)
rmse = mean_squared_error(y_true, pred, squared=False)  # Compute RMSE from MSE
r2 = r2_score(y_true, pred)

# Print regression report
print("Regression Report:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

NameError: name 'model' is not defined

In [None]:
y_true= np.argmax(y_test,axis=1) 

score = metrics.accuracy_score(y_true, pred)

print("Accuracy score: {}".format(score))

In [3]:
score = metrics.precision_score(y_true, pred, average= "weighted")
print("Precision score: {}".format(score))

NameError: name 'metrics' is not defined

In [4]:
score = metrics.recall_score(y_true, pred, average= "weighted")
print("Recall score: {}".format(score))

NameError: name 'metrics' is not defined

In [5]:
score = metrics.f1_score(y_true, pred, average= "weighted")
print("F1 score: {}".format(score))

NameError: name 'metrics' is not defined