In [1]:
import pandas as pd
import json

In [2]:
# Load the Yelp review file (one JSON object per line) into a single DataFrame.
chunk_size = 10000  # Number of parsed records converted to a DataFrame per batch
chunks = []

# Open explicitly as UTF-8: without `encoding`, open() uses the platform's locale
# encoding, which can raise UnicodeDecodeError or garble non-ASCII review text.
with open('data/yelp_dataset/yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    reviews = []
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject
        if not line.strip():
            continue
        reviews.append(json.loads(line))

        # When a full batch has accumulated, convert it to a DataFrame and start a new batch
        if len(reviews) >= chunk_size:
            chunks.append(pd.DataFrame(reviews))
            reviews = []

    # Handle any remaining records (less than a full batch)
    if reviews:
        chunks.append(pd.DataFrame(reviews))

# Combine all batches into the final DataFrame.
# NOTE: every batch is kept in `chunks` until this point, so batching only limits the
# size of the intermediate list of dicts, not the total memory used.
reviews_df = pd.concat(chunks, ignore_index=True)

In [3]:
# Derive a three-class sentiment label from the star rating:
#   stars <= 2 -> "negative", stars == 3 -> "neutral", anything else -> "positive"
reviews_df.loc[:, 'sentiment'] = reviews_df['stars'].map(
    lambda stars: "negative" if stars <= 2 else ("neutral" if stars == 3 else "positive")
)

In [4]:
# Draw five reviews at random (unseeded, so each run shows a different set)
random_reviews = reviews_df.sample(n=5)

# Print rating, derived sentiment and full text of each sampled review
separator = "-" * 50
for i in range(len(random_reviews)):
    index = random_reviews.index[i]   # original row label in reviews_df
    row = random_reviews.iloc[i]
    print(f"\nRandom review #{i+1} (index {index}):")
    print(f"Rating: {row['stars']} stars")
    print(f"Sentiment: {row['sentiment']}")
    print(row['text'])
    print(separator)


Random review #1 (index 6490664):
Rating: 3.0 stars
Sentiment: neutral
Historical and a good burger. The service was quick and friendly. Locals eat here worth the trolley ride to go eat here.
--------------------------------------------------

Random review #2 (index 1668098):
Rating: 4.0 stars
Sentiment: positive
Wow, Shawn and Lakeview Movers are really amazing. 
I got his name from the Brown Elephant, a thrift store where I bought my couch. 
When he saw my apartment and the size of my couch, Shawn and his coworker doubted whether the couch would fit. 
Well, to my surprise, he got it in and he was very friendly and efficient as well. They were very kind and though there were only 2 of them there, I felt like I had 4 movers helping with that couch! 
I highly recommend them.
--------------------------------------------------

Random review #3 (index 5969170):
Rating: 5.0 stars
Sentiment: positive
I have been taking my car to Tiger for the past 10 years. They have helped me multiple ti

In [5]:
# Load the Yelp user file (one JSON object per line) into a single DataFrame.
chunk_size = 5000  # Number of parsed records converted to a DataFrame per batch
chunks = []

# Open explicitly as UTF-8: without `encoding`, open() uses the platform's locale
# encoding, which can raise UnicodeDecodeError or garble non-ASCII text.
with open('data/yelp_dataset/yelp_academic_dataset_user.json', 'r', encoding='utf-8') as f:
    users = []
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject
        if not line.strip():
            continue
        users.append(json.loads(line))

        # When a full batch has accumulated, convert it to a DataFrame and start a new batch
        if len(users) >= chunk_size:
            chunks.append(pd.DataFrame(users))
            users = []

    # Add any remaining users (less than a full batch)
    if users:
        chunks.append(pd.DataFrame(users))

# Concatenate all batches (all of them are held in memory until this point)
user_df = pd.concat(chunks, ignore_index=True)

# Preview the DataFrame: (rows, columns) followed by the column names
print(user_df.shape)
print(user_df.columns)

(1987897, 22)
Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')


In [6]:
# Load the Yelp business file (one JSON object per line) into a single DataFrame.
chunk_size = 5000  # Number of parsed records converted to a DataFrame per batch
chunks = []

# Open explicitly as UTF-8: without `encoding`, open() uses the platform's locale
# encoding, which can raise UnicodeDecodeError or garble non-ASCII text.
with open('data/yelp_dataset/yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    businesses = []
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject
        if not line.strip():
            continue
        businesses.append(json.loads(line))

        # When a full batch has accumulated, convert it to a DataFrame and start a new batch
        if len(businesses) >= chunk_size:
            chunks.append(pd.DataFrame(businesses))
            businesses = []

    # Add any remaining businesses (less than a full batch)
    if businesses:
        chunks.append(pd.DataFrame(businesses))

# Concatenate all batches (all of them are held in memory until this point)
business_df = pd.concat(chunks, ignore_index=True)

# Preview the DataFrame: first rows followed by the column names
print(business_df.head())
print(business_df.columns)

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [7]:
# Count how many businesses are listed per city value and show the first ten entries
city_counts = (
    business_df['city']
    .value_counts()
)
print(city_counts.iloc[:10])

city
Philadelphia     14569
Tucson            9250
Tampa             9050
Indianapolis      7540
Nashville         6971
New Orleans       6209
Reno              5935
Edmonton          5054
Saint Louis       4827
Santa Barbara     3829
Name: count, dtype: int64


In [8]:
city = 'Nashville'
# Restrict the business table to rows whose 'city' equals the chosen city exactly
in_city = business_df['city'] == city
city_businesses = business_df[in_city]

# Report how many businesses matched
print(f"Number of {city} businesses: {len(city_businesses)}")

# Among businesses that have a categories value, keep those whose categories
# mention 'Restaurants' (case-insensitive match)
city_businesses_with_categories = city_businesses.dropna(subset=['categories'])
is_restaurant = city_businesses_with_categories['categories'].str.contains(
    'Restaurants', case=False, na=False
)
city_restaurants = city_businesses_with_categories[is_restaurant]

# Report how many restaurants matched
print(f"Number of restaurants in {city}: {len(city_restaurants)}")

# Collect business IDs as sets: one for restaurants only, one for every city business
city_restaurants_ids = set(city_restaurants['business_id'])
city_businesses_ids = set(city_businesses['business_id'])

# Reviews written for the city's restaurants (copied so later edits don't touch reviews_df)
restaurant_review_mask = reviews_df['business_id'].isin(city_restaurants_ids)
city_restaurant_reviews = reviews_df.loc[restaurant_review_mask].copy()
print(f"Number of reviews for city restaurants: {len(city_restaurant_reviews)}")

# Reviews written for any business in the city (also an independent copy)
city_review_mask = reviews_df['business_id'].isin(city_businesses_ids)
city_reviews = reviews_df.loc[city_review_mask].copy()
print(f"Number of reviews for city: {len(city_reviews)}")

Number of Nashville businesses: 6971
Number of restaurants in Nashville: 2502
Number of reviews for city restaurants: 325726
Number of reviews for city: 451571


In [9]:
# Preview the first five reviews of the city-wide subset
city_reviews.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment
29,elqRpX9T3YwL07uLNtN3Bg,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2.0,0,0,0,I at least have to give this restaurant two st...,2015-02-02 04:29:13,negative
34,p198qZsKOMCUhgdtRWsOKQ,3MpDvy5gEdsbZh9-p92dHg,8QnuWGVNBhzyYXGSeRdi4g,4.0,0,0,0,After my ROTD yesterday of a different Sweet ...,2013-10-24 19:24:33,positive
39,E9AB7V4z8xrt2uPF7T55FQ,iYY5Ii1LGpZCpXFkHlMefw,Zx7n8mdt8OzLRXVzolXNhQ,5.0,0,0,0,Amazing biscuits and (fill in the blank). Grea...,2018-04-27 23:03:21,positive
43,qGQvUHmC02PAZW3H1WTIfw,RRTQpg8hutdimzAYuP_Hbw,eaJCpC6IhYphj7bwCDHTwQ,5.0,0,0,0,Stopped by after a Sunday morning walk in the ...,2015-08-30 13:41:47,positive
63,67cGcRrS0jTvB8p7rpaPew,8YSxVaD203mE_9FR4nCEVA,oQ5CPRt0R3AzFvcjNOqB1w,3.0,1,0,0,Honestly the food doesn't knock my socks off b...,2016-01-17 13:26:19,neutral


In [10]:
# Report the mean character length of the restaurant review texts
print(f"Average review length: {city_restaurant_reviews['text'].str.len().mean()}")

# Preview the first rows. This has to be the cell's last expression: a bare expression
# is only rendered when it is the final statement of a cell, so the original leading
# `.head()` call produced no output at all.
city_restaurant_reviews.head()

Average review length: 540.8182951314909


In [11]:
# pickle dataframes (uncomment if you want to save as pickle file)
#city_restaurant_reviews.to_pickle('data/city_restaurant_reviews.pkl')
#city_reviews.to_pickle('data/city_all_reviews.pkl')

In [12]:
# Count city reviews per star rating, ordered by ascending star value
star_distribution = (
    city_reviews['stars']
    .value_counts()
    .sort_index()
)
print("Original star rating distribution for city:", star_distribution, sep="\n")

Original star rating distribution for city:
stars
1.0     60410
2.0     34377
3.0     46373
4.0     96165
5.0    214246
Name: count, dtype: int64


In [13]:
# Count restaurant reviews per star rating, ordered by ascending star value
star_distribution = (
    city_restaurant_reviews['stars']
    .value_counts()
    .sort_index()
)
print("Original star rating distribution for city restaurants:", star_distribution, sep="\n")

Original star rating distribution for city restaurants:
stars
1.0     34955
2.0     26276
3.0     37067
4.0     76512
5.0    150916
Name: count, dtype: int64


In [14]:
# Build a star-balanced dataset: 1000 restaurant reviews for each rating from 1 to 5
balanced_reviews = []

for star in range(1, 6):
    star_reviews = city_restaurant_reviews.loc[city_restaurant_reviews['stars'] == star]

    if len(star_reviews) < 1000:
        # Not enough reviews for this rating: keep them all and warn
        sampled_reviews = star_reviews
        print(f"Warning: Only {len(star_reviews)} reviews available for {star} star rating.")
    else:
        # Fixed seed so the same rows are drawn on every run
        sampled_reviews = star_reviews.sample(n=1000, random_state=42)

    balanced_reviews.append(sampled_reviews)

# Stack the per-rating samples with a fresh 0..n-1 index, then shuffle the row order
# (the shuffled frame keeps the index labels assigned by the concatenation)
balanced_reviews_df = pd.concat(balanced_reviews, ignore_index=True).sample(
    frac=1, random_state=42
)

# Confirm the per-rating counts of the combined frame
balanced_distribution = balanced_reviews_df['stars'].value_counts().sort_index()
print("\nBalanced star rating distribution:", balanced_distribution, sep="\n")

# Overall size of the balanced dataset
print(f"\nTotal reviews in balanced dataset: {len(balanced_reviews_df)}")


Balanced star rating distribution:
stars
1.0    1000
2.0    1000
3.0    1000
4.0    1000
5.0    1000
Name: count, dtype: int64

Total reviews in balanced dataset: 5000


In [15]:
#balanced_reviews_df.to_pickle('data/balanced_resto_reviews.pkl')

In [16]:
# Build a sentiment-balanced restaurant dataset: up to `samples_per_sentiment`
# reviews for each of the three sentiment labels.
samples_per_sentiment = 5000
sentiment_restaurant_reviews = []

for s in ['positive', 'neutral', 'negative']:
    s_reviews = city_restaurant_reviews[city_restaurant_reviews['sentiment'] == s]

    if len(s_reviews) >= samples_per_sentiment:
        # Fixed seed so the same rows are drawn on every run
        sampled_reviews = s_reviews.sample(n=samples_per_sentiment, random_state=1234)
    else:
        # Not enough reviews for this label: keep them all and warn
        sampled_reviews = s_reviews
        print(f"Warning: Only {len(s_reviews)} reviews available for {s} sentiment.")

    sentiment_restaurant_reviews.append(sampled_reviews)

# Combine all sampled reviews into a single DataFrame
sentiment_restaurant_reviews_df = pd.concat(sentiment_restaurant_reviews, ignore_index=True)
# Shuffle the row order (index labels from the concatenation are kept)
sentiment_restaurant_reviews_df = sentiment_restaurant_reviews_df.sample(frac=1, random_state=42)
# Verify the per-label counts in the new DataFrame
sentiment_distribution = sentiment_restaurant_reviews_df['sentiment'].value_counts().sort_index()
print("\nSentiment distribution:")
print(sentiment_distribution)

# Total number of reviews in the sentiment dataset, plus the resulting star breakdown
print(f"\nTotal reviews in sentiment dataset: {len(sentiment_restaurant_reviews_df)}")
star_distribution = sentiment_restaurant_reviews_df['stars'].value_counts().sort_index()
print("Star rating distribution:")
print(star_distribution)



Sentiment distribution:
sentiment
negative    5000
neutral     5000
positive    5000
Name: count, dtype: int64

Total reviews in sentiment dataset: 15000
Star rating distribution:
stars
1.0    2873
2.0    2127
3.0    5000
4.0    1697
5.0    3303
Name: count, dtype: int64


In [17]:
# Persist the sentiment-balanced restaurant reviews as a pickle file
sentiment_restaurant_reviews_df.to_pickle(path='data/resto_reviews_3classes_15k.pkl')

In [18]:
# Build a sentiment-balanced dataset over reviews of all businesses in the city:
# up to `samples_per_sentiment` reviews for each of the three sentiment labels.
samples_per_sentiment = 1000
sentiment_reviews = []

for s in ['positive', 'neutral', 'negative']:
    s_reviews = city_reviews[city_reviews['sentiment'] == s]

    if len(s_reviews) >= samples_per_sentiment:
        # Fixed seed so the same rows are drawn on every run
        sampled_reviews = s_reviews.sample(n=samples_per_sentiment, random_state=42)
    else:
        # Not enough reviews for this label: keep them all and warn
        sampled_reviews = s_reviews
        print(f"Warning: Only {len(s_reviews)} reviews available for {s} sentiment.")

    sentiment_reviews.append(sampled_reviews)

# Combine all sampled reviews into a single DataFrame
sentiment_reviews_df = pd.concat(sentiment_reviews, ignore_index=True)
# Shuffle the row order (index labels from the concatenation are kept)
sentiment_reviews_df = sentiment_reviews_df.sample(frac=1, random_state=42)
# Verify the per-label counts in the new DataFrame
sentiment_distribution = sentiment_reviews_df['sentiment'].value_counts().sort_index()
print("\nSentiment distribution:")
print(sentiment_distribution)

# Total number of reviews in the sentiment dataset, plus the resulting star breakdown
print(f"\nTotal reviews in sentiment dataset: {len(sentiment_reviews_df)}")
star_distribution = sentiment_reviews_df['stars'].value_counts().sort_index()
print("Star rating distribution:")
print(star_distribution)

# Mean character length of the sampled review texts
print(f"Average review length: {sentiment_reviews_df['text'].str.len().mean()}")


Sentiment distribution:
sentiment
negative    1000
neutral     1000
positive    1000
Name: count, dtype: int64

Total reviews in sentiment dataset: 3000
Star rating distribution:
stars
1.0     639
2.0     361
3.0    1000
4.0     313
5.0     687
Name: count, dtype: int64
Average review length: 638.736


In [19]:
#sentiment_reviews_df.to_pickle('data/all_reviews_3classes.pkl')