In [1]:
import pandas as pd
import json

In [2]:
# Load the Yelp review dump (JSON Lines: one JSON object per line) into a DataFrame.
# Records are parsed in batches of `chunk_size`, and each batch becomes its own
# DataFrame. Every batch stays in `chunks` until the final concat, so batching only
# bounds the size of the intermediate list of dicts, not the total memory used.
chunk_size = 10000  # Records per intermediate DataFrame
chunks = []

# Pin the encoding: without it open() uses the platform's locale encoding, which
# can fail or garble non-ASCII review text on non-UTF-8 systems.
# NOTE(review): assumes the dataset file is UTF-8 (the norm for JSON) — confirm.
with open('data/yelp_dataset/yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    reviews = []
    for line in f:
        # Skip blank lines (e.g. a stray newline at EOF); json.loads rejects them
        if not line.strip():
            continue
        reviews.append(json.loads(line))

        # Once a batch is full, turn it into a DataFrame and start a new batch
        if len(reviews) == chunk_size:
            chunks.append(pd.DataFrame(reviews))
            reviews = []

    # Flush the final, partially filled batch
    if reviews:
        chunks.append(pd.DataFrame(reviews))

# Stitch the batches together under a fresh 0..n-1 index
reviews_df = pd.concat(chunks, ignore_index=True)

In [3]:
# Print a few randomly chosen reviews as a qualitative sanity check of the load.
# The seed is fixed so the same reviews are shown on every run (same seed value
# as the balanced sampling step further down), and the sample size is capped at
# the number of rows so a small frame does not make sample() raise.
random_reviews = reviews_df.sample(n=min(5, len(reviews_df)), random_state=42)

# `index` is the row's label in reviews_df; `i` is only the 1-based display counter
for i, (index, row) in enumerate(random_reviews.iterrows(), start=1):
    print(f"\nRandom review #{i} (index {index}):")
    print(f"Rating: {row['stars']} stars")
    print(row['text'])
    print("-" * 50)


Random review #1 (index 5317509):
Rating: 4.0 stars
Very good lunch type food.  L I'll Bartendar was not very atentative on my last visit in Tampa
--------------------------------------------------

Random review #2 (index 2984221):
Rating: 5.0 stars
Somehow in a five day trip to New Orleans, I've managed to show up at Sucre for four of them. The first visit was just a pop in to kill some time before dinner, but the subsequent three, as you can imagine, were all about how great it was. 

It's a very cute, very comfortable, French style cafe that serves gelato, macarons, king cake, assorted pastries, and freshly brewed teas. In my many visits I sampled macarons, king cake, gelato, an eclair, and every one of their teas. 

I really enjoyed the macarons and the tea, and even more than that I enjoyed the fact that they frequently (early in the day) put out samples of macarons (whole macarons) and king cake. Also, on one occasion when my boyfriend and I were the only two in the cafe, the m

In [4]:
# Load the Yelp user dump (JSON Lines: one JSON object per line) into a DataFrame.
# Records are parsed in batches of `chunk_size`; all batches are kept in `chunks`
# until they are concatenated at the end.
chunk_size = 5000  # Records per intermediate DataFrame
chunks = []

# Pin the encoding: without it open() uses the platform's locale encoding, which
# can fail or garble non-ASCII text (e.g. user names) on non-UTF-8 systems.
# NOTE(review): assumes the dataset file is UTF-8 (the norm for JSON) — confirm.
with open('data/yelp_dataset/yelp_academic_dataset_user.json', 'r', encoding='utf-8') as f:
    users = []
    for line in f:
        # Skip blank lines (e.g. a stray newline at EOF); json.loads rejects them
        if not line.strip():
            continue
        users.append(json.loads(line))

        # Once a batch is full, turn it into a DataFrame and start a new batch
        if len(users) == chunk_size:
            chunks.append(pd.DataFrame(users))
            users = []

    # Flush the final, partially filled batch
    if users:
        chunks.append(pd.DataFrame(users))

# Stitch the batches together under a fresh 0..n-1 index
user_df = pd.concat(chunks, ignore_index=True)

# Quick structural check: (rows, columns) and the column names
print(user_df.shape)
print(user_df.columns)

(1987897, 22)
Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')


In [5]:
# Load the Yelp business dump (JSON Lines: one JSON object per line) into a DataFrame.
# Records are parsed in batches of `chunk_size`; all batches are kept in `chunks`
# until they are concatenated at the end.
chunk_size = 5000  # Records per intermediate DataFrame
chunks = []

# Pin the encoding: without it open() uses the platform's locale encoding, which
# can fail or garble non-ASCII text (e.g. business names) on non-UTF-8 systems.
# NOTE(review): assumes the dataset file is UTF-8 (the norm for JSON) — confirm.
with open('data/yelp_dataset/yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    businesses = []
    for line in f:
        # Skip blank lines (e.g. a stray newline at EOF); json.loads rejects them
        if not line.strip():
            continue
        businesses.append(json.loads(line))

        # Once a batch is full, turn it into a DataFrame and start a new batch
        if len(businesses) == chunk_size:
            chunks.append(pd.DataFrame(businesses))
            businesses = []

    # Flush the final, partially filled batch
    if businesses:
        chunks.append(pd.DataFrame(businesses))

# Stitch the batches together under a fresh 0..n-1 index
business_df = pd.concat(chunks, ignore_index=True)

# Quick look at the first rows and the column names
print(business_df.head())
print(business_df.columns)

              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [6]:
# Tally how many businesses fall under each city value (most frequent first)
# and print the ten largest entries.
city_counts = business_df['city'].value_counts()
top_ten_cities = city_counts.iloc[:10]
print(top_ten_cities)

city
Philadelphia     14569
Tucson            9250
Tampa             9050
Indianapolis      7540
Nashville         6971
New Orleans       6209
Reno              5935
Edmonton          5054
Saint Louis       4827
Santa Barbara     3829
Name: count, dtype: int64


In [7]:
# Step 1: keep only businesses whose city is exactly 'Philadelphia'
in_philadelphia = business_df['city'] == 'Philadelphia'
philly_businesses = business_df.loc[in_philadelphia]

print(f"Number of Philadelphia businesses: {len(philly_businesses)}")
print(philly_businesses.head())

# Step 2: among those with a non-null category string, keep the ones that
# mention 'Restaurants' (case-insensitive substring match)
philly_businesses_with_categories = philly_businesses.dropna(subset=['categories'])
mentions_restaurants = philly_businesses_with_categories['categories'].str.contains(
    'Restaurants', case=False, na=False
)
philly_restaurants = philly_businesses_with_categories.loc[mentions_restaurants]

print(f"Number of restaurants in Philadelphia: {len(philly_restaurants)}")

# Step 3: collect the restaurant IDs into a set for the membership test below
philly_restaurant_ids = set(philly_restaurants['business_id'].values)

# Step 4: keep only reviews whose business_id belongs to one of those restaurants
for_philly_restaurant = reviews_df['business_id'].isin(philly_restaurant_ids)
philly_restaurant_reviews = reviews_df.loc[for_philly_restaurant]
print(f"Number of reviews for Philadelphia restaurants: {len(philly_restaurant_reviews)}")

Number of Philadelphia businesses: 14569
               business_id                name        address          city  \
3   MTSW4McQd7CbVtyjqoe9mw  St Honore Pastries    935 Race St  Philadelphia   
15  MUTTqe8uqyMdBl186RmNeA            Tuna Bar    205 Race St  Philadelphia   
19  ROeacJQwBeh05Rqg7F6TCg                 BAP  1224 South St  Philadelphia   
28  QdN72BWoyFypdGJhhI5r7g             Bar One   767 S 9th St  Philadelphia   
31  Mjboz24M9NlBeiOJKLEd_Q    DeSandro on Main   4105 Main St  Philadelphia   

   state postal_code   latitude  longitude  stars  review_count  is_open  \
3     PA       19107  39.955505 -75.155564    4.0            80        1   
15    PA       19106  39.953949 -75.143226    4.0           245        1   
19    PA       19147  39.943223 -75.162568    4.5           205        1   
28    PA       19147  39.939825 -75.157447    4.0            65        0   
31    PA       19127  40.022466 -75.218314    3.0            41        0   

                           

In [8]:
# Show the first five rows of the Philadelphia-restaurant reviews as plain text
review_preview = philly_restaurant_reviews.head()
print(review_preview)

                 review_id                 user_id             business_id  \
3   AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
5   JrIxlS1TzJ-iCu79ul40cQ  eUta8W_HdHMXPzLBBZhL1A  04UD14gamNjLY0IDYVhHJg   
13  8JFGBuHMoiNDyfcxuWNtrA  smOvOajNG0lS4Pq7d8g4JQ  RZtGWDLCAtuipwaZ-UfjmQ   
16  oyaMhzBSwfGgemSGuZCdwQ  Dd1jQj7S-BFGqRbApFzCFw  YtSqYv1Q_pOltsVPSx54SA   
19  Xs8Z8lmKkosqW5mw_sVAoA  IQsF3Rc6IgCzjVV9DE8KXg  eFvzHawVJofxSnD7TgbZtg   

    stars  useful  funny  cool  \
3     5.0       1      0     1   
5     1.0       1      2     1   
13    4.0       0      0     0   
16    5.0       0      0     0   
19    5.0       0      0     0   

                                                 text                 date  
3   Wow!  Yummy, different,  delicious.   Our favo...  2015-01-04 00:01:03  
5   I am a long term frequent customer of this est...  2015-09-23 23:10:31  
13  Good food--loved the gnocchi with marinara\nth...  2009-10-14 19:57:14  
16  Tremendous ser

In [9]:
# Persist the filtered Philadelphia-restaurant reviews to a pickle file
pd.to_pickle(philly_restaurant_reviews, 'data/philly_restaurant_reviews.pkl')

In [10]:
# Count reviews per star rating, then order the counts by rating (ascending)
stars_tally = philly_restaurant_reviews['stars'].value_counts()
star_distribution = stars_tally.sort_index()
print("Original star rating distribution:", star_distribution, sep="\n")

Original star rating distribution:
stars
1.0     66624
2.0     57480
3.0     91702
4.0    194366
5.0    277117
Name: count, dtype: int64


In [11]:
# Build a class-balanced subset: draw the same number of reviews for every
# star rating from 1 through 5.
per_star_target = 400
balanced_reviews = []

for star in (1, 2, 3, 4, 5):
    star_reviews = philly_restaurant_reviews.loc[philly_restaurant_reviews['stars'] == star]

    if len(star_reviews) < per_star_target:
        # Too few rows to draw the target from: keep them all and report the shortfall
        sampled_reviews = star_reviews
        print(f"Warning: Only {len(star_reviews)} reviews available for {star} star rating.")
    else:
        # Fixed seed so the same rows are drawn on every run
        sampled_reviews = star_reviews.sample(n=per_star_target, random_state=42)

    balanced_reviews.append(sampled_reviews)

# Stack the per-rating samples under a fresh 0..n-1 index
balanced_reviews_df = pd.concat(balanced_reviews, ignore_index=True)

# Confirm the per-rating counts of the result
balanced_distribution = balanced_reviews_df['stars'].value_counts().sort_index()
print("\nBalanced star rating distribution:")
print(balanced_distribution)

# Report the overall size of the balanced set
print(f"\nTotal reviews in balanced dataset: {len(balanced_reviews_df)}")


Balanced star rating distribution:
stars
1.0    400
2.0    400
3.0    400
4.0    400
5.0    400
Name: count, dtype: int64

Total reviews in balanced dataset: 2000


In [12]:
# Persist the balanced review sample to a pickle file
pd.to_pickle(balanced_reviews_df, 'data/balanced_philly_resto_reviews.pkl')