In [1]:
import pandas as pd

# Read the Yelp business table (newline-delimited JSON: one record per line).
businesses = pd.read_json('yelp_academic_dataset_business.json', lines=True)

# Province/territory codes that count as "located in Canada", and the category
# keywords that count as "Asian cuisine" for this analysis.
canadian_provinces = [
    'ON', 'QC', 'BC', 'AB', 'MB', 'SK', 'NS',
    'NB', 'NL', 'PE', 'NT', 'NU', 'YT',
]
asian_cuisine_keywords = [
    'Chinese', 'Japanese', 'Korean', 'Thai', 'Vietnamese', 'Indian',
    'Filipino', 'Malaysian', 'Asian Fusion', 'Sushi Bars', 'Dim Sum', 'Ramen',
]

# Row mask 1: the business's `state` is one of the Canadian codes.
in_canada = businesses['state'].isin(canadian_provinces)

# Row mask 2: the `categories` text contains at least one keyword. The keywords
# are OR-ed into a single regex and matched case-insensitively; rows whose
# `categories` value is missing count as non-matching (na=False).
# NOTE(review): this is a substring match, so any category that merely
# contains a keyword is accepted as well -- confirm that is intended.
cuisine_pattern = '|'.join(asian_cuisine_keywords)
has_asian_category = businesses['categories'].str.contains(cuisine_pattern, case=False, na=False)

# Canadian businesses with an Asian-cuisine category.
canadian_asian_businesses = businesses.loc[in_canada & has_asian_category]

# Narrow to an overall star rating in [3, 3.5]; both bounds are inclusive.
has_mid_range_rating = canadian_asian_businesses['stars'].between(3, 3.5)
mid_range_businesses = canadian_asian_businesses.loc[has_mid_range_rating]

# IDs of the selected businesses, as a plain Python list for the review filter.
business_ids = mid_range_businesses['business_id'].tolist()

In [2]:
# Stream the review file in chunks of 100,000 rows instead of loading it all
# at once, keeping only the reviews whose business_id is in `business_ids`.
# With `chunksize` set, read_json returns a reader object rather than a
# DataFrame; opening it in a `with` block guarantees the underlying file
# handle is closed once iteration finishes (or if a chunk fails to parse).

# Per-chunk filtered frames, combined after the loop.
filtered_reviews_list = []

with pd.read_json('yelp_academic_dataset_review.json', lines=True, chunksize=100000) as reviews:
    for chunk in reviews:
        # Keep only the rows of this chunk that belong to a selected business.
        filtered_chunk = chunk[chunk['business_id'].isin(business_ids)]
        filtered_reviews_list.append(filtered_chunk)

# Combine all filtered chunks into one DataFrame with a fresh 0..n-1 index.
filtered_reviews = pd.concat(filtered_reviews_list, ignore_index=True)

# Show a sample of the filtered reviews to confirm the filtering worked
print("Sample of reviews for mid-range businesses:")
print(filtered_reviews[['business_id', 'stars', 'text']].head())

Sample of reviews for mid-range businesses:
              business_id  stars  \
0  fdjzXyPw1Zt2avRokULJOw      5   
1  a24bmNPatJARqO1H4QnWgQ      1   
2  O9W0Qs32RL5kU2_9oqZCKQ      4   
3  N-ej51lLtIl4TMC-Qb3k4A      1   
4  b-4xNydyGmdv2bGFOHTgdw      3   

                                                text  
0  I love Doan's. It's the kind of place you have...  
1  So disappointed with this place. Read great re...  
2  Had #42. Very good. I like being able to add a...  
3  Ok I checked lots of reviews on this place , a...  
4  Pretty tasty ! The moo moo milk or whatever it...  


In [3]:
# Persist the filtered reviews as CSV so later work can reload them without
# re-scanning the review file; the DataFrame index is not written out.
filtered_reviews_csv = 'filtered_mid_range_business_reviews.csv'
filtered_reviews.to_csv(filtered_reviews_csv, index=False)