In [23]:
# Clear all variables from the current environment
locals().clear()
globals().clear()

In [1]:
import pickle
with open('../input/business.pkl', 'rb') as f:
    businesses = pickle.load(f)
with open('../input/review.pkl', 'rb') as f:
    all_reviews = pickle.load(f)

# Cut the data size for reviews to 1000
reviews = all_reviews.head(1000)

Now we have filtered the businesses in Philadelphia, merges the reviews and businesses dataframes, performs sentiment analysis, and then performs KMeans clustering on the latitude, longitude, and sentiment values. The resulting clusters are visualized on a map and saved to an HTML file named "clusters_map.html".

In [2]:
with open('../input/user.pkl', 'rb') as f:
    users = pickle.load(f)

In [4]:
print(users.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 333.7+ MB
None


To perform a geospatial analysis of user behavior, we can consider the number of reviews and average sentiment scores from users living in different regions.

In [6]:
import folium
from folium.plugins import HeatMap
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from folium.plugins import MarkerCluster

# Merge dataframes
reviews_users = pd.merge(reviews, users, on='user_id', how='inner')
merged_data = pd.merge(reviews_users, businesses, on='business_id', how='inner')

# Filter businesses in Philadelphia
philadelphia_data = merged_data[merged_data['city'] == 'Philadelphia']

# Function to visualize user behavior on a map
def plot_user_behavior_on_map(df):
    map = folium.Map(location=[39.952583, -75.165222], zoom_start=12)
    marker_cluster = MarkerCluster()

    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc='Plotting markers'):
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            icon=None,
            popup=f"<b>Name:</b> {row['name_y']}<br><b>Stars:</b> {row['stars_x']}<br><b>Reviews:</b> {row['review_count_y']}<br><b>Address:</b> {row['address']}"
        ).add_to(marker_cluster)

    map.add_child(marker_cluster)
    return map

# Visualize the user behavior on a map
user_behavior_map = plot_user_behavior_on_map(philadelphia_data)
user_behavior_map.save("../result/user_behavior_map_demo.html")

Plotting markers:   0%|          | 0/186 [00:00<?, ?it/s]

We merged the reviews, businesses, and users dataframes, performs sentiment analysis on the reviews, calculated the number of reviews and average sentiment scores for each user, normalized the number of reviews and average sentiment scores, and then visualized the user behavior on a map using a heatmap. The resulting map is saved to an HTML file named "user_behavior_map.html".

In [9]:
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string
from tqdm.notebook import tqdm

nltk.download('punkt')
nltk.download('stopwords')

# Define a function to get the top N keywords
def get_top_keywords(texts, n=10):
    words = []
    
    # Define stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    for text in texts:
        # Tokenize and filter words
        tokens = nltk.word_tokenize(text.lower())
        filtered_words = [word for word in tokens if word not in stop_words and word not in punctuation]
        words.extend(filtered_words)
    
    # Count word occurrences
    word_counts = Counter(words)
    
    # Get the top N keywords
    top_keywords = [item[0] for item in word_counts.most_common(n)]
    
    return top_keywords

# Calculate the top N keywords for each user
philadelphia_data['top_keywords'] = philadelphia_data.groupby('user_id')['text'].transform(lambda texts: ', '.join(get_top_keywords(texts)))

# Add top N keywords to the map
def plot_user_behavior_with_keywords_on_map(df):
    map = folium.Map(location=[39.952583, -75.165222], zoom_start=12)
    marker_cluster = MarkerCluster()

    for idx, row in tqdm(df.iterrows(), total=df.shape[0], desc='Plotting markers'):
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            icon=None,
            popup=f"<b>Name:</b> {row['name_y']}<br><b>Stars:</b> {row['stars_x']}<br><b>Reviews:</b> {row['review_count_y']}<br><b>Address:</b> {row['address']}<br><b>Keywords:</b> {row['top_keywords']}"
        ).add_to(marker_cluster)

    map.add_child(marker_cluster)
    return map

# Visualize the user behavior with keywords on a map
user_behavior_map_with_keywords = plot_user_behavior_with_keywords_on_map(philadelphia_data)
user_behavior_map_with_keywords.save("../result/user_behavior_map_with_keywords_demo.html")

[nltk_data] Downloading package punkt to /Users/cynthiali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cynthiali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  philadelphia_data['top_keywords'] = philadelphia_data.groupby('user_id')['text'].transform(lambda texts: ', '.join(get_top_keywords(texts)))


Plotting markers:   0%|          | 0/186 [00:00<?, ?it/s]