In [None]:
import json
import glob
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from collections import defaultdict
import re

# Notebook-wide plotting defaults. Order matters: the matplotlib style is
# applied first, then the seaborn palette, then the figure size on top.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Point at the extracted Google Business Profile Takeout export and report
# which accounts, locations and key JSON files are present on disk.
takeout_dir = Path("../data/Takeout/Google Business Profile")
print(f"Data directory: {takeout_dir}")
print(f"Exists: {takeout_dir.exists()}")

if not takeout_dir.exists():
    print("Takeout directory not found. Please check if the data was extracted correctly.")
else:
    # Accounts are the sub-directories whose name starts with "account-"
    account_dirs = [
        entry
        for entry in takeout_dir.iterdir()
        if entry.is_dir() and entry.name.startswith('account-')
    ]
    print(f"Found {len(account_dirs)} account(s):")
    for acc_dir in account_dirs:
        print(f"  {acc_dir.name}")

        # Locations are the "location-*" sub-directories of an account
        location_dirs = [
            entry
            for entry in acc_dir.iterdir()
            if entry.is_dir() and entry.name.startswith('location-')
        ]
        print(f"    Found {len(location_dirs)} location(s):")
        for loc_dir in location_dirs:
            print(f"      {loc_dir.name}")

            # Report presence (and size) of the files the later cells read
            key_files = ['data.json', 'reviews.json', 'additionalData.json']
            for key_file in key_files:
                file_path = loc_dir / key_file
                if not file_path.exists():
                    print(f"        ✗ {key_file}")
                    continue
                print(f"        ✓ {key_file} ({file_path.stat().st_size} bytes)")

In [None]:
# Load and examine the main business data
def load_business_data(base_dir=None):
    """Load business location data from Google Takeout.

    Walks ``<base_dir>/account-*/location-*/data.json`` and builds one row
    per location whose ``data.json`` could be parsed as a JSON object.

    Parameters
    ----------
    base_dir : str or pathlib.Path, optional
        Root of the "Google Business Profile" export. When omitted, the
        notebook-level ``takeout_dir`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns: ``account_id``, ``location_id``, ``business_name``,
        ``address``, ``category``, ``phone``, ``website``, ``location_dir``.
        The columns are present even when no location was found, so the
        result is empty but still has a usable schema. Missing fields are
        filled with ``'Unknown'``. Rows are ordered by account directory,
        then location directory.
    """
    columns = [
        'account_id', 'location_id', 'business_name', 'address',
        'category', 'phone', 'website', 'location_dir',
    ]
    root = takeout_dir if base_dir is None else Path(base_dir)

    if not root.is_dir():
        print("No takeout data found")
        return pd.DataFrame(columns=columns)

    business_data = []

    # iterdir() yields entries in arbitrary order; sort for reproducible rows.
    account_dirs = sorted(
        d for d in root.iterdir()
        if d.is_dir() and d.name.startswith('account-')
    )

    for account_dir in account_dirs:
        account_id = account_dir.name.replace('account-', '')

        location_dirs = sorted(
            d for d in account_dir.iterdir()
            if d.is_dir() and d.name.startswith('location-')
        )

        for location_dir in location_dirs:
            location_id = location_dir.name.replace('location-', '')

            location_data_file = location_dir / 'data.json'
            if not location_data_file.exists():
                continue

            # One unreadable file should not abort the whole load; report it
            # and move on, as load_reviews_data does for review files.
            try:
                with open(location_data_file, 'r', encoding='utf-8') as f:
                    location_info = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Error reading {location_data_file}: {e}")
                continue

            if not isinstance(location_info, dict):
                print(f"Error reading {location_data_file}: expected a JSON object")
                continue

            # storefrontAddress / addressLines may be absent, null or empty;
            # all of those fall back to 'Unknown' instead of raising.
            storefront = location_info.get('storefrontAddress')
            if not isinstance(storefront, dict):
                storefront = {}
            address_lines = storefront.get('addressLines') or ['Unknown']

            business_data.append({
                'account_id': account_id,
                'location_id': location_id,
                'business_name': location_info.get('title', 'Unknown'),
                'address': address_lines[0],
                'category': location_info.get('primaryCategory', 'Unknown'),
                'phone': location_info.get('primaryPhone', 'Unknown'),
                'website': location_info.get('websiteUrl', 'Unknown'),
                'location_dir': str(location_dir),
            })

    return pd.DataFrame(business_data, columns=columns)

# Build the per-location table that the later cells read from
business_df = load_business_data()
print("Business Locations Found:", business_df, sep="\n")

In [None]:
# Load and analyze reviews data
def load_reviews_data(business_df):
    """Load all reviews data from Google Takeout.

    For every row of ``business_df`` the directory in its ``location_dir``
    column is searched for files matching ``reviews*.json``. A file may hold
    an object with a ``"reviews"`` list, a single review object, or a bare
    list of review objects; entries that are not JSON objects are skipped.

    Parameters
    ----------
    business_df : pandas.DataFrame
        Must provide ``business_name``, ``location_id`` and ``location_dir``
        for each row. An empty frame yields an empty result.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``business_name``, ``location_id``,
        ``review_id``, ``reviewer_name``, ``star_rating``, ``comment``,
        ``create_time``, ``update_time``, ``review_reply``, ``review_file``.
        The columns are present even when no review was found.

    Notes
    -----
    Files that cannot be decoded or parsed are reported with ``print`` and
    skipped. Reviews are not de-duplicated across files.
    """
    columns = [
        'business_name', 'location_id', 'review_id', 'reviewer_name',
        'star_rating', 'comment', 'create_time', 'update_time',
        'review_reply', 'review_file',
    ]
    all_reviews = []

    for _, business in business_df.iterrows():
        location_dir = Path(business['location_dir'])

        # glob() order is unspecified; sort so the row order is reproducible.
        review_files = sorted(location_dir.glob('reviews*.json'))

        for review_file in review_files:
            try:
                with open(review_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Error reading {review_file}: {e}")
                continue

            # Normalise the supported payload shapes to a list of entries.
            if isinstance(data, dict) and 'reviews' in data:
                reviews = data['reviews']
            elif isinstance(data, dict):
                reviews = [data]
            else:
                reviews = data
            # Null/scalar payloads (or a null "reviews" value) carry no reviews.
            if not isinstance(reviews, list):
                reviews = []

            for review in reviews:
                if not isinstance(review, dict):
                    continue

                # "reviewer" and "reviewReply" may be absent or null.
                reviewer = review.get('reviewer')
                if not isinstance(reviewer, dict):
                    reviewer = {}
                reply = review.get('reviewReply')
                reply_comment = reply.get('comment', '') if isinstance(reply, dict) else ''

                all_reviews.append({
                    'business_name': business['business_name'],
                    'location_id': business['location_id'],
                    'review_id': review.get('reviewId', ''),
                    'reviewer_name': reviewer.get('displayName', 'Anonymous'),
                    'star_rating': review.get('starRating', 'NOT_SPECIFIED'),
                    'comment': review.get('comment', ''),
                    'create_time': review.get('createTime', ''),
                    'update_time': review.get('updateTime', ''),
                    'review_reply': reply_comment,
                    'review_file': str(review_file),
                })

    return pd.DataFrame(all_reviews, columns=columns)

# Load reviews data.
# reviews_df is bound on every path so that later cells can reference it on a
# fresh kernel even when no business data was found.
if business_df.empty:
    reviews_df = pd.DataFrame()
    print("No business data found to load reviews from")
else:
    reviews_df = load_reviews_data(business_df)
    print(f"Total reviews found: {len(reviews_df)}")
    # Only print the section headers when there is something to show under them
    if not reviews_df.empty:
        print("\nReview data sample:")
        print(reviews_df.head())
        print("\nRating distribution:")
        print(reviews_df['star_rating'].value_counts())