In [1]:
import json
import os

In [18]:
def get_company_locations_metadata(company_name: str, filename: str):
    """Collect the metadata records of every location whose name mentions a company.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and is expected to hold one JSON object. A record is kept
    when its ``"name"`` value contains ``company_name`` as a case-insensitive
    substring.

    Args:
        company_name: Text to look for inside each record's ``"name"``.
        filename: Path to the JSON Lines metadata file.

    Returns:
        A list of the matching records, in the order they appear in the file.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    lower_company_name = company_name.lower()

    company_locations = []
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # locale encoding when names contain non-ASCII characters.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
                continue
            location_metadata = json.loads(line)
            # .get() so a record without a "name" key is skipped exactly like
            # one whose name is null or empty, instead of raising KeyError.
            name = location_metadata.get("name")
            if name and lower_company_name in name.lower():
                company_locations.append(location_metadata)

    return company_locations

In [19]:
def get_reviews(gmap_ids: set, filename: str):
    """Collect every review that belongs to one of the given locations.

    The file is read as JSON Lines: every non-blank line is parsed with
    ``json.loads`` and is expected to hold one JSON object. A review is kept
    when its ``"gmap_id"`` value is a member of ``gmap_ids``.

    Args:
        gmap_ids: The ``"gmap_id"`` values of the locations of interest.
        filename: Path to the JSON Lines review file.

    Returns:
        A list of the matching reviews, in the order they appear in the file.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # locale encoding when review text contains non-ASCII characters.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line at the end of the file).
                continue
            review = json.loads(line)
            # A review without a "gmap_id" cannot belong to any location; skip
            # it rather than aborting the whole scan with a KeyError.
            if "gmap_id" in review and review["gmap_id"] in gmap_ids:
                reviews.append(review)
    return reviews

In [34]:
def get_company_data(company_name: str, state: str):
    """Load one state's location metadata and reviews for a company.

    Reads ``meta-<state>.json`` and ``review-<state>.json`` from
    ``../data/raw`` (spaces in ``state`` become underscores), keeps the
    locations matched by ``get_company_locations_metadata`` and then the
    reviews whose ``"gmap_id"`` belongs to one of those locations.

    Args:
        company_name: Company to search for in the location names.
        state: State name as used in the raw file names, e.g. ``"New York"``.

    Returns:
        A ``(company_locations, reviews)`` tuple of two lists.
    """
    raw_dir = os.path.join("..", "data", "raw")
    state_slug = state.replace(" ", "_")
    meta_path = os.path.join(raw_dir, f"meta-{state_slug}.json")
    review_path = os.path.join(raw_dir, f"review-{state_slug}.json")

    company_locations = get_company_locations_metadata(company_name, meta_path)
    # Set membership keeps the per-review lookup cheap while scanning the review file.
    wanted_ids = {location["gmap_id"] for location in company_locations}

    return company_locations, get_reviews(wanted_ids, review_path)

In [85]:
def save_company_data(company_name: str, state: str):
    """Extract a company's data for one state and write it to two JSON files.

    The locations and reviews returned by ``get_company_data`` are written to
    ``../data/raw`` as ``<company>_<state>_2021_locations.json`` and
    ``<company>_<state>_2021_reviews.json``, where both name parts are
    lower-cased with spaces replaced by underscores.

    Args:
        company_name: Company to extract.
        state: State name as used in the raw file names.
    """
    company_locations, reviews = get_company_data(company_name, state)

    out_dir = os.path.join("..", "data", "raw")
    company_slug = company_name.lower().replace(" ", "_")
    state_slug = state.lower().replace(" ", "_")

    # Locations first, then reviews; each payload goes to its own file.
    for datatype, records in (("locations", company_locations), ("reviews", reviews)):
        out_path = os.path.join(out_dir, f"{company_slug}_{state_slug}_2021_{datatype}.json")
        with open(out_path, "w+") as file:
            json.dump(records, file, indent=2)


In [24]:
# Names used to pick the per-state raw files (meta-<name>.json / review-<name>.json)
# and to name the per-state output files; spaces are replaced with underscores
# when the paths are built.
# NOTE(review): "other" is presumably a catch-all bucket in the raw dataset rather
# than a state — confirm that meta-other.json and review-other.json exist.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "District of Columbia",
    "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming", "other",
]

In [86]:
# Process only the first entry of `states` ("Alabama") — presumably a quick trial
# run before processing every state; TODO confirm it is still needed.
save_company_data("Costco", states[0])

In [80]:
# Write the per-state Costco location and review files for every entry in `states`.
for state in states:
    save_company_data("Costco", state)

In [87]:
def merge_files(company_name):
    """Concatenate a company's per-state JSON files into one file per data type.

    For each of ``"locations"`` and ``"reviews"``, the file
    ``<company>_<state>_2021_<datatype>.json`` is loaded from ``../data/raw``
    for every entry of the module-level ``states`` list, the loaded lists are
    joined in that order, and the result is written to
    ``<company>_2021_<datatype>.json`` in the same folder.

    Args:
        company_name: Company whose per-state files should be merged; it is
            lower-cased and spaces become underscores in the file names.

    Raises:
        FileNotFoundError: If the per-state file of any entry in ``states``
            is missing.
    """
    raw_dir = os.path.join("..", "data", "raw")
    company_slug = company_name.lower().replace(" ", "_")

    for datatype in ("locations", "reviews"):
        merged = []
        for state in states:
            state_slug = state.lower().replace(" ", "_")
            state_path = os.path.join(raw_dir, f"{company_slug}_{state_slug}_2021_{datatype}.json")
            with open(state_path, "r") as file:
                merged.extend(json.load(file))

        merged_path = os.path.join(raw_dir, f"{company_slug}_2021_{datatype}.json")
        with open(merged_path, "w+") as file:
            json.dump(merged, file, indent=2)

In [88]:
# Combine the per-state Costco files into one locations file and one reviews file.
merge_files("Costco")