In [None]:
import os
import re

import lxml.html
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Geo-location strings to query, one per Jakarta area
cities = [
    f'{area} Jakarta city, Indonesia'
    for area in ('North', 'Central', 'South', 'East', 'West')
]

# Request fields that are identical for every city
base_payload = dict(
    source='google_maps',
    query='warkop',
    user_agent_type='desktop',
    domain='com',
    start_page='1',
    pages='8',
)

# Scrape Google Maps listings for every city and write one CSV per city.

# Endpoint that receives every query.
OXYLABS_ENDPOINT = 'https://realtime.oxylabs.io/v1/queries'

# Credentials are read from the environment so they never live in the notebook.
# NOTE(review): any credentials that were previously committed in this cell
# should be treated as leaked and rotated.
OXYLABS_USERNAME = os.environ.get('OXYLABS_USERNAME')
OXYLABS_PASSWORD = os.environ.get('OXYLABS_PASSWORD')
if not OXYLABS_USERNAME or not OXYLABS_PASSWORD:
    raise RuntimeError(
        'Set the OXYLABS_USERNAME and OXYLABS_PASSWORD environment variables '
        'before running this cell.'
    )

# Selectors for the fields of a single listing (loop-invariant, so defined once).
LISTING_SELECTOR = '[class="VkpGBb"]'
NAME_SELECTOR = '[role="heading"]'
RATING_SELECTOR = 'span[aria-hidden="true"]'
RATING_COUNT_SELECTOR = '[class*="RDApEe"]'
HOURS_SELECTOR = '.rllt__details div:nth-of-type(4)'
DETAILS_SELECTOR = '.rllt__details div:nth-of-type(5)'
PRICE_SELECTOR = '.rllt__details div:nth-of-type(2) > span:nth-of-type(2)'
LAT_SELECTOR = '[data-lat]'
LNG_SELECTOR = '[data-lng]'
TYPE_XPATH = '//div[@class="rllt__details"]/div[2]/text()'
ADDRESS_SELECTOR = '.rllt__details div:nth-of-type(3)'

# Column order of the per-city CSV. Passing it to the DataFrame constructor
# guarantees a header row even when no listing was scraped, so the file can
# always be read back with pd.read_csv.
PLACE_COLUMNS = [
    'city', 'name', 'place_type', 'address', 'rating', 'price_level',
    'rating_count', 'latitude', 'longitude', 'hours', 'details',
]


def _first_text(node, selector):
    """Return the stripped text of the first element under `node` matching
    the CSS `selector`, or '' when nothing matches."""
    match = node.select_one(selector)
    return match.text.strip() if match else ''


def _first_attr(node, selector, attr):
    """Return attribute `attr` of the first element under `node` matching
    the CSS `selector`, or '' when nothing matches."""
    match = node.select_one(selector)
    return match.get(attr) if match else ''


def _page_place_types(html_tree):
    """Collect one place-type string per matching text node of the page.

    Each text node is split on '·' and the last non-empty part is kept;
    nodes with no non-empty part contribute nothing.
    """
    place_types = []
    for text_node in html_tree.xpath(TYPE_XPATH):
        parts = [part.strip() for part in text_node.strip().split('·') if part.strip()]
        if parts:
            place_types.append(parts[-1])
    return place_types


def _parse_results_page(html, city):
    """Parse one results page into a list of row dicts keyed by PLACE_COLUMNS.

    Parameters:
        html: HTML text of one results page.
        city: geo-location string stored in the 'city' field of every row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    html_tree = lxml.html.fromstring(str(soup))

    # Place types are extracted page-wide and matched to listings by position,
    # so compute them once per page instead of once per listing.
    place_types = _page_place_types(html_tree)

    rows = []
    for index, listing in enumerate(soup.select(LISTING_SELECTOR)):
        place = listing.parent

        # The count is the text inside parentheses, e.g. "(123)" -> "123".
        rating_count_el = place.select_one(RATING_COUNT_SELECTOR)
        count_match = (
            re.search(r'\((.+)\)', rating_count_el.text) if rating_count_el else None
        )

        # The fourth details div only counts as hours when it mentions "opens".
        hours = _first_text(place, HOURS_SELECTOR)
        if 'opens' not in hours.lower():
            hours = ''

        rows.append({
            'city': city,
            'name': _first_text(place, NAME_SELECTOR),
            # Positional match; blank when the page yielded fewer types than listings.
            'place_type': place_types[index] if index < len(place_types) else '',
            'address': _first_text(place, ADDRESS_SELECTOR),
            'rating': _first_text(place, RATING_SELECTOR),
            'price_level': _first_text(place, PRICE_SELECTOR),
            'rating_count': count_match.group(1) if count_match else '',
            # Look inside the current listing; searching the whole page would
            # give every row the coordinates of the first listing.
            'latitude': _first_attr(place, LAT_SELECTOR, 'data-lat'),
            'longitude': _first_attr(place, LNG_SELECTOR, 'data-lng'),
            'hours': hours,
            'details': _first_text(place, DETAILS_SELECTOR),
        })
    return rows


for city in cities:
    # Per-city payload: the shared fields plus this city's geo_location.
    payload = {**base_payload, 'geo_location': city}

    # Rows collected for the current city.
    city_data = []

    try:
        response = requests.post(
            OXYLABS_ENDPOINT,
            auth=(OXYLABS_USERNAME, OXYLABS_PASSWORD),
            json=payload,
            timeout=180,
        )
    except requests.RequestException as exc:
        # A network failure for one city must not abort the remaining cities.
        print(f"Failed to retrieve data for {city}: {exc}")
    else:
        if response.status_code == 200:
            for result in response.json()['results']:
                city_data.extend(_parse_results_page(result['content'], city))
        else:
            print(f"Failed to retrieve data for {city}, status code: {response.status_code}")

    # Save the rows for the current city; an empty result still gets a header row.
    df = pd.DataFrame(city_data, columns=PLACE_COLUMNS)
    city_name = city.split(',')[0].replace(' ', '_').lower()
    df.to_csv(f"data_{city_name}.csv", index=False)


In [None]:
# Columns removed from each city's dataset during cleaning
columns_to_delete = [
    'place_type',
    'price_level',
    'latitude',
    'longitude',
    'hours',
    'details',
]

In [None]:
# Names of the extra columns appended to each cleaned dataset
columns_to_add = ['ac', 'parking', 'toilet', 'wifi', '24hours']

In [None]:
import numpy as np

# Clean each city's scraped CSV and append synthetic amenity columns.

# Seeded generator so the randomly generated flags are reproducible on re-run.
rng = np.random.default_rng(42)

for city in cities:
    # Derive the file slug from the current city *before* reading, so each
    # iteration loads its own file rather than one named by leftover state.
    city_name = city.split(',')[0].replace(' ', '_').lower()

    df = (
        pd.read_csv(f"data_{city_name}.csv")
        .drop(columns=columns_to_delete)  # drop the columns not needed downstream
        .dropna()                         # remove rows with missing values
        .drop_duplicates()                # remove duplicate rows
    )

    # One randomly generated 0/1 column per name in columns_to_add
    # (1 with probability 0.9, 0 with probability 0.1). These values are
    # synthetic placeholders, not scraped data.
    flags = {
        column_name: rng.choice([1, 0], size=len(df), p=[0.9, 0.1])
        for column_name in columns_to_add
    }
    df = df.assign(**flags)

    # Save the modified DataFrame to a new CSV file
    df.to_csv(f"new_data_{city_name}.csv", index=False)



In [None]:
from google.colab import files
# Hand every city's cleaned CSV to files.download, one call per city.
for city in cities:
    # Text before the first comma, lower-cased, with spaces turned into underscores
    city_name = city.partition(',')[0].lower().replace(' ', '_')
    files.download(f"new_data_{city_name}.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>