In [None]:
import requests
import json
import pandas as pd
import time
import os
from dotenv import load_dotenv

In [None]:
def transform_reviews(df):
    """Flatten the nested columns of a page of catalog items into plain columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``address``, ``name``, ``id``, ``type``,
        ``rubrics``, ``org`` and ``reviews``. ``address``, ``org`` and
        ``reviews`` are read as dicts and ``rubrics`` as a list of dicts;
        cells holding anything else (NaN/None for a missing value) are
        treated as "no data". The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``. The columns listed above
        are removed and replaced by ``building_name``, ``Restaurant_name``,
        ``Brief_description``, ``postcode``, ``Count_branches``,
        ``Type_of_institution`` and the six review statistics named in
        ``target_columns``. All other input columns are passed through.
    """
    target_columns = ['general_rating', 'general_review_count','general_review_count_with_stars', 'org_rating', 'org_review_count', 'org_review_count_with_stars']

    def pick(value, key):
        # A missing nested object shows up as NaN/None, which has no .get().
        return value.get(key) if isinstance(value, dict) else None

    def rubric_names(rubrics):
        # Missing rubrics yield an empty list so the column stays list-typed.
        if not isinstance(rubrics, (list, tuple)):
            return []
        return [rubric.get('name') for rubric in rubrics if isinstance(rubric, dict)]

    # Work on a copy: re-running the calling cell must not see a mutated input.
    result = df.copy()

    result['building_name'] = result['address'].apply(lambda address: pick(address, 'building_name'))

    # "Name, description" -> two columns. When no name on the page contains a
    # comma, expand=True produces a single column, so column 1 is forced to exist.
    name_parts = (
        result['name'].str.split(r',\s*', n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )
    name_parts = name_parts.where(name_parts.notna(), pd.NA)
    result['Restaurant_name'] = name_parts[0]
    result['Brief_description'] = name_parts[1]

    result['postcode'] = result['address'].apply(lambda address: pick(address, 'postcode'))

    result['Count_branches'] = result['org'].apply(lambda org: pick(org, 'branch_count'))

    result['Type_of_institution'] = result['rubrics'].apply(rubric_names)

    result = result.drop(columns=['address', 'name', 'id', 'type', 'rubrics', 'org'])

    # Pull the review statistics out of the 'reviews' column. The new frame
    # reuses the input index so the column-wise concat below lines rows up
    # even when the index is not 0..n-1.
    reviews_df = pd.DataFrame(
        [{col: pick(review, col) for col in target_columns} for review in result['reviews']],
        index=result.index,
        columns=target_columns,
    )

    return pd.concat([result.drop(columns=['reviews']), reviews_df], axis=1)

In [None]:
def parse_schedule(schedule):
    """Summarise a weekly schedule as two comma-separated strings.

    Days that share the same opening hours are grouped together, and runs of
    consecutive days inside a group are collapsed into ranges, e.g.
    ``{'working_days': 'пн—пт, сб', 'working_hours': '09:00–18:00, 10:00–16:00'}``.

    Parameters
    ----------
    schedule : dict or any
        Mapping keyed by English day abbreviations (``'Mon'`` .. ``'Sun'``).
        Each day is read as a dict whose ``'working_hours'`` entry is a list
        of ``{'from': ..., 'to': ...}`` dicts. Only the first interval of a
        day is used. Anything that is not a dict (None, NaN) means "no
        schedule".

    Returns
    -------
    dict
        ``{'working_days': str | None, 'working_hours': str | None}``. Both
        values are None when ``schedule`` is not a dict or contains no usable
        day entries. Missing bounds are rendered as ``'?'``.
    """
    # English day key -> Russian abbreviation, in week order.
    days_mapping = {
        'Mon': 'пн', 'Tue': 'вт', 'Wed': 'ср',
        'Thu': 'чт', 'Fri': 'пт', 'Sat': 'сб', 'Sun': 'вс'
    }
    no_schedule = {'working_days': None, 'working_hours': None}

    if not isinstance(schedule, dict):
        return no_schedule

    # Group week positions (0 = Monday) by their "from–to" label. Iterating in
    # week order keeps every group sorted, so no extra sort is needed.
    positions_by_hours = {}
    for position, eng_day in enumerate(days_mapping):
        day_info = schedule.get(eng_day)
        if not isinstance(day_info, dict):
            continue
        intervals = day_info.get('working_hours')
        # An absent or empty interval list falls back to unknown bounds
        # instead of raising IndexError.
        first = intervals[0] if isinstance(intervals, list) and intervals else {}
        if not isinstance(first, dict):
            first = {}
        hours_label = f"{first.get('from', '?')}–{first.get('to', '?')}"
        positions_by_hours.setdefault(hours_label, []).append(position)

    if not positions_by_hours:
        return no_schedule

    ru_days = list(days_mapping.values())

    def format_range(start, end):
        # A one-day run is printed as the day itself, longer runs as "start—end".
        return ru_days[start] if start == end else f"{ru_days[start]}—{ru_days[end]}"

    working_days = []
    working_hours = []
    for hours_label, positions in positions_by_hours.items():
        # Collapse consecutive days into ranges.
        ranges = []
        start = end = positions[0]
        for position in positions[1:]:
            if position == end + 1:
                end = position
            else:
                ranges.append(format_range(start, end))
                start = end = position
        ranges.append(format_range(start, end))

        working_days.append(', '.join(ranges))
        working_hours.append(hours_label)

    return {
        'working_days': ', '.join(working_days),
        'working_hours': ', '.join(working_hours)
    }

In [None]:
load_dotenv()

# --- Configuration ---
API_2GIS = os.getenv('API_2GIS')
region_id = 32
rubric_id = '161,162'
BASE_URL = 'https://catalog.api.2gis.com/3.0/items'
TOTAL_NEEDED = 15000
PAGE_SIZE = 50
# Seconds. Without a timeout a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 30
# Each field is requested in its own call. The response for BASE_FIELD is kept
# with all of its columns and forms the page table; from every other response
# only the field column is taken and merged onto that table by item id.
BASE_FIELD = 'address'
EXTRA_FIELDS = ['point', 'rubrics', 'schedule', 'reviews', 'org']

if not API_2GIS:
    raise RuntimeError("API_2GIS is not set: define it in the environment or in the .env file")


def _fetch_payload(field, page):
    """Request one result page with a single extra item field; return the decoded JSON body."""
    response = requests.get(
        BASE_URL,
        params={
            'key': API_2GIS,
            'region_id': region_id,
            'rubric_id': rubric_id,
            'page': page,
            'page_size': PAGE_SIZE,
            'fields': f'items.id,items.{field}',
        },
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()


def _items_of(payload):
    """Return the item list of a decoded response body, or an empty list when there is none."""
    if not isinstance(payload, dict):
        return []
    return (payload.get('result') or {}).get('items') or []


# Pages are collected in a list and concatenated once at the end; growing a
# DataFrame with concat inside the loop copies all previous rows every time.
pages = []
collected = 0
page = 1

while collected < TOTAL_NEEDED:
    print(f"Обрабатываю страницу {page}...")

    try:
        payload = _fetch_payload(BASE_FIELD, page)
        base_items = _items_of(payload)
        if not base_items:
            # The catalog has fewer items than TOTAL_NEEDED (or the API
            # reported an error in the body): stop instead of failing on a
            # frame that has no 'id' column.
            meta = payload.get('meta') if isinstance(payload, dict) else None
            print(f"Страница {page} не содержит объектов (meta: {meta}). Останавливаю сбор данных.")
            break

        data_with_features = pd.DataFrame(base_items)
        for field in EXTRA_FIELDS:
            # reindex guarantees both columns exist even when no item on the
            # page carries this field (the column is then all-NaN).
            field_df = pd.DataFrame(_items_of(_fetch_payload(field, page))).reindex(columns=['id', field])
            data_with_features = data_with_features.merge(field_df, on='id')
    except (requests.RequestException, ValueError) as exc:
        # Keep whatever has been collected so far instead of losing it to a traceback.
        print(f"Запрос для страницы {page} не выполнен: {exc}. Останавливаю сбор данных.")
        break

    if not data_with_features.empty:
        df_page = transform_reviews(data_with_features)

        # Expand the schedule into the working_days / working_hours columns.
        schedule_df = pd.DataFrame(
            df_page['schedule'].apply(parse_schedule).tolist(),
            index=df_page.index,
        )
        df_page = pd.concat([df_page.drop(columns='schedule'), schedule_df], axis=1)

        pages.append(df_page)
        collected += len(df_page)

    if collected >= TOTAL_NEEDED:
        break

    page += 1
    time.sleep(0.5)

if pages:
    all_data = pd.concat(pages, ignore_index=True).iloc[:TOTAL_NEEDED]
else:
    all_data = pd.DataFrame()

# errors='ignore': the columns are absent when nothing was collected.
all_data = all_data.drop(columns=['full_name', 'purpose_name'], errors='ignore')

In [None]:
# Write the collected table to disk without the positional index column.
all_data.to_csv(path_or_buf='2GIS_data.csv', index=False)