In [1]:
import os
import time
import datetime

import requests as r
import json
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

In [2]:
# Notebook display settings: show every column and never truncate cell text.
# Full option names are used on purpose: pandas resolves short names such as
# 'max_colwidth' by pattern matching, which can become ambiguous if options
# with similar names are added in a later release.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Fetch the Moscow used-cars landing page (`r` is this notebook's alias for requests).
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as an obscure
# parsing failure further down.
response = r.get("https://auto.ru/moskva/cars/used/", timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'

In [None]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
page = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Decode the JSON text stored inside the <script id="initial-state"> element.
# NOTE(review): page.find() returns None when that element is missing, in which
# case this line fails with AttributeError on `.string`.
json_data = json.loads(page.find('script', id='initial-state').string)

In [None]:
# Brand ("mark") code of every entity in the first breadcrumbs data block.
marks = [
    entity['itemFilterParams']['mark']
    for entity in json_data['breadcrumbsPublicApi']['data'][0]['entities']
]

In [None]:
def get_params_and_headers(mark: str, page_num: int, transmission: str = ''):
    """Build the JSON payload and the HTTP headers for one listing request.

    Args:
        mark: Brand code placed in the catalog filter; its lower-cased form is
            also embedded in the Referer header.
        page_num: Number of the listing page to request.
        transmission: Optional transmission code. When empty (or None) no
            transmission filter is added to the payload.

    Returns:
        A ``(PARAMS, HEADERS)`` tuple of dicts.

    The session cookie and CSRF token default to the values that were captured
    when the notebook was written. Set the ``AUTORU_COOKIE`` and
    ``AUTORU_CSRF_TOKEN`` environment variables to use a fresh session without
    editing the code.
    """
    # Captured session values, kept as fallbacks so existing runs behave the same.
    default_csrf_token = '12d20142d10520f7b82c3dec0761a49668ff01a261b9171c'
    default_cookie = '_csrf_token=12d20142d10520f7b82c3dec0761a49668ff01a261b9171c; autoru_sid=a:g608d8c462ardieoc0p4gq330989vaks.c732a85395b1c9ec165133ee4095e6a6|1619889222514.604800.HkPE-x1SxJKvhQahdUKJSA.VSWxbotagLLIQcVL5CB-znX8JlN0JrRgEnPJcWp-cZA; autoruuid=g608d8c462ardieoc0p4gq330989vaks.c732a85395b1c9ec165133ee4095e6a6; autoru_gdpr=1; suid=82ade37eba6f1ecf225433ca6047bb66.047145ec6efb2e2ae89d4d21754e2986; from=direct; yuidlt=1; yandexuid=8117481671618082737; my=YwA=; crookie=PrzkiqRgqdN/tIYCniScgxCemcKp24LXjs81XXozOneCz4wUfylD6HFpbUGEjxb7zS1A5gUXNT3f5DG+MozPfSzHHtE=; cmtchd=MTYxOTg4OTIzNzk5OA==; gdpr=0; _ym_isad=1; _ym_uid=161988924290763830; autoru-visits-count=1; X-Vertis-DC=sas; gids=213; bltsr=1; gradius=0; from_lifetime=1619899678001; _ym_d=1619900682'

    # NOTE(review): the default token equals the `_csrf_token` entry of the
    # default cookie, so the two should presumably be overridden together.
    cookie = os.environ.get('AUTORU_COOKIE', default_cookie)
    csrf_token = os.environ.get('AUTORU_CSRF_TOKEN', default_csrf_token)

    PARAMS = {
        'catalog_filter': [{"mark": mark}],
        'section': 'used',
        'category': 'cars',
        'sort': 'fresh_relevance_1-desc',
        'page': page_num
    }

    # The transmission filter is optional; leave the key out when not requested.
    if transmission:
        PARAMS['transmission'] = transmission

    HEADERS = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'content-type': 'application/json',
        'Cookie': cookie,
        'Host': 'auto.ru',
        'origin': 'https://auto.ru',
        'Referer': f'https://auto.ru/moskva/cars/{mark.lower()}/used/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'x-client-app-version': '202002.03.092255',
        'x-client-date': '1580933207763',
        'x-csrf-token': csrf_token,
        'x-page-request-id': '60142cd4f0c0edf51f96fd0134c6f02a',
        'x-requested-with': 'fetch'
    }

    return PARAMS, HEADERS

In [None]:
# Endpoint that the scraping loop POSTs its listing queries to.
URL = 'https://auto.ru/-/ajax/desktop/listing/'

In [3]:
# Root folder for the saved listing pages; the scraping loop creates one
# "<brand>-<transmission>" sub-folder under it and the parsing loop reads them back.
PATH = './cars/'

In [4]:
# Transmission codes queried for every brand; each one is sent as the
# 'transmission' filter of the listing request.
transmissions = ['AUTOMATIC', 'ROBOT', 'VARIATOR', 'MECHANICAL']

In [None]:
# Download the listing pages of every brand/transmission pair and save the raw
# offers to the file system, one JSON file per page.
for mark in marks:
    for transmission in transmissions:
        print(mark.lower() + '-' + transmission.lower(), end=': ')

        new_path = PATH + mark.lower() + '-' + transmission.lower()

        # makedirs also creates PATH itself on a first run, where os.mkdir
        # would fail with FileNotFoundError.
        os.makedirs(new_path, exist_ok=True)

        # goes through pages 1..99 and stops at the first page with no offers
        for i in range(1, 100):
            PARAMS, HEADERS = get_params_and_headers(mark, i, transmission)
            # `requests` is imported under the alias `r` in this notebook, so
            # the bare name `requests` is undefined here.
            response = r.post(URL, json=PARAMS, headers=HEADERS, timeout=30)
            response.encoding = 'utf-8'
            offers = response.json()['offers']

            cars_count = len(offers)

            if cars_count == 0:
                break

            new_file_path = new_path + '/page-' + str(i) + '.txt'

            # Pages that are already on disk are not overwritten.
            if not os.path.exists(new_file_path):
                with open(new_file_path, 'w') as out_file:
                    out_file.write(json.dumps(offers))

            print(i, f'({cars_count})', end='  ')

        print()

In [5]:
# Load the test set, taking the first row of the file as the header.
# NOTE(review): test_df is not referenced by any later cell visible here.
test_df = pd.read_csv('./hw/test.csv', header=0)

In [6]:
# Current UTC time, captured once; the parsing loop uses its year and month
# as "now" when computing how long a car has been owned.
t = time.gmtime()

In [7]:
# Russian colour name for each `color_hex` value found in the saved offers.
colors_dict = dict([
    ('040001', 'чёрный'),
    ('FAFBFB', 'белый'),
    ('97948F', 'серебристый'),
    ('CACECB', 'серый'),
    ('0000CC', 'синий'),
    ('EE1D19', 'красный'),
    ('200204', 'коричневый'),
    ('007F00', 'зелёный'),
    ('C49648', 'бежевый'),
    ('22A0F8', 'голубой'),
    ('FFD600', 'золотистый'),
    ('660099', 'пурпурный'),
    ('4A2197', 'фиолетовый'),
    ('DEA522', 'жёлтый'),
    ('FF8649', 'оранжевый'),
    ('FFC0CB', 'розовый'),
])

In [8]:
# Russian label for each steering-wheel side code of an offer.
steering_wheel_dict = dict(LEFT='Левый', RIGHT='Правый')

In [9]:
# Russian label for each `pts` code found in an offer's documents.
pts_dict = dict(ORIGINAL='Оригинал', DUPLICATE='Дубликат')

In [10]:
# Russian fuel name for each `engine_type` code of an offer.
fuel_dict = dict(
    GASOLINE='бензин',
    DIESEL='дизель',
    HYBRID='гибрид',
    ELECTRO='электро',
    LPG='газ',
)

In [11]:
# Russian drive-type name for each `gear_type` code of an offer.
gear_type_dict = dict(
    FORWARD_CONTROL='передний',
    ALL_WHEEL_DRIVE='полный',
    REAR_DRIVE='задний',
)

In [12]:
# Russian gearbox name for each `transmission` code of an offer.
transmission_dict = dict(
    AUTOMATIC='автоматическая',
    MECHANICAL='механическая',
    ROBOT='роботизированная',
    VARIATOR='вариатор',
)

In [13]:
def get_owner_count(x):
    """Turn a numeric owner count into its Russian text label.

    Returns the label for exactly one owner, for exactly two owners, or the
    "3 or more" label for anything above two. Any other value (zero,
    negatives) gives None.
    """
    exact_labels = {1: '1\xa0владелец', 2: '2\xa0владельца'}
    if x in exact_labels:
        return exact_labels[x]

    return '3 или более' if x > 2 else None

In [14]:
def get_year_ru(x):
    """Return the Russian word for "year(s)" that agrees with the number x.

    Args:
        x: Whole number of years.

    Returns:
        'год' when x ends in 1, 'года' when it ends in 2-4, and 'лет' otherwise.
        Numbers whose last two digits are 11-14 always take 'лет'.
    """
    # The 11-14 exception applies to the last two digits of any number
    # (11, 112, 213, ...), not only to the literal values 11-14.
    if 11 <= x % 100 <= 14:
        return 'лет'

    last_digit = x % 10
    if last_digit == 1:
        return 'год'
    if 2 <= last_digit <= 4:
        return 'года'

    return 'лет'

In [15]:
def get_month_ru(m):
    """Return the Russian word for "month(s)" that agrees with the number m.

    Only the values 1 and 2-4 get special forms; every other value gets the
    general plural.
    """
    if m == 1:
        return 'месяц'

    return 'месяца' if m in (2, 3, 4) else 'месяцев'

In [16]:
def get_ownership_period(months):
    """Format a number of months as a Russian "N years and M months" string.

    Returns None when months is None or 0. Below twelve months only the month
    part is produced; from twelve months on, the year part comes first and the
    month part is appended only when the remainder is non-zero.
    """
    if months is None or months == 0:
        return None

    if months < 12:
        return f'{months} {get_month_ru(months)}'

    years, rest = divmod(months, 12)
    period = f'{years} {get_year_ru(years)}'
    if rest > 0:
        period += f' и {rest} {get_month_ru(rest)}'
    return period

In [17]:
cars_list = []

# "Now" for the ownership computation, truncated to the first day of the month.
# It does not change between offers, so it is built once before the loop.
dt_now = datetime.datetime(t.tm_year, t.tm_mon, 1)

# Read the saved listing pages back from the file system and flatten every
# offer into one record (a dict) appended to cars_list.
for f in os.scandir(PATH):
    # Only the per-brand folders hold listing pages; skip stray files.
    if not f.is_dir():
        continue

    for file in os.scandir(f.path):
        # The handle must not be named `r`: that is the notebook's alias for
        # the `requests` module, and overwriting it breaks re-running the
        # download cells.
        with open(file.path, 'r') as page_file:
            page_json = json.load(page_file)

        for car in page_json:
            vehicle_info = car['vehicle_info']
            tech_param = vehicle_info['tech_param']
            configuration = vehicle_info['configuration']
            documents = car['documents']
            price_info = car['price_info']

            model_info = vehicle_info['model_info']

            brand = vehicle_info['mark_info']['code']
            model_name = model_info['code']
            saleId = car['saleId']
            car_url = f'https://auto.ru/cars/used/sale/{brand.lower()}/{model_name.lower()}/{saleId}/'
            color = colors_dict[car['color_hex']]

            # A complectation whose id is "0" is stored as missing.
            complectation_dict = vehicle_info['complectation']
            if complectation_dict['id'] == "0":
                complectation_dict = None

            bodyType = configuration['human_name'].lower()
            # The description is optional; a missing key becomes None.
            description = car.get('description')
            # Displacement is divided by 1000 and rounded to one decimal place.
            engineDisplacement = str(round(int(tech_param['displacement']) / 1000, 1)) + ' LTR'
            enginePower = str(tech_param['power']) + ' N12'
            equipment_dict = vehicle_info['equipment']
            fuelType = fuel_dict[tech_param['engine_type']]
            mileage = car['state']['mileage']
            modelDate = vehicle_info['super_gen']['year_from']

            name = tech_param['human_name']
            numberOfDoors = configuration['doors_count']
            parsing_unixtime = round(time.time())

            # Currency and price are optional; 'RUR' is normalised to 'RUB'.
            priceCurrency = price_info.get('currency')
            if priceCurrency == 'RUR':
                priceCurrency = 'RUB'
            price = price_info.get('price')

            productionDate = documents['year']
            sell_id = car['id']
            # NOTE(review): the 'super_gen' field is filled from tech_param, not
            # from vehicle_info['super_gen'] - looks intentional; confirm.
            super_gen = tech_param
            vehicleConfiguration = f"{configuration['body_type']} {tech_param['transmission']} {engineDisplacement}"
            vehicleTransmission = transmission_dict[tech_param['transmission']]
            vendor = vehicle_info['vendor']

            if 'owners_number' in documents:
                owners_number = get_owner_count(documents['owners_number'])
            else:
                owners_number = None

            if 'purchase_date' in documents:
                purchase_date = documents['purchase_date']
                td_purchase = datetime.datetime(purchase_date['year'], purchase_date['month'], 1)
                # Whole months between purchase and now, counting both end months.
                num_months = (dt_now.year - td_purchase.year) * 12 + (dt_now.month - td_purchase.month) + 1
                ownership_period = get_ownership_period(num_months)
            else:
                ownership_period = None

            pts = pts_dict[documents['pts']] if 'pts' in documents else None

            gear_type = gear_type_dict[tech_param['gear_type']]
            steering_wheel = steering_wheel_dict[vehicle_info['steering_wheel']]
            state = 'Не требует ремонта' if car['state']['state_not_beaten'] == True else 'Требует ремонта'
            custom_cleared = 'Растаможен' if documents['custom_cleared'] == True else 'Не растаможен'

            cars_list.append({
                'brand': brand,
                'saleId': saleId,
                'car_url': car_url,
                'color': color,
                'complectation_dict': complectation_dict,
                'bodyType': bodyType,
                'description': description,
                'engineDisplacement': engineDisplacement,
                'enginePower': enginePower,
                'equipment_dict': equipment_dict,
                'fuelType': fuelType,
                'image': None,
                'mileage': mileage,
                'modelDate': modelDate,
                'model_info': model_info,
                'model_name': model_name,
                'name': name,
                'numberOfDoors': numberOfDoors,
                'parsing_unixtime': parsing_unixtime,
                'priceCurrency': priceCurrency,
                'productionDate': productionDate,
                'sell_id': sell_id,
                'super_gen': super_gen,
                'vehicleConfiguration': vehicleConfiguration,
                'vehicleTransmission': vehicleTransmission,
                'vendor': vendor,
                'Владельцы': owners_number,
                'Владение': ownership_period,
                'ПТС': pts,
                'Привод': gear_type,
                'Руль': steering_wheel,
                'Состояние': state,
                'Таможня': custom_cleared,
                'price': price
            })

In [18]:
# Number of offers read from the saved pages, before any filtering or de-duplication.
len(cars_list)

52571

In [19]:
# Build the DataFrame from the parsed records. The explicit `columns` list fixes
# the column order and keeps only the listed keys (the records' 'saleId' key is
# not listed, so it does not become a column).
cars_df = pd.DataFrame(cars_list, columns=[
    'bodyType',
    'brand', 
    'car_url', 
    'color',
    'complectation_dict',
    'description', 
    'engineDisplacement', 
    'enginePower', 
    'equipment_dict',
    'fuelType', 
    'image', 
    'mileage', 
    'modelDate', 
    'model_info', 
    'model_name',
    'name', 
    'numberOfDoors', 
    'parsing_unixtime', 
    'priceCurrency',
    'productionDate', 
    'sell_id', 
    'super_gen', 
    'vehicleConfiguration',
    'vehicleTransmission', 
    'vendor',
    'Владельцы',
    'Владение',
    'ПТС',
    'Привод',
    'Руль',
    'Состояние',
    'Таможня',
    'price'
])

In [20]:
# Drop the records where the number of owners is not specified (there are few of them, only 1).
cars_df = cars_df[cars_df['Владельцы'].notna()]

In [21]:
# Remove duplicates: keep only the first row for every sell_id.
cars_df = cars_df[~cars_df.duplicated(subset=['sell_id'])]

In [22]:
# Drop the records that have no price.
cars_df = cars_df.dropna(subset=['price'])

In [24]:
# Write the cleaned dataset to CSV, leaving the DataFrame index out of the file.
cars_df.to_csv('./hw/parsed_data.csv', index=False)