In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from ast import literal_eval

In [2]:
# Column store for the scraped ads: every field name maps to its own empty list.
# The comprehension creates a separate list per key and keeps the key order below.
cars = {
    column: []
    for column in (
        'category_id', 'mark_id', 'mark', 'model_id', 'model',
        'modification', 'year', 'price', 'currency', 'mileage_str',
        'mileage', 'engine', 'engine_volume', 'fuel_id', 'fuel',
        'transmission', 'body_type_id', 'body_type', 'color_id', 'location',
        'damaged', 'repaired', 'confiscated', 'date_added', 'link',
    )
}

In [3]:
# Copy URL from source website for particular brand and model
search_url='https://auto.ria.com/search/?categories.main.id=1&brand.id[0]=24&model.id[0]=240&price.currency=1&abroad.not=0&custom.not=1&page='
# Upper bound on the number of result pages to request
max_pages=300


def _fetch_soup(url):
    """Download `url` and parse it with lxml, closing the HTTP response afterwards."""
    with urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')


def _parse_listing(section):
    """Return the fields one `ticket-item` section exposes on the search page.

    Every key is always present; a value stays None when the expected tag or
    attribute is missing, so a malformed ad cannot desynchronise the columns.
    """
    row = dict.fromkeys((
        'mark', 'model', 'year', 'price', 'currency', 'mileage_str',
        'location', 'engine', 'transmission', 'date_added', 'link',
    ))
    # Mark, model and year: recorded exactly once per ad (first matching hidden div)
    for tag in section.find_all('div', {"class": "hide"}):
        if tag.get("data-mark-name") is not None:
            row['mark'] = tag.get("data-mark-name")
            row['model'] = tag.get("data-model-name")
            row['year'] = tag.get("data-year")
            break
    # Price and currency
    tag_price = section.find('div', {"class": "price-ticket"})
    if tag_price is not None:
        row['price'] = tag_price.get("data-main-price")
        row['currency'] = tag_price.get("data-main-currency")
    # Mileage, location, engine, transmission are read by position;
    # a short list is padded with None instead of raising IndexError
    chars = [tag.get_text() for tag in section.find_all('li', {"class": "item-char"})]
    chars += [None] * (4 - len(chars))
    row['mileage_str'], row['location'], row['engine'], row['transmission'] = chars[:4]
    # Date added: the last span carrying data-add-date wins
    for tag in section.find_all('span'):
        if tag.get("data-add-date") is not None:
            row['date_added'] = tag.get("data-add-date")
    # Link
    tag_link = section.find('a', {"class": "m-link-ticket"})
    if tag_link is not None:
        row['link'] = tag_link.get('href')
    return row


def _parse_details(link):
    """Return the fields that are only available on the individual ad page.

    All values default to None, so an ad whose page cannot be fetched or has no
    matching <script> tag never inherits the values of the previous ad.
    """
    row = dict.fromkeys((
        'category_id', 'mark_id', 'model_id', 'modification', 'engine_volume',
        'fuel_id', 'fuel', 'mileage', 'damaged', 'repaired', 'confiscated',
        'body_type_id', 'body_type', 'color_id',
    ))
    if not link:
        return row
    try:
        soup_details = _fetch_soup(link)
    except OSError as error:
        # urllib's URLError/HTTPError derive from OSError; one dead ad page
        # should not abort the whole scrape
        print('Could not load ad page', link, '-', error)
        return row
    for script in soup_details.find_all('script'):
        if script.get('data-mark-name') is None:
            continue
        # The original looked up 'date-category-id' / 'date-fuel' (typo for
        # 'data-'); the old spelling is kept only as a fallback
        row['category_id'] = script.get('data-category-id', script.get('date-category-id'))
        row['mark_id'] = script.get('data-marka-id')
        row['model_id'] = script.get('data-model-id')
        row['modification'] = script.get('data-modification')
        row['engine_volume'] = script.get('data-engine-volume')
        row['fuel_id'] = script.get('data-fuel', script.get('date-fuel'))
        row['mileage'] = script.get('data-race')
        row['damaged'] = script.get('data-damaged')
        row['repaired'] = script.get('data-repair')
        row['confiscated'] = script.get('data-confiscated')
        row['body_type_id'] = script.get('data-body-id')
        row['color_id'] = script.get('data-color-id')
        # Parsing nested dict to get titles for fuel type and body type
        # (evaluated once; a missing or unparsable attribute yields None titles)
        try:
            dfp_data = literal_eval(script.get('data-dfp-data') or '{}')
        except (ValueError, SyntaxError):
            dfp_data = {}
        if not isinstance(dfp_data, dict):
            dfp_data = {}
        row['fuel'] = dfp_data.get('fuel')
        row['body_type'] = dfp_data.get('vehicle_body')
    return row


# Opening each page of ads
for p in range(max_pages):
    soup = _fetch_soup(search_url + str(p))
    sections = soup.find_all('section', {"class": "ticket-item"})
    if not sections:
        # No ads on this page: presumably past the last page of results,
        # so stop instead of requesting the remaining pages
        break
    for s in sections:
        row = _parse_listing(s)
        # Going to individual ads pages to get info unavailable on search page
        row.update(_parse_details(row['link']))
        # Append the finished row in one go so every column keeps the same
        # length even if a later ad raises
        for column, value in row.items():
            cars[column].append(value)

In [5]:
# Turn the column dictionary into a table: each key of `cars` becomes one column
df = pd.DataFrame(cars)

In [6]:
# Preview the first ten rows to confirm the scrape filled the table
df.head(n=10)

Unnamed: 0,category_id,mark_id,mark,model_id,model,modification,year,price,currency,mileage_str,...,transmission,body_type_id,body_type,color_id,location,damaged,repaired,confiscated,date_added,link
0,,24,Ford,240,Focus,SE,2014,7999,USD,154 тыс. км,...,Автомат,3,sedan,8,Днепр (Днепропетровск),0,0,0,2019-11-17 12:12:13,https://auto.ria.com/auto_ford_focus_25324085....
1,,24,Ford,240,Focus,FLEX FUEL,2014,9650,USD,87 тыс. км,...,Автомат,3,sedan,13,Одесса,0,0,0,2019-11-16 23:14:06,https://auto.ria.com/auto_ford_focus_25559989....
2,,24,Ford,240,Focus,SE,2014,10300,USD,96 тыс. км,...,Автомат,4,khetchbek,2,Киев,0,0,0,2019-11-15 14:42:23,https://auto.ria.com/auto_ford_focus_25670087....
3,,24,Ford,240,Focus,,2017,10800,USD,26 тыс. км,...,Автомат,3,sedan,0,Тернополь,0,0,0,2019-11-14 15:06:07,https://auto.ria.com/auto_ford_focus_25456273....
4,,24,Ford,240,Focus,SE,2018,12400,USD,6 тыс. км,...,Автомат,4,khetchbek,8,Одесса,0,0,0,2019-11-15 21:40:10,https://auto.ria.com/auto_ford_focus_25528755....
5,,24,Ford,240,Focus,SE,2018,12750,USD,56 тыс. км,...,Автомат,3,sedan,13,Киев,0,0,0,2019-10-31 20:26:10,https://auto.ria.com/auto_ford_focus_25553043....
6,,24,Ford,240,Focus,,2014,9050,USD,133 тыс. км,...,Автомат,4,khetchbek,2,Винница,0,0,0,2019-11-16 08:13:54,https://auto.ria.com/auto_ford_focus_25656761....
7,,24,Ford,240,Focus,,2010,6650,USD,197 тыс. км,...,Ручная / Механика,2,universal,2,Ровно,0,0,0,2019-11-16 17:36:05,https://auto.ria.com/auto_ford_focus_25678764....
8,,24,Ford,240,Focus,SE,2016,9399,USD,60 тыс. км,...,Автомат,3,sedan,2,Киев,0,0,0,2019-11-15 18:02:59,https://auto.ria.com/auto_ford_focus_25671860....
9,,24,Ford,240,Focus,SE,2015,10700,USD,98 тыс. км,...,Автомат,4,khetchbek,8,Киев,0,0,0,2019-11-16 13:45:01,https://auto.ria.com/auto_ford_focus_25458867....


In [7]:
# Persist the raw scrape for later notebooks. pandas writes the row index as the
# first, unnamed column by default, so read it back with index_col=0.
df.to_csv(path_or_buf="data_raw.csv")