# 1. Collecting data

## 1.1 Importing useful libraries

In [1]:
from concurrent.futures import ThreadPoolExecutor
import concurrent
import json
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
import re
import time
pd.set_option('display.max_columns', None)

In [2]:
provincias_dict = {
    "north": ["coruna+a", "lugo", "ourense", "pontevedra","asturias","cantabria","alava", "guipuzcoa", "vizcaya","navarra","rioja+la"],
    "north_centre":  ["avila", "burgos", "leon", "palencia", "salamanca", "segovia", "soria", "valladolid", "zamora","huesca", "teruel", "zaragoza"],
    "south_centre": ["badajoz", "caceres","albacete", "ciudad+real", "cuenca", "guadalajara", "toledo"],
    "south": ["almeria", "cadiz", "cordoba", "granada", "huelva", "jaen", "malaga","palmas+la", "tenerife"],
    "east": ["murcia", "castellon", "valencia","girona", "lleida", "tarragona", "illes+balears"],
    "big_capitals": ['madrid','barcelona','sevilla','alicante'],
    "prueba": ['huesca','teruel']
}

## 1.2 Custom functions to collect data

In [3]:
def connect_to(link, sleeptime, max_tries=3):
    """
    Attempts to connect to a car listing. Some links are mispelled (it's the web fault),
    so we could have an error. The function tries to connect a max number of #tries.
    Args:
    * link (string): The link of the car listing.
    * sleeptime (float): The pproximated number seconds to sleep between connections.
    * max_tries (int): The max. number of times it tries to connect.
    Returns:
    * response or string: If there was a connection, the return is a response. Otherwise it returns 'Failed'.
    """

    tries = 0
    while True:
        
        try: # connecting
            response = requests.get(link)
            break
                    
        except: # error connection
            tries += 1
            if tries > max_tries:
                return 'Failed'            
            print(' '*100,end='\r')
            print(f"({tries}) Reconnecting to web...", end='\r')
            sleep(abs(np.random.normal(sleeptime+tries,0.15)))
            continue
    
    return response

In [4]:
def extract_feature(css_path,numerical,soup):
    """
    Extracts a feature from the html of a car listing.
    Args:
    * css_path (string): The css path towards the feature.
    * numerical (boolean): Equal to True if the feature is numerical, otherwise false.
    * soup (bs4.BeautifulSoup): Contains the html information of the car listing.
    Returns:
    * float or string: If the feature is numerical, it returns a float. Otherwise it returns a string.
    """
    if numerical == 'True':
        feature = soup.select(css_path)[0]
        feature = re.findall(r'\d+[.]?[,]?\d*',feature.get_text().strip())[0].replace('.','').replace(',','.')
    elif numerical == 'False':
        feature = soup.select(css_path)[0].get_text().strip()
    
    return feature

In [5]:
def get_sales_data(location,car_class,n_pages,sleeptime):
    """
    Gets the sales data for a car class in a specific location.
    Args:
    * location (string): The location (province).
    * car_class (string): The car class ('standard','sport','commercial' or '4x4').
    * n_pages (int or string): The number of pages that will be read. Use 'auto' to automatically read all.
    Returns:
    * pd.DataFrame: Table containing the sales data for the specified conditions.
    """
    
    # We create the empty database where we'll add car samples.
    data = pd.DataFrame(columns=['year','cv','km','fuel','doors','gearbox','emissions',\
                                 'color','warranty','seller','id','brand','price','boot',\
                                 'length','height','width','seats','max_sp','cmixto',\
                                'curban','extraurban','0-100','autonomy','displac','cylinders',\
                                'transmission','max_par','gear','class','location','link'])

    
    # Reading CSS Path for features from the HTML table.
    with open("../data/css_paths.txt", "r") as f:
        css_paths = json.load(f)
        
    paths = [list_css[0] for list_css in css_paths.values()] 
    # The following list states if the feature is numerical ('True'), otherwise 'False'
    paths_bool = [list_css[1] for list_css in css_paths.values()] 
    
    # Rewriting car_classes for web search compatibility.
    if car_class == 'standard':
        class_string = 'pequeno-mediano-grande-familiar-monovolumen'
    elif car_class == '4x4':
        class_string = 'coches-4x4-todoterreno'
    elif car_class == 'commercial':
        class_string = 'furgonetas-segunda-mano'
    elif car_class == 'sport':
        class_string = 'coches-deportivos-segunda-mano'
    else:
        print('ERROR. The provided class does not exist. Please choose "standard","4x4","commercial" or "sport"')
        return
    
    
    ncar = 0 # Numbers of cars added to the database.
    skipped = 0 # Number of skipped cars.
    
    # This is the baselink, from which we we'll create each page link:
    baselink = "https://www.coches.com/coches-segunda-mano/"+class_string+"-en-"+location+".htm?page="
    
    # If n_pages == 'auto', we will search 499 pages (the maximum available). Otherwise, we go for the specified number.
    max_page = 499
    if n_pages != 'auto':
        max_page = n_pages
        
    # --- ITERATION OVER EACH PAGE ---
    for i in range(max_page):
        
        # We create the link for this page
        link = baselink+str(i)      
        
        # Connecting to the web page
        while True:
            try:
                response = requests.get(link)
                break
                
            except:
                print(' '*100,end='\r')
                print('Reconnecting to next page...', end='\r')
                sleep(abs(np.random.normal(sleeptime+4,0.15)))
                continue
        
        # Extracting HTML and searching for cars posted in the page.
        soup_page = BeautifulSoup(response.content, "html.parser")
        sleep(abs(np.random.normal(sleeptime,0.15)))
        car_links = soup_page.select("html body main.content-page div#vo-results.vo-results--rebranding div.pillList.vo-results__card-list.script__vo-results-card-list div.cc-car-card.vo-results__card.pill.script__pill")
        

        # --- ITERATION OVER EACH CAR --- 
        for car_link in car_links:
            
            if car_link.a['href'] == '':
                break # If this card has no car associated, move on
            
            # Extract ID from the car
            car_id = re.findall('id=([\d]+)', car_link.a['href'])[0]
            
            # Connect to the car listing (try 2 times, otherwise move on)   
            response = connect_to(car_link.a['href'],sleeptime=sleeptime)
            if response == 'Failed':
                skipped += 1
                print(' '*100,end='\r')
                print(f'Skipping one car (total = {skipped})',end='\r')
                break
            
            # Extract HTML from the car listing
            soup = BeautifulSoup(response.content, "html.parser")
            sleep(abs(np.random.normal(sleeptime,0.15)))
            
            # Read basic features
            features_html = soup.select(".cc-car-overview")
            if len(features_html)==0:
                break # If we can't find the basic features, skip the car
                
            features = features_html[0].select("p")
            car_features = [code.get_text() for i,code in enumerate(features) if (i+1)%2==0]

            # Read the car brand from the title
            car_brand = soup.select("h1.index-card__make-model")[0].get_text()
            car_brand = re.findall(r"[\w]+",car_brand)[0]
                
            # Obtain the price as the max of the prices (we are not interested in prices associated with bank loans)
            car_price = [a.get_text() for a in soup.select("div.index-card__price-number")] 
            car_prices = [int(re.findall(r'\d+[.]*\d*',price)[0].replace(".","")) for price in car_price if len(re.findall(r'\d+[.]*\d*',price))>0]
            
            # Read additional features              
            some_features = [extract_feature(path,var_bool,soup=soup) for path, var_bool in zip(paths,paths_bool)]
            
            
            # Concatenating them to form the new list with the car features
            more_features = [car_id, car_brand, np.max(car_prices)]+some_features+[car_class, location, car_link.a['href']]
            car_features = car_features + more_features
            
            # Storing the car features in the database
            data.loc[ncar] = car_features
            ncar += 1
            
        # --- FINISHED ITERATION OVER EACH CAR --- 
        
        if n_pages == 'auto':
            
            # Check what's the next page link
            try:
                next_page = soup_page.select(".pager-next")[0].a['href']
            except:
                # sometimes we only have 1 page, and so there's no next button.In that case, we're also in the last page:
                next_page = link
            
            # Check if we reached the limit
            if next_page == link:
                break
        
        else:
            # Show progress if n_pages != 'auto'
            print(' '*100,end='\r')
            print(f'{(round((i+1)/n_pages*100,1))}% complete ({data.shape[0]} items).',end='\r')
            
    # --- FINISHED ITERATION OVER EACH PAGE ---
    
    print('                                                                                ',end='\r')
    print(f"Obtained {data.shape[0]} items of class '{car_class}' in {location.capitalize()}.",end='\r')   
    return data

In [6]:
def get_regional_data(location,n_pages,sleeptime):
    car_classes = ['sport','commercial','4x4','standard']
    
    car_data = [get_sales_data(location=location,car_class=car_class, n_pages = n_pages, sleeptime = sleeptime) for car_class in car_classes]
    
    car_data = pd.concat(car_data,axis=0).reset_index(drop=True)
    
    print(f"Regional data from {location.capitalize()} scanned succesfully for a total of {car_data.shape[0]} items.")   
    
    return car_data

In [7]:
def get_community_data(comunidad, n_pages = 'auto', sleeptime = 1.5, save_csv = False):
    provincias = provincias_dict[comunidad]
    with ThreadPoolExecutor() as executor:
        reg_cars_futures = [executor.submit(lambda x: get_regional_data(x,sleeptime = 0, n_pages = n_pages), prov) for prov in provincias]
        reg_cars = [future.result() for future in concurrent.futures.as_completed(reg_cars_futures)]
        regional_cars = pd.concat(reg_cars,axis=0).reset_index(drop=True)
    
    if save_csv == True:
        regional_cars.to_csv('../data/'+comunidad+'.csv',index=False)
    print(f"All data from {comunidad.capitalize()} saved succesfully with a total of {regional_cars.shape[0]} items.")   
     
    return regional_cars

# North

In [8]:
%%time
north = get_community_data('north', sleeptime = 0, save_csv = True)

Regional data from Rioja+la scanned succesfully for a total of 161 items.                           
Regional data from Lugo scanned succesfully for a total of 1556 items.                              
Regional data from Asturias scanned succesfully for a total of 5698 items.                          
Regional data from Alava scanned succesfully for a total of 7032 items.                             
Regional data from Guipuzcoa scanned succesfully for a total of 7344 items.                         
Regional data from Pontevedra scanned succesfully for a total of 9514 items.                        
Regional data from Coruna+a scanned succesfully for a total of 8675 items.                          
Regional data from Ourense scanned succesfully for a total of 12657 items.      
Regional data from Cantabria scanned succesfully for a total of 11166 items.    
Regional data from Navarra scanned succesfully for a total of 12084 items.      
Regional data from Vizcaya scanned succesfully for

In [9]:
north

Unnamed: 0,year,cv,km,fuel,doors,gearbox,emissions,color,warranty,seller,id,brand,price,boot,length,height,width,seats,max_sp,cmixto,curban,extraurban,0-100,autonomy,displac,cylinders,transmission,max_par,gear,class,location,link
0,2017,326 CV,94.000 km,Gasolina,2 puertas,automática secuencial,154 gr/m3,blanco,SÍ,Profesional,6830673,Bmw,35900,445,464,138,183,4,250,6.6,8.8,5.3,5.0,0,2998,6 en línea,Automática secuencial,450,8,sport,rioja+la,https://www.coches.com/coches-segunda-mano/oca...
1,2021,252 CV,29.187 km,Gasolina,2 puertas,manual,144 gr/m3,blanco,SÍ,Profesional,7477502,Alpine,65450,100,418,125,180,2,250,6.4,8.7,5.1,4.5,0,1798,4 en línea,Manual,320,7,sport,rioja+la,https://www.coches.com/coches-segunda-mano/oca...
2,01/2021,115 CV,37.502 km,Diesel,5 puertas,manual,128 gr/m3,Blanco Mineral S?lido,SÍ,Profesional,7553929,Nissan,19500,0,467,180,185,4,163,4.8,5.3,4.4,13.8,0,1461,4 en línea,Manual,260,5,commercial,rioja+la,https://www.coches.com/coches-segunda-mano/oca...
3,2017,75 CV,101.785 km,Diesel,3 puertas,manual,112 gr/m3,blanco,SÍ,Profesional,7481450,Renault,13950,0,390,181,183,2,150,4.7,5.4,4.4,16.3,0,1461,4 en línea,Manual,180,5,commercial,rioja+la,https://www.coches.com/coches-segunda-mano/oca...
4,06/2019,100 CV,89.597 km,Diesel,3 puertas,manual,106 gr/m3,Blanco,SÍ,Profesional,7444336,Citroen,19600,0,440,180,211,2,166,5.0,5.8,4.6,16.6,0,1499,4 en línea,Manual,250,5,commercial,rioja+la,https://www.coches.com/coches-segunda-mano/oca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91964,03/2017,110 CV,98.588 km,Diesel,5 puertas,manual,101 gr/m3,Plata,SÍ,Profesional,6661516,Volkswagen,16450,500,434,158,181,5,192,3.9,4.5,3.5,11.3,0,1598,4 en línea,Manual,250,5,standard,vizcaya,https://www.coches.com/coches-segunda-mano/oca...
91965,06/2016,218 CV,84.000 km,Diesel,4 puertas,automática secuencial,119 gr/m3,Gris,SÍ,Profesional,6621948,Audi,26900,480,473,143,184,5,250,4.6,4.8,4.4,6.3,0,2967,6 en V,Automática secuencial,400,7,standard,vizcaya,https://www.coches.com/coches-segunda-mano/oca...
91966,07/2017,190 CV,116.000 km,Diesel,5 puertas,automática secuencial,135 gr/m3,Negro,SÍ,Profesional,6618030,Audi,30890,480,471,139,185,4,232,5.1,6.1,4.6,7.4,0,1968,4 en línea,Automática secuencial,400,7,standard,vizcaya,https://www.coches.com/coches-segunda-mano/oca...
91967,05/2017,116 CV,52.573 km,Diesel,5 puertas,manual,99 gr/m3,Negro,SÍ,Profesional,6605901,Mini,19400,360,425,144,180,5,192,3.8,4.4,3.4,10.4,0,1496,3 en línea,Manual,270,6,standard,vizcaya,https://www.coches.com/coches-segunda-mano/oca...


# North centre

In [10]:
%%time
north_centre = get_community_data('north_centre', sleeptime = 0, save_csv = True)

Regional data from Soria scanned succesfully for a total of 31 items.           
Regional data from Teruel scanned succesfully for a total of 72 items.          
Regional data from Huesca scanned succesfully for a total of 144 items.         
Regional data from Palencia scanned succesfully for a total of 185 items.       
Regional data from Segovia scanned succesfully for a total of 221 items.        
Regional data from Salamanca scanned succesfully for a total of 220 items.      
Regional data from Zamora scanned succesfully for a total of 236 items.         
Regional data from Avila scanned succesfully for a total of 244 items.          
Regional data from Leon scanned succesfully for a total of 712 items.           
Regional data from Burgos scanned succesfully for a total of 811 items.         
Regional data from Valladolid scanned succesfully for a total of 6386 items.    
Regional data from Zaragoza scanned succesfully for a total of 12149 items.     
All data from North_centre s