In [1]:
import time
import numpy as np
import pandas as pd
import pickle

from MotoScraper import MotoDownloader

Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://www.2dehands.be/ - IP Rotate API' (0 new).


<h2> Update the database with new observations and retrieve all the information for each motorcycle </h2>
<p> We use each motorcycle's ID to scrape its page individually, so around 8000 pages will be scraped. </p>

In [3]:
# Read the pickled list of motorcycle IDs that was saved to 'list_all_id.ob'.
with open('list_all_id.ob', 'rb') as id_file:
    list_all_id = pickle.load(id_file)

# Load the motorcycle database accumulated so far.
database = pd.read_csv('database_moto_updated.csv')

In [4]:
# Find the motorcycles missing from the database: wrap the fresh IDs from list_all_id
# in a one-column frame and keep only the rows whose 'id' is absent from the database.
df_all_id = pd.DataFrame({'id': list_all_id})
cond = df_all_id['id'].isin(database['id'])
moto_id_missing = df_all_id.loc[~cond]

In [5]:
# Report the size of the online listing, of the database, and of the gap between them.
print(
    f'There are {len(list_all_id)} motorcycles available online.',
    f'We have {len(database)} Motorcycles in our database.',
    f'And we could add {len(moto_id_missing)} motorcycles to our database.',
    sep='\n',
)

There are 7196 motorcycles available online.
We have 14125 Motorcycles in our database.
And we could add 13 motorcycles to our database.


In [6]:
# IDs that could not be retrieved in earlier runs are kept in 'went_wrong.ob'
# (mostly listings that are no longer online: sold, withdrawn by the advertiser, ...).
with open('went_wrong.ob', 'rb') as wrong_file:
    went_wrong = pickle.load(wrong_file)
print(f'However, some motorcycles are not available anymore and they account for {len(went_wrong)} motorcycles')

# Remove those known failures from the candidates so they are not requested again.
moto_id_missing = moto_id_missing.loc[~moto_id_missing['id'].isin(went_wrong)]
print(f'Therefore, we can still retrieve {len(moto_id_missing)} motorcycles')

However, some motorcycles are not available anymore and they account for 692 motorcycles
Therefore, we can still retrieve 0 motorcycles


In [11]:
# Scrape every missing listing. Each success contributes one parsed result to
# all_motorcycle_characteristics; each failure has its URL recorded in went_wrong.
went_wrong = []
all_motorcycle_characteristics = []

for url in moto_id_missing['id']:
    downloader = MotoDownloader()
    downloader.get_soup(moto_url=url)

    if downloader.request.status_code != 200:
        # Anything other than HTTP 200 is treated as a failed retrieval.
        print(f'something went wrong with the following moto id: {url}')
        went_wrong.append(url)
    else:
        # The extraction steps are kept in their original order; presumably each one
        # stores its result on the downloader for parse_all() to collect -- TODO confirm.
        downloader.get_main_attributes()
        downloader.get_extra_attributes()
        downloader.get_text()
        downloader.get_brand()
        downloader.get_id()
        downloader.get_extracted_date()
        downloader.get_price()
        downloader.get_location()
        downloader.get_seller_name()
        downloader.get_image()
        all_motorcycle_characteristics.append(downloader.parse_all())

    # Let the platform recover: pause for a random 0.8-1.8 seconds between requests.
    time.sleep(np.random.uniform(low=0.8, high=1.8))

something went wrong with the following moto id: https://www.2dehands.be/m1832386249
something went wrong with the following moto id: https://www.2dehands.be/m1832328804
something went wrong with the following moto id: https://www.2dehands.be/m1832529636
something went wrong with the following moto id: https://www.2dehands.be/m1833106341
something went wrong with the following moto id: https://www.2dehands.be/m1831933927
something went wrong with the following moto id: https://www.2dehands.be/m1832682504
something went wrong with the following moto id: https://www.2dehands.be/m1832835750
something went wrong with the following moto id: https://www.2dehands.be/m1831373449
something went wrong with the following moto id: https://www.2dehands.be/m1832740194
something went wrong with the following moto id: https://www.2dehands.be/m1832799048
something went wrong with the following moto id: https://www.2dehands.be/m1833276719
something went wrong with the following moto id: https://www.2deh

In [12]:
# Combine the per-motorcycle frames collected by the scraping loop into one frame.
# pd.concat raises "ValueError: No objects to concatenate" on an empty list, which
# happens whenever there was nothing left to retrieve or every request failed.
# In that case fall back to an empty frame with the database's columns and dtypes,
# so the merge and save steps below still run and leave the database unchanged.
if all_motorcycle_characteristics:
    fresh_df = pd.concat(all_motorcycle_characteristics)
else:
    fresh_df = database.iloc[:0].copy()

<h2> Merging the "fresh" motorcycles into the existing database </h2>

In [13]:
# Stack the freshly scraped rows under the current database, then keep a single row
# per 'id' (the first occurrence) in case a motorcycle ended up in both.
df_together = pd.concat([database, fresh_df]).drop_duplicates('id')

In [14]:
# Persist the merged database, overwriting the CSV that was loaded earlier;
# the DataFrame index is not written.
df_together.to_csv(path_or_buf='database_moto_updated.csv', index=False)

In [15]:
# Merge the IDs that failed during this run into the persisted list of failed IDs.
with open('went_wrong.ob', 'rb') as fp:
    previously_wrong = pickle.load(fp)

# De-duplicate while keeping first-seen order. Without this, re-running the cell
# (or failing on the same ID twice) stores the ID again, so the file keeps growing
# and the "not available anymore" count printed earlier is inflated.
# dict.fromkeys preserves insertion order, so the result stays a stable list.
new_wrong_list = list(dict.fromkeys(previously_wrong + went_wrong))

# SAVE THE WRONG_LIST
with open('went_wrong.ob', 'wb') as fp:
    pickle.dump(new_wrong_list, fp)