# Load Packages and self-defined Functions

In [1]:
%run Functions/Pipeline_Functions.ipynb
%run Functions/Detailed_Scraping_Config.ipynb
%run logging/Logging_Functions.ipynb
%run logging/Logging_Dictionary.ipynb

# Set Scraping Parameters

See readme.txt, section "1.2) Data Acquisition - Webscrape Terminal", for further usage information.

In [2]:
# Parameters:
# Central configuration cell for one scraping run. Every value a user may
# want to tune lives here; the cells below only consume these names.

## USER PARAMETERS
# user (str): Choose an existing user for your scraping task.
user = "recent_data"

## LOGGING
# create_new_logging_df (bool): Whether to instantiate a new logging_df before
# starting the scraping process. If True, this effectively "resets" the
# scraping tasks for the current user to the values given in the parameters
# below.
create_new_logging_df = False
# start_year (int): Only used if create_new_logging_df = True. Oldest year of
# initial approval to respect when scraping.
start_year = 2005
# end_year (int): Only used if create_new_logging_df = True. Newest year of
# initial approval to respect when scraping.
end_year = 2024
# runner (int): Only used if create_new_logging_df = True. Helper variable
# that is usually set to 0.
runner = 0
# user_list (list of str): Only used if create_new_logging_df = True. All
# users that should be respected in the newly created logging_df. Make sure
# to include your own user here!
user_list = ["chris", "christopher", "janik", "chris2", "recent_data"]

# Fail fast on an inconsistent reset configuration instead of silently
# writing a logging file that does not cover the active user or year range.
if create_new_logging_df:
    if user not in user_list:
        raise ValueError(
            f"user {user!r} must be part of user_list when create_new_logging_df is True."
        )
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be greater than end_year ({end_year})."
        )

## SCRAPING LOOP PARAMETERS
# n_tries (int): Number of retries when an error occurs in the main loop
# while scraping. Errors typically arise when AutoScout24.de detects a robot
# and closes the connection. When this happens, the IP address is changed and
# the scraping is attempted an additional n_tries-1 times.
n_tries = 100
# sleep_interval (list of int): Interval from which a random float is picked
# to define the sleeping time after a request to AutoScout24.de. A random
# sleeping time is used to prevent robot detection.
sleep_interval = [1, 2]
# max_pages (int): How many pages per search request on AutoScout24.de are
# searched through. Should generally be 20 (the maximum number of pages on
# AutoScout24.de).
max_pages = 20
# print_duplicate_url (bool): Whether the URL of found duplicates should be
# printed. Only set to True for pipeline examination purposes.
print_duplicate_url = False

## RECENCY PARAMETERS
# adage (int): Maximum age in days of the observations searched on
# AutoScout24.de. Useful when deploying a new scraping task after an old one
# was completed: a low value prevents rescraping data that the last task
# already scraped. Duplicates that are scraped anyway are not added to the
# scraped data but dropped immediately, so this only makes the loop more
# efficient.
adage = 21
# use_recency (bool): Whether to scrape only recent data (using adage) or not.
use_recency = True

## VPN PARAMETERS
# area_input (list of area identifiers for NordVPN): Usually
# ['random countries europe 8'] to make sure NordVPN only connects to
# IP addresses in Europe.
area_input = ['random countries europe 8']

## SAVING PARAMETERS
# save_results_local (bool): Whether the results df should be loaded from and
# saved to a local path. Only used for testing purposes, for example when the
# cloud service is not reachable.
save_results_local = False
# do_backup (bool): If True, the pipeline creates a backup csv after each
# finished model, which is very time-intensive. Backup handling was later
# implemented within webscrape_loop_optimized() itself, which saves the
# scraped data in unwanted scenarios such as loss of internet connection or a
# keyboard interrupt.
do_backup = False
# local_data_dir (str): Machine-specific directory used only when
# save_results_local is True. Adjust this to your own machine.
local_data_dir = "C:/Users/Chris/Desktop/Data Science Project Backup/Local Scraping Data"

if save_results_local:
    # in_out_path (str): Path to the local file where scraped data is stored
    # when save_results_local is set to True.
    in_out_path = f"{local_data_dir}/{user}_data.csv"
else:
    # in_out_path (str): Relative path to where the scraped data is stored on
    # the cloud.
    in_out_path = f"scraped_data/{user}/{user}_data.csv"

# Instantiate New Logging DF if wanted

In [3]:
# Reset the logging df if requested earlier in the parameter cell.
# When create_new_logging_df is True, a fresh logging_df is built for the
# configured year range and users and written to this user's logging CSV.
# model_dictionaries is not defined in this notebook; it is expected to come
# from one of the notebooks executed via %run at the top.

if create_new_logging_df:
    logging_df = instantiate_logging_df(
        start_year=start_year,
        end_year=end_year,
        user_list=user_list,
        model_dictionaries=model_dictionaries,
    )

    # Written without the index column.
    logging_df.to_csv(f"logging/logging_data/logging_df_{user}.csv", index=False)

# Set up NordVPN Config

In [4]:
# Build the NordVPN instructions for the configured area(s). The result is
# passed to the scraping loop below as instructions_vpn.
instructions_vpn = initialize_VPN(area_input=area_input)


[33mYou're using Windows.
Performing system check...
###########################
[0m
NordVPN installation check: [92m✓[0m
NordVPN service check: [92m✓[0m
Opening NordVPN app and disconnecting if necessary...
NordVPN app launched: [92m✓[0m
#####################################

You've entered a list of connection options. Checking list...


Done!



# Run Webscrape Loop

In [5]:
# Start the main scraping loop with the parameters configured above.
# attribute_exception_list is not defined in this notebook; it is expected to
# come from one of the notebooks executed via %run at the top.
result_df = webscrape_loop_optimized(
    # who is scraping and where the data is read from / written to
    user=user,
    in_out_path=in_out_path,
    do_backup=do_backup,
    # request pacing, retries and VPN handling
    sleep_interval=sleep_interval,
    n_tries=n_tries,
    instructions_vpn=instructions_vpn,
    # search scope
    max_pages=max_pages,
    adage=adage,
    use_recency=use_recency,
    attribute_exception_list=attribute_exception_list,
    # diagnostics
    print_duplicate_url=print_duplicate_url,
)

Loaded 170913 entries.


Brand: mercedes-benz
Model: cla-(alle)
Year: 2024
Page: 1

Current Amount of total Entries: 170913!
Current Time: 2024-07-16 12:45:58
Number of total offers: 0
No offers for this year, continuing with next year!
Offers for 2024: 0, Duplicates: 0

CURRENTLY SAVING LOGGING_DF, DONT STOP!
SAVING DONE!

##################################################################


Your current ip-address is: 84.166.158.186

[34mConnecting you to United Kingdom ...
[0m

 An unknown error occurred while connecting to a different server! Retrying with a different server...


[34mConnecting you to Slovakia ...
[0m
your new ip-address is: 178.132.111.200

Done! Enjoy your new server.


Brand: mercedes-benz
Model: clk-(alle)
Year: 2005
Page: 1

Current Amount of total Entries: 170913!
Current Time: 2024-07-16 12:46:46
Number of total offers: 27
Found 20 offers on this page.


Interrupted by user. Saving data...
Data saved successfully!

