In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

## Introduction

Notebook for scraping items and seller (user) information from the Wallapop website.

## Import libraries


In [None]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Initialize variables

In [None]:
# Folder where the scraped CSV files are read from and written to.
OUTPUT_FOLDER = '../output_data/'

# Location to scrape.
user_city = 'Cuenca'
user_province = 'Cuenca'  # Province to scrape
user_region = 'Castilla-La+Mancha'  # Region to scrape
country_code = 'ES'  # Country code
user_postal_code = '16002'  # Postal code
lat = 40.07127
long = -2.13634

# Search window.
distance = 10000  # In meters
start_idx = 0  # Start index

# Previously saved item list for the location above. The file name is built
# from the location variables (same pattern the save cell uses) so that
# changing the location never silently loads another city's file.
# Set to None or '' to force a fresh scrape.
SAVED_DATA_PATH = (
    f'{OUTPUT_FOLDER}items_list_{user_city}-{user_province}-'
    f'{user_region}-{user_postal_code}-26_12.csv'
)

## Get the full item list for the selected region

In [None]:
from wallapop_scraper.scraper import get_items_per_region

In [None]:
# Reuse the previously saved item list when it is available; otherwise scrape
# the region. The file-existence check (rather than only testing that the path
# is non-empty) lets a first run fall through to the scraper instead of
# failing with FileNotFoundError.
if SAVED_DATA_PATH and os.path.isfile(SAVED_DATA_PATH):
    total_df = pd.read_csv(SAVED_DATA_PATH)
else:
    total_df = get_items_per_region(
        user_city, user_province, user_region, user_postal_code,
        lat, long, sleep=0.5, max_items=50000,
    )

print('Number of items:', len(total_df))

# Preview the first and last rows of the item list.
display(total_df.head(3))
display(total_df.tail(3))

In [None]:
# Report how many items were collected and how many distinct sellers own them.
items_on_sale = len(total_df)
print(f'Total of items on sale in {user_city}: {items_on_sale}')

# Distinct seller ids, kept as a plain list for the per-user loops below.
unique_users = total_df['user_id'].unique().tolist()
seller_count = len(unique_users)
print(f'Total of unique seller users in {user_city}: {seller_count}')

### Add the sold items of the existing users

In [None]:
from wallapop_scraper.scraper import get_user_sold_items

# Collect every sold item first and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing the frame one row at
# a time copies it on every iteration.
sold_items = []
for user_id in tqdm(unique_users):
    sold_items.extend(get_user_sold_items(user_id))

if sold_items:
    # NOTE(review): assumes each sold item is a dict (or Series) keyed by
    # column name, as the previous row-wise append required - confirm.
    total_df = pd.concat([total_df, pd.DataFrame(sold_items)], ignore_index=True)

# Remove duplicates
total_df = total_df.drop_duplicates()
print(f'Total of active and historical catched items in {user_city} after removing duplicates: {len(total_df)}')

### Add user information

In [None]:
from wallapop_scraper.scraper import get_user_info

# Columns copied from each user's info record; all start out empty.
user_info_columns = [
    'user_alias',
    'user_gender',
    'register_date',
    'user_postal_code',
    'user_city',
    'user_url',
]
for column in user_info_columns:
    total_df[column] = None

for user_id in tqdm(unique_users):
    user_info = get_user_info(user_id)

    # Nothing to copy when no info was returned for this user.
    if user_info is None:
        continue

    try:
        # Write each field onto every row that belongs to this user.
        for column in user_info_columns:
            total_df.loc[total_df.user_id == user_id, column] = user_info[column]
    except Exception as e:
        # Show the error and the offending record, then stop processing users.
        print(e)
        print(user_info)
        break
    

In [None]:
# Save dataframe to csv
dest_name = f'items_list_{user_city}-{user_province}-{user_region}-{user_postal_code}-26_12.csv'

try:
    # Create the output folder when it is missing; otherwise to_csv fails and
    # the scraped data is not written.
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    total_df.to_csv(os.path.join(OUTPUT_FOLDER, dest_name), index=False)
    print('File saved successfully')
except Exception as e:
    # Best effort: report the problem but keep the notebook (and total_df) alive.
    print(e)

## Inspect collected info

In [None]:
# Report the largest and smallest values of the distance column.
item_distances = total_df['distance']
print('Item with max distance from your target location: ', item_distances.max())
print('Item with min distance from your target location: ', item_distances.min())

In [None]:
# Find users whose alias contains any of the target substrings

target_user_names_list = ['jose']

# Keep only the rows that have a user_alias
existing_user_df = total_df[total_df.user_alias.notnull()].copy()
print(f'Number of items with user_alias: {len(existing_user_df)}')

# Compare in lowercase. astype(str) keeps a non-string alias (e.g. a purely
# numeric one parsed from the CSV) from raising on .lower().
existing_user_df['user_alias'] = existing_user_df['user_alias'].astype(str).str.lower()

# Build one alternation pattern. Each target is lowercased to match the
# lowercased aliases and escaped so characters such as '.' or '+' are matched
# literally instead of being interpreted as regex syntax.
alias_pattern = '|'.join(re.escape(name.lower()) for name in target_user_names_list)
target_df = existing_user_df[existing_user_df.user_alias.str.contains(alias_pattern, regex=True)]
print(f'Number of items with user_alias containing the target substring: {len(target_df)}')

In [None]:
# Count the non-null values per column for each user_alias, then list the
# aliases with the highest user_id count first.
alias_counts = target_df.groupby('user_alias').count()
display(alias_counts.sort_values(by='user_id', ascending=False))

In [None]:
# Show items for a certain user_alias
target_user_alias = 'jose a.'

# Aliases in existing_user_df are stored in lowercase, so lowercase the target
# before comparing.
target_df = existing_user_df[existing_user_df.user_alias == target_user_alias.lower()]

if target_df.empty:
    # Without this guard, iloc[0] raises IndexError when no alias matches.
    print(f'No items found for user alias: {target_user_alias!r}')
else:
    print('Wallapop user url: ', target_df.iloc[0]['user_url'])
    print('')
display(target_df)

In [None]:
# Show items that contain a certain word in the description

target_word = 'vino'

# Rows without a description cannot match, so drop them first.
df_with_description = total_df[total_df.description.notnull()].copy()

# regex=False matches target_word as plain text, so words containing regex
# metacharacters (e.g. 'c++' or '3.5') neither fail nor over-match.
# The match is case-sensitive.
description_matches = df_with_description.description.str.contains(target_word, regex=False)
target_df = df_with_description[description_matches].copy()

display(target_df[['title','description','user_url','user_alias']])

In [None]:
# Print the seller URL of one row of the current target_df, selected by its
# index label. The label only exists for a particular dataset and search, so
# check for it instead of failing with KeyError.
target_item_label = 6125
if target_item_label in target_df.index:
    print(target_df.loc[target_item_label, 'user_url'])
else:
    print(f'Row {target_item_label} is not in target_df; choose a label from target_df.index')