# Wikiaves Registry Scraper

Author: Danilo Lessa Bernardineli (danilo.bernardineli@usp.br)

This is a notebook for retrieving all the information in a given range of registries. If used properly, it can be used to scrape all of Wikiaves.

Here I use a simple form of parallelization through pandarallel, which can increase the scraping speed by a factor of 5-20x.

In [3]:
# Dependences

import requests as req
import json
import pandas as pd
import numpy as np
import random
from pandarallel import pandarallel
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

In [4]:
# Register the pandas extensions used further down:
#   - pandarallel adds the `parallel_*` methods (with a progress bar)
#   - tqdm adds the `progress_apply` method

pandarallel.initialize(progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [5]:
# Scraping parameters

# Range of user ids to scrape. As written, `range` makes the upper bound
# exclusive: ids min_user_id .. max_user_id - 1 are requested.
# NOTE(review): the output file name embeds both bounds - confirm whether
# max_user_id is meant to be included.
min_user_id, max_user_id = 1, 10
output_path = "wikiaves_authors_{}-{}.pkl".format(min_user_id, max_user_id)

# Endpoint and base query string shared by every request
URI = "https://www.wikiaves.com.br/getRegistrosJSON.php"
params = {
    "tm": "f",
    "t": "u",
    "o": "dp",
    "desc": 1,
    "p": 10,
}

# One entry per user id, then shuffled so requests are not issued in id order
user_ids = pd.Series(list(range(min_user_id, max_user_id)))
shuffled_labels = np.random.permutation(user_ids.index)
user_ids = user_ids.reindex(shuffled_labels)

In [6]:
# Scraping functions

def get_user_registries(user_id: int) -> "pd.DataFrame | None":
    """
    Gets a DataFrame containing all observation registries for a
    given user_id.

    Pages are requested from `URI` one at a time, starting at page 1,
    until a page comes back with no items or the response body is not
    valid JSON.

    Parameters
    ----------
    user_id : int
        Value sent as the "u" query parameter.

    Returns
    -------
    pd.DataFrame or None
        The non-empty pages concatenated, plus a `user_id` column.
        If the very first page has no items, that empty page is returned
        with the `user_id` column added. If no page could be decoded at
        all, None is returned.
    """
    # Work on a per-call copy so the module-level `params` dict is never
    # mutated (it is shared by every call).
    query = dict(params, u=user_id)
    pages = []
    empty_page = None
    pag_num = 1
    while True:
        query["p"] = pag_num
        r = req.get(URI, query)
        try:
            raw_data = json.loads(r.text)
        except json.JSONDecodeError:
            # Not JSON: treat as the end of what can be retrieved
            break
        pag_data = pd.DataFrame(raw_data["registros"]["itens"]).T
        if pag_data.size < 1:
            # Terminal page: remember it but keep it out of `pages`, so the
            # concat below only sees frames that share the same columns.
            empty_page = pag_data
            break
        pages.append(pag_data)
        pag_num += 1
    if pages:
        return pd.concat(pages).assign(user_id=user_id)
    if empty_page is not None:
        return empty_page.assign(user_id=user_id)
    return None


def get_profile_hometown(profile: str) -> "str | None":
    """
    Gets the author hometown for a given profile identifier.

    The profile page is downloaded and the last anchor carrying the
    'm-card-profile__email m-link' class is inspected; whatever follows
    "?c=" in its href is returned.

    Parameters
    ----------
    profile : str
        Identifier appended to the "perfil_" page URL.

    Returns
    -------
    str or None
        The text after "?c=" in the matching link, or None when the page
        has no such link or the link has no href.
    """
    profile_URI = "https://www.wikiaves.com.br/perfil_{}".format(profile)
    r = req.get(profile_URI)
    soup = BeautifulSoup(r.text, 'html.parser')
    links = soup.find_all("a", {"class": 'm-card-profile__email m-link'})
    try:
        href = links[-1]["href"]
    except (IndexError, KeyError):
        # IndexError: no matching anchor; KeyError: anchor without href.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    # NOTE(review): if "?c=" is absent the whole href is returned - confirm
    # that every hometown link carries this query parameter.
    return href.split("?c=")[-1]

In [7]:
# Scrape every user id in turn (tqdm shows the progress), stack the
# per-user frames and report how long the whole run took, in seconds.
time1 = time.time()
data = pd.concat(list(user_ids.progress_apply(get_user_registries)))
data = data.set_index("user_id")
time2 = time.time()
print(time2 - time1)

100%|██████████| 9/9 [02:18<00:00, 15.42s/it]

138.78598403930664



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [11]:
# Checkpoint the scraped registries to disk
data.to_pickle(path="temp.pkl")

In [8]:
# One row per distinct profile found in the registries; the hometown id of
# each profile is looked up in parallel through pandarallel.
profiles = (
    pd.DataFrame({"perfil": data.perfil.unique()})
    .assign(hometown_ids=lambda frame: frame.parallel_applymap(get_profile_hometown))
    .set_index("perfil")
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [None]:
# Attach each registry's author hometown by matching its "perfil" column
# against the index of `profiles`
final_data = data.join(other=profiles, on="perfil")

In [None]:
# Persist the joined table under the name derived from the scraped id range
final_data.to_pickle(path=output_path)

In [22]:
# Leftover scratch cell: prints a fixed string and uses nothing from the
# scraping pipeline. Safe to delete before sharing the notebook.
print("oi")

oi


In [26]:
# Leftover scratch cell: prints a fixed string and uses nothing from the
# scraping pipeline. Safe to delete before sharing the notebook.
print("aaaa")

aaaa
