In [1]:
import pandas as pd
import requests
import gzip
from tqdm import tqdm
import io
import xml.etree.ElementTree as ET
import re

from tqdm import tqdm

In [2]:
# !pip install multiprocess --user
from multiprocess import Pool

In [3]:
# !pip install line_profiler
# %load_ext line_profiler

In [4]:
# Download the top-level sitemap index; the next cell parses it into the list
# of per-sitemap URLs. A timeout keeps the notebook from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# handing an error page to the XML parser.
response = requests.get('https://apkpure.com/sitemap.xml', timeout=60)
response.raise_for_status()
r = response.content

In [5]:
# Parse the sitemap index and keep the text of each entry's first child
# (the sitemap URL), skipping any URL that mentions one of the keywords below.
fp = io.StringIO(r.decode())
root = ET.parse(fp).getroot()
skip_keywords = ('default', 'topics', 'tag', 'group')
urls = [
    entry[0].text
    for entry in root
    if not any(keyword in entry[0].text for keyword in skip_keywords)
]

In [6]:
def extract_apps(sitemap_url, timeout=60):
    """Download one gzipped sitemap and return its app entries as a DataFrame.

    Only ``<url>`` entries whose fifth child is an image element are kept;
    entries with fewer than five children are skipped instead of raising.

    Parameters
    ----------
    sitemap_url : str
        URL of a gzip-compressed sitemap XML file.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 60).

    Returns
    -------
    pandas.DataFrame
        One row per app with the columns ``url``, ``lastmod`` (datetime),
        ``changefreq``, ``priority`` (numeric), ``image_loc``, ``name``,
        ``sitemap_url`` and ``category``. An empty, column-less DataFrame is
        returned (after printing ``error <url>``) when the sitemap cannot be
        downloaded, decompressed or parsed.
    """
    import zlib  # local import: only used to recognise corrupt gzip payloads

    try:
        response = requests.get(sitemap_url, timeout=timeout)
        # Treat HTTP error statuses like any other failed download.
        response.raise_for_status()
        payload = response.content
    except requests.RequestException:
        print('error', sitemap_url)
        return pd.DataFrame()

    try:
        # Parsing the raw bytes lets the XML parser honour the declared encoding.
        sitemap_root = ET.fromstring(gzip.decompress(payload))
    except (OSError, EOFError, zlib.error, ET.ParseError):
        # A corrupt or non-gzip payload must not abort the whole worker pool.
        print('error', sitemap_url)
        return pd.DataFrame()

    records = []
    for entry in sitemap_root:
        # Keep only entries that carry an image element as their fifth child.
        if len(entry) < 5 or 'image' not in entry[4].tag:
            continue
        image = entry[4]
        records.append({
            'url': entry[0].text,
            'lastmod': entry[1].text,
            'changefreq': entry[2].text,
            'priority': entry[3].text,
            'image_loc': image[0].text if len(image) > 0 else None,
            'name': image[1].text if len(image) > 1 else None,
        })

    df = pd.DataFrame(
        records,
        columns=['url', 'lastmod', 'changefreq', 'priority', 'image_loc', 'name'],
    )
    df['lastmod'] = pd.to_datetime(df['lastmod'])
    df['priority'] = pd.to_numeric(df['priority'])
    df['sitemap_url'] = sitemap_url

    # The category is the leading run of word characters in the file name,
    # e.g. "game_action-97.xml.gz" -> "game_action".
    category_match = re.search(r'(\w+)', sitemap_url.split('/')[-1])
    df['category'] = category_match.group(1) if category_match else None

    return df

In [7]:
# Scrape every sitemap with 32 worker processes; results arrive in completion
# order (imap_unordered), so row order in the combined frame is not stable.
with Pool(32) as p:
    progress = tqdm(p.imap_unordered(extract_apps, urls), total=len(urls))
    df_list = list(progress)

print('ok')
# Stack the per-sitemap frames into one table with a fresh 0..n-1 index.
metadata = pd.concat(df_list, ignore_index=True)

100%|██████████| 7774/7774 [06:52<00:00, 18.85it/s]


ok


In [8]:
# %lprun -f extract_apps [extract_apps(url) for url in urls[:20]]

In [9]:
# (rows, columns) of the combined metadata table before any cleaning.
metadata.shape

(7744702, 8)

## Cleaning

In [10]:
# True when at least one URL appears more than once. The vectorized .any()
# avoids iterating the whole Series in Python; bool() keeps the displayed
# value a plain Python bool.
bool(metadata['url'].duplicated().any())

True

In [11]:
# Show every row for this app name to inspect the duplicated entry.
metadata.loc[metadata['name'] == 'Vendetta Miami Police Simulator 2019']

Unnamed: 0,url,lastmod,changefreq,priority,image_loc,name,sitemap_url,category
1125303,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,weekly,0.6,https://image.winudf.com/v2/image1/Y29tLmZvcnR...,Vendetta Miami Police Simulator 2019,https://apkpure.com/sitemaps/comics-15.xml.gz,comics
6551267,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,weekly,0.6,https://image.winudf.com/v2/image1/Y29tLmZvcnR...,Vendetta Miami Police Simulator 2019,https://apkpure.com/sitemaps/game_action-97.xm...,game_action


In [12]:
# Drop the copy of this app that is listed under the 'comics' category,
# keeping its other row.
is_vendetta = metadata['name'] == 'Vendetta Miami Police Simulator 2019'
is_comics = metadata['category'] == 'comics'
metadata = metadata[~(is_vendetta & is_comics)]

In [13]:
# Re-check the rows for this app name after the filter.
metadata.loc[metadata['name'] == 'Vendetta Miami Police Simulator 2019']

Unnamed: 0,url,lastmod,changefreq,priority,image_loc,name,sitemap_url,category
6551267,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,weekly,0.6,https://image.winudf.com/v2/image1/Y29tLmZvcnR...,Vendetta Miami Police Simulator 2019,https://apkpure.com/sitemaps/game_action-97.xm...,game_action


In [14]:
# (rows, columns) of the metadata table after dropping the duplicate row.
metadata.shape

(7744701, 8)

In [15]:
# Split each URL from the right on its last two slashes into three pieces:
# everything before (domain), the name slug, and the final path segment (dev).
more_data = metadata['url'].str.rsplit('/', n=2, expand=True)
more_data = more_data.rename(columns={0: 'domain', 1: 'name_slug', 2: 'dev'})
more_data.head()

Unnamed: 0,domain,name_slug,dev
0,https://apkpure.com,wallpaper-i-love-cats-theme,jp.co.a_tm.android.plus_i_love_cats
1,https://apkpure.com,%D9%82%D9%81%D9%84-%D8%A7%D9%84%D9%87%D8%A7%D8...,com.zamalek.lockscreen.app
2,https://apkpure.com,doll-making,com.craftsapps.dollmaking
3,https://apkpure.com,how-to-draw-villians,com.drawvilains.villainsdraw
4,https://apkpure.com,christmas-wallpaper-theme,dq.christmas.hdwallpaper


In [16]:
# Attach the URL-derived columns (all except 'domain') alongside the existing
# metadata columns, aligned on the row index.
url_parts = more_data.drop(columns='domain')
metadata = pd.concat([metadata, url_parts], axis=1)

In [17]:
# List rows whose (name, dev) pair repeats an earlier row.
metadata[metadata.duplicated(subset=['name', 'dev'])]

Unnamed: 0,url,lastmod,changefreq,priority,image_loc,name,sitemap_url,category,name_slug,dev


In [18]:
# Write the cleaned metadata table to CSV; index=False leaves the row index out of the file.
metadata.to_csv('../data/metadata.csv', index=False)