In [1]:
import pandas as pd
import requests
import gzip
from tqdm import tqdm
import io
import xml.etree.ElementTree as ET
import re
from sqlalchemy import create_engine

from tqdm import tqdm

In [2]:
# !pip install multiprocess --user
from multiprocess import Pool

# !pip install pyarrow --user

In [3]:
# !pip install line_profiler
# %load_ext line_profiler

In [4]:
# Download the top-level sitemap index. The timeout prevents an indefinite hang, and
# raise_for_status() stops an HTTP error page from being handed to the XML parser.
sitemap_index_response = requests.get('https://apkpure.com/sitemap.xml', timeout=60)
sitemap_index_response.raise_for_status()
r = sitemap_index_response.content

In [5]:
# Parse the sitemap index and take the first child of every entry as a sitemap URL.
fp = io.StringIO(r.decode())
root = ET.parse(fp).getroot()
urls = [c[0].text for c in root]
# Discard any sitemap whose URL contains one of these substrings.
urls = [
    u for u in urls
    if not any(word in u for word in ('default', 'topics', 'tag', 'group'))
]

### XML file sizes

In [6]:
# def extract_size(sitemap_url):
#     try:
#         r = requests.get(sitemap_url, stream=True).headers['Content-length']
#     except:
#         print('error', sitemap_url)
#         return pd.DataFrame()
    
#     return r
# with Pool(32) as p:
#     ls = list(tqdm(p.imap_unordered(extract_size, urls), total=len(urls)))

# print('ok')


# len(ls)  # 7774
# sum(int(i) for i in ls) / 1e9  # 2.35G

In [7]:
def extract_apps(sitemap_url):
    """Download one gzipped sitemap and return its app entries as a DataFrame.

    Parameters
    ----------
    sitemap_url : str
        URL of a gzip-compressed sitemap XML file. The leading run of word
        characters in its final path segment becomes the ``category`` value.

    Returns
    -------
    pandas.DataFrame
        Columns ``url``, ``lastmod`` (parsed with ``pd.to_datetime``), ``name``
        and ``category``; one row per entry whose fifth child is an image
        element. An empty, column-less frame is returned (after printing
        ``error`` and the URL) when the download, decompression or XML parsing
        fails, so a single bad sitemap cannot abort a whole batch of calls.
    """
    import zlib  # function-scope: only needed to name the corrupt-stream error

    try:
        # A timeout keeps one stalled connection from blocking the caller forever.
        response = requests.get(sitemap_url, timeout=60)
        # Reject 4xx/5xx responses here instead of feeding an error page to gzip.
        response.raise_for_status()
        payload = response.content
    except Exception as exc:  # network/HTTP failure; KeyboardInterrupt still propagates
        print('error', sitemap_url, repr(exc))
        return pd.DataFrame()

    try:
        # Parsing the raw bytes lets the XML declaration decide the encoding.
        sitemap_root = ET.fromstring(gzip.decompress(payload))
    except (OSError, EOFError, zlib.error, ET.ParseError) as exc:
        print('error', sitemap_url, repr(exc))
        return pd.DataFrame()

    # Children are read by position: loc, lastmod, changefreq, priority, image.
    # Entries with fewer than five children cannot carry an image and are skipped.
    apps = [
        app for app in sitemap_root
        if len(app) > 4 and 'image' in app[4].tag
    ]

    df = pd.DataFrame({
        'url': [a[0].text for a in apps],
        'lastmod': [a[1].text for a in apps],
        # The app name is the second child of the image element; None when absent.
        'name': [a[4][1].text if len(a[4]) > 1 else None for a in apps],
    })
    df['lastmod'] = pd.to_datetime(df['lastmod'])

    # Category = first run of word characters in the sitemap's file name.
    match = re.search(r'(\w+)', sitemap_url.split('/')[-1])
    df['category'] = match.group(1) if match else None

    return df

In [8]:
# Download and parse every sitemap using 16 worker processes. imap_unordered hands
# back each frame as soon as it is ready, so df_list is not ordered like urls.
with Pool(16) as pool:
    df_list = list(tqdm(pool.imap_unordered(extract_apps, urls), total=len(urls)))

print('ok')
# Stack the per-sitemap frames into one table with a fresh 0..n-1 index.
metadata = pd.concat(df_list, ignore_index=True)

100%|██████████| 7774/7774 [09:08<00:00, 14.18it/s]


ok


In [9]:
# %lprun -f extract_apps [extract_apps(url) for url in urls[:20]]

In [10]:
# Size of the combined table as (rows, columns).
metadata.shape

(7744702, 4)

## Cleaning and Processing

In [11]:
# Preview the first rows of the combined table.
metadata.head()

Unnamed: 0,url,lastmod,name,category
0,https://apkpure.com/diy-bookshelf-design/com.D...,2017-04-07 20:20:28+00:00,DIY Bookshelf Design,art_and_design
1,https://apkpure.com/diy-bracelet-tutorials/com...,2017-03-24 13:59:31+00:00,DIY Bracelet Tutorials,art_and_design
2,https://apkpure.com/wpap-design-ideas/com.wpap...,2018-05-13 11:52:38+00:00,WPAP Design Ideas,art_and_design
3,https://apkpure.com/kids-jigsaw-puzzles-wooden...,2019-02-23 16:13:34+00:00,Kids jigsaw puzzles - Wooden puzzle,art_and_design
4,https://apkpure.com/text-on-photo-editor/ccom....,2017-01-10 17:14:35+00:00,Text on Photo Editor,art_and_design


In [12]:
# True when at least one URL appears in more than one row.
bool(metadata['url'].duplicated().any())

True

In [13]:
# Look up one app by name; the recorded output lists it under two categories.
metadata[(metadata['name'] == 'Vendetta Miami Police Simulator 2019')]

Unnamed: 0,url,lastmod,name,category
1125304,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,Vendetta Miami Police Simulator 2019,comics
6553267,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,Vendetta Miami Police Simulator 2019,game_action


In [14]:
# Keep every row except the 'comics' copy of this app, which is also listed
# under 'game_action'.
metadata = metadata[
    (metadata['name'] != 'Vendetta Miami Police Simulator 2019')
    | (metadata['category'] != 'comics')
]

In [15]:
# Repeat the lookup to confirm that only one row for this app remains.
metadata[(metadata['name'] == 'Vendetta Miami Police Simulator 2019')]

Unnamed: 0,url,lastmod,name,category
6553267,https://apkpure.com/vendetta-miami-police-simu...,2019-05-17 11:24:48+00:00,Vendetta Miami Police Simulator 2019,game_action


In [16]:
# The row count should be one lower now that the duplicate row is gone.
metadata.shape

(7744701, 4)

In [17]:
# Split each URL on its last two slashes into site root, name slug and package id.
more_data = (
    metadata['url']
    .str.rsplit('/', n=2, expand=True)
    .rename(columns={0: 'domain', 1: 'name_slug', 2: 'package'})
)
more_data.head()

Unnamed: 0,domain,name_slug,package
0,https://apkpure.com,diy-bookshelf-design,com.DIYBookshelfDesign.vanessastudio
1,https://apkpure.com,diy-bracelet-tutorials,com.DIYBraceletTutorial.vanessastudio
2,https://apkpure.com,wpap-design-ideas,com.wpap.anonymais
3,https://apkpure.com,kids-jigsaw-puzzles-wooden-puzzle,com.color.pokemon
4,https://apkpure.com,text-on-photo-editor,ccom.textonphotos.writeonphoto.free


In [18]:
# Attach the URL-derived columns side by side; rows are matched on the index.
metadata = pd.concat([metadata, more_data], axis=1)

In [19]:
# Drop 'domain' and the original 'url'; 'name_slug' and 'package' are kept.
metadata = metadata.drop(columns=['domain', 'url'])

In [20]:
# Preview the table after replacing 'url' with its parts.
metadata.head()

Unnamed: 0,lastmod,name,category,name_slug,package
0,2017-04-07 20:20:28+00:00,DIY Bookshelf Design,art_and_design,diy-bookshelf-design,com.DIYBookshelfDesign.vanessastudio
1,2017-03-24 13:59:31+00:00,DIY Bracelet Tutorials,art_and_design,diy-bracelet-tutorials,com.DIYBraceletTutorial.vanessastudio
2,2018-05-13 11:52:38+00:00,WPAP Design Ideas,art_and_design,wpap-design-ideas,com.wpap.anonymais
3,2019-02-23 16:13:34+00:00,Kids jigsaw puzzles - Wooden puzzle,art_and_design,kids-jigsaw-puzzles-wooden-puzzle,com.color.pokemon
4,2017-01-10 17:14:35+00:00,Text on Photo Editor,art_and_design,text-on-photo-editor,ccom.textonphotos.writeonphoto.free


In [21]:
# Renumber the rows from 0 (the old index is discarded) and put the columns in their final order.
metadata = metadata.reset_index(drop=True)[['package', 'name', 'category', 'name_slug', 'lastmod']]

In [22]:
# List rows whose (name, package) pair repeats an earlier row; an empty result means there are none.
metadata[metadata[['name', 'package']].duplicated()]

Unnamed: 0,package,name,category,name_slug,lastmod


In [23]:
# Count the missing values in each column.
metadata.isna().sum(axis=0)

package      0
name         0
category     0
name_slug    0
lastmod      0
dtype: int64

In [33]:
# Final shape of the table before it is written to disk.
metadata.shape

(7744701, 5)

In [24]:
# Export as CSV without the index column; %time reports how long the write takes.
%time metadata.to_csv('../data/metadata.csv', index=False)

CPU times: user 1min 13s, sys: 2.14 s, total: 1min 15s
Wall time: 1min 16s


In [26]:
# Export the same table in Feather format, timed for comparison.
%time metadata.to_feather('../data/metadata.feather')

CPU times: user 2.92 s, sys: 2.75 s, total: 5.68 s
Wall time: 6.05 s


In [27]:
# Export the same table as Parquet using the pyarrow engine, timed for comparison.
%time metadata.to_parquet('../data/metadata.parquet', engine='pyarrow')

CPU times: user 6.92 s, sys: 1.32 s, total: 8.24 s
Wall time: 8.5 s


In [28]:
# SQLAlchemy engine for a SQLite database file in ../data; echo=False turns off SQL statement logging.
engine = create_engine('sqlite:///../data/apkpure.db', echo=False)

In [31]:
# Write the table to the 'apps' table in batches of 2000 rows, replacing any existing table.
# NOTE(review): index=False is not passed, so with pandas' default the DataFrame index is
# also written as a column, unlike the CSV export -- confirm that this is intended.
%time metadata.to_sql('apps', con=engine, if_exists='replace', chunksize=2000)

CPU times: user 1min 25s, sys: 6.22 s, total: 1min 32s
Wall time: 1min 33s
