In [1]:
# data analysis and wrangling
import gc
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import re
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
sns.set_style('whitegrid')
figsize=(8,6)

In [2]:
def load_df(path, cols, use_threads=True):
    try:
        return pq.read_table(path, columns=cols, use_threads=use_threads).to_pandas()
    except Exception as e:
        print(e)

def downcast_ints(df):
    # downcast int types
    df_int = df.select_dtypes(include=['int'])
    converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')
    df[converted_int.columns] = converted_int
    
    return df
        
def prepare(path,cols,dtypes):

    df = load_df(path,cols)
    df = df.astype(dtypes)
    df = df.reset_index(drop=True)
    df = downcast_ints(df)
    
    return df

def get_label(value):
    labels_map = {"googlequicksearchbox" : "google",
                "music" : "google play music",
                "katana" : "facebook",
                "latin" : "gboard",
                "orca" : "messenger",
                "chrome" : "chrome",
                "android" : "instagram",
                "whatsapp" : "whatsapp",
                "gps" : "shareit",
                "talk" : "hangouts",
                "lite" : "fb lite",
                "vidmate" : "vidmate",
                "youtube" : "youtube",
                "maps" : "maps",
                "pop" : "es file explorer"}
    for k,v in labels_map.items():
        try:
            if re.match(r".+"+k+"$",value) :
                return v
        except ValueError as e:
            raise e
    return "No label"

In [3]:
cols = ['sample_id', 'name']
dtypes = {'sample_id' : 'uint32', 'name' : 'category'}
df = prepare('1-parquet-files/top15-apps.parquet',cols,dtypes)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159935697 entries, 0 to 159935696
Data columns (total 2 columns):
sample_id    uint32
name         category
dtypes: category(1), uint32(1)
memory usage: 762.6 MB


In [4]:
df.head(100)

Unnamed: 0,sample_id,name
0,1,com.google.android.music:main
1,1,com.facebook.katana:videoplayer
2,1,com.facebook.orca:videoplayer
3,1,com.facebook.katana
4,1,com.facebook.orca
...,...,...
95,12,com.google.android.googlequicksearchbox:intera...
96,13,com.google.android.music:main
97,13,com.facebook.katana:videoplayer
98,13,com.facebook.orca:videoplayer


In [5]:
df[['name', 'sub']] = df['name'].str.split(':', n=1, expand=True).astype('category')
df = df.reset_index(drop=True)


In [10]:
df['label'] = df['name'].apply(lambda x: get_label(x)).astype('category')


In [11]:
df['sub'] = df['sub'].mask(cond = df['sub'].isnull(), other = 'core_app').astype('category')



To preserve the current behavior, add the new categories to the categorical before calling 'where', or convert the categorical to a different dtype.
  errors=errors,


In [12]:
#df.head(100)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159935697 entries, 0 to 159935696
Data columns (total 4 columns):
sample_id    uint32
name         category
sub          category
label        category
dtypes: category(3), uint32(1)
memory usage: 1.0 GB


In [13]:
size = df.shape[0] // 2

table = pa.Table.from_pandas(df[:size], nthreads=4)

pqwriter = pq.ParquetWriter('2-datasets/top15Apps.parquet', table.schema)
pqwriter.write_table(table)

table = pa.Table.from_pandas(df[size:], nthreads=4)
pqwriter.write_table(table)

if pqwriter:
    pqwriter.close()