In [1]:
# data analysis and wrangling
import gc
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
# 'whitegrid' is the seaborn style applied to plots drawn after this call.
sns.set_style('whitegrid')
# Shared figure-size tuple; presumably passed as figsize= to later plots
# (it is not used in the cells shown here — confirm before removing).
figsize=(8,6)

In [2]:
def load_df(path, cols, use_threads=True):
    """Read selected columns of a parquet file into a pandas DataFrame.

    Args:
        path: location of the parquet file, handed to ``pq.read_table``.
        cols: names of the columns to read.
        use_threads: forwarded unchanged to ``pq.read_table``.

    Returns:
        The table converted with ``to_pandas()``.

    Raises:
        Exception: whatever the read or the conversion raised; it is reported
            on stdout and then re-raised unchanged.
    """
    try:
        return pq.read_table(path, columns=cols, use_threads=use_threads).to_pandas()
    except Exception as e:
        # Say which file failed, then propagate. Swallowing the error and
        # returning None only postpones the failure to the caller's first
        # attribute access, where the real cause is no longer visible.
        print('Failed to load {!r}: {}'.format(path, e))
        raise

def downcast_ints(df):
    """Shrink the integer columns of ``df`` and hand the same frame back.

    The columns picked by ``select_dtypes(include=['int'])`` are each passed
    through ``pd.to_numeric`` with ``downcast='unsigned'`` and written back
    into ``df``; all other columns are left alone. ``df`` is modified in place.
    """
    int_cols = df.select_dtypes(include=['int']).columns
    df[int_cols] = df[int_cols].apply(
        lambda column: pd.to_numeric(column, downcast='unsigned')
    )
    return df
        
def prepare(path, cols):
    """Load ``cols`` from the parquet file at ``path`` as a compact DataFrame.

    The frame comes from ``load_df``, gets a fresh default index (the loaded
    one is dropped) and is then passed through ``downcast_ints``.
    """
    loaded = load_df(path, cols)
    return downcast_ints(loaded.reset_index(drop=True))


In [3]:
# Read only the two columns this notebook works with.
cols = ['sample_id', 'name']
df = prepare('1-parquet-files/top15-apps.parquet',cols)

# Show dtypes and memory use after the integer downcast applied inside prepare().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159935697 entries, 0 to 159935696
Data columns (total 2 columns):
sample_id    uint32
name         object
dtypes: object(1), uint32(1)
memory usage: 1.8+ GB


In [4]:
# Preview the first 100 rows; some names carry a ':'-separated suffix (e.g. ':main').
df.head(100)

Unnamed: 0,sample_id,name
0,1,com.google.android.music:main
1,1,com.facebook.katana:videoplayer
2,1,com.facebook.orca:videoplayer
3,1,com.facebook.katana
4,1,com.facebook.orca
...,...,...
95,12,com.google.android.googlequicksearchbox:intera...
96,13,com.google.android.music:main
97,13,com.facebook.katana:videoplayer
98,13,com.facebook.orca:videoplayer


In [5]:
def split_rows(frame, n_chunks):
    """Cut ``frame`` into ``n_chunks`` consecutive row slices.

    Chunk sizes follow ``np.array_split``: with ``len(frame) = q * n_chunks + r``
    the first ``r`` chunks hold ``q + 1`` rows and the rest hold ``q``.
    Plain ``iloc`` slicing is used because ``np.array_split`` on a DataFrame
    goes through ``DataFrame.swapaxes``, which pandas deprecated in 2.1.

    Returns:
        A list of ``n_chunks`` DataFrames, in row order.
    """
    base, extra = divmod(len(frame), n_chunks)
    chunks = []
    start = 0
    for position in range(n_chunks):
        stop = start + base + (1 if position < extra else 0)
        chunks.append(frame.iloc[start:stop])
        start = stop
    return chunks


df_split = split_rows(df, 4)

In [6]:
def split_process_name(chunk):
    """Return a copy of ``chunk`` with 'name' split at its first ':'.

    The text before the ':' stays in 'name' and the remainder goes to a new
    'sub' column, which is missing for names that contain no ':'.
    """
    # reindex guarantees both result columns exist: str.split(expand=True)
    # yields a single column when no name in the chunk contains ':', and a
    # two-column assignment would then fail.
    parts = chunk['name'].str.split(':', n=1, expand=True).reindex(columns=[0, 1])
    # assign() builds a new frame instead of writing into a slice of df.
    return chunk.assign(name=parts[0], sub=parts[1])


def export_chunks(chunks, out_path, n_chunks=1):
    """Split the names of the next ``n_chunks`` frames and write them to one file.

    Frames are always taken from the head of ``chunks`` (removing an element
    shifts the remaining ones down), so ``chunks`` shrinks by ``n_chunks``.

    Args:
        chunks: list of DataFrames with a 'name' column; consumed from the front.
        out_path: parquet file to write, uncompressed.
        n_chunks: how many frames to combine into this file.

    Returns:
        The value of ``gc.collect()`` run after the written frames are released.
    """
    pieces = [split_process_name(chunks.pop(0)) for _ in range(n_chunks)]
    combined = pieces[0] if len(pieces) == 1 else pd.concat(pieces)
    combined.to_parquet(out_path, compression='none')
    # Drop the processed frames before collecting so their memory can be
    # reclaimed ahead of the next export.
    del pieces, combined
    return gc.collect()


# Chunks 0 and 1 go to file A, chunk 2 to file B, chunk 3 to file C.
export_chunks(df_split, '1-parquet-files/top15AppsA.parquet', n_chunks=2)
export_chunks(df_split, '1-parquet-files/top15AppsB.parquet')
export_chunks(df_split, '1-parquet-files/top15AppsC.parquet')

14

In [None]:
# Reload the three exported files, now including the 'sub' column.
# Note: this rebinds the notebook-level `cols` used by the earlier load cell.
cols = ['sample_id', 'name', 'sub']
dfA = prepare('1-parquet-files/top15AppsA.parquet',cols)
dfB = prepare('1-parquet-files/top15AppsB.parquet',cols)
dfC = prepare('1-parquet-files/top15AppsC.parquet',cols)