In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
# apply seaborn's 'whitegrid' style to the plots drawn after this cell
sns.set_style('whitegrid')
# presumably a (width, height) figure size for later plots; not referenced in the cells visible here
figsize=(8,6)

In [9]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : location of the parquet file, handed to ``pq.read_table``.
    columns : optional list of column names to read; ``None`` reads them all.
    use_threads : forwarded unchanged to ``pq.read_table``.

    Returns
    -------
    The loaded DataFrame, or ``None`` when reading or converting raised an
    exception. The exception is only printed, never re-raised, so callers
    must check for ``None`` themselves.
    """
    try:
        table = pq.read_table(path, columns=columns, use_threads=use_threads)
        frame = table.to_pandas()
    except Exception as err:
        # best-effort load: report the problem and signal failure with None
        print(err)
        return None
    return frame

def downcast_ints(df):
    """Shrink the integer columns of ``df`` to the smallest unsigned dtype.

    The columns picked by ``select_dtypes(include=['int'])`` are each passed
    through ``pd.to_numeric(..., downcast='unsigned')`` and written back.
    All other columns are left alone.

    The frame is modified in place and the same object is returned.
    """
    int_part = df.select_dtypes(include=['int'])
    shrunk = int_part.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    # write the converted columns back over the originals
    df[shrunk.columns] = shrunk
    return df
        
def prepare(path,cols):
    """Load selected columns of a parquet file as a compact DataFrame.

    Parameters
    ----------
    path : location of the parquet file, passed to ``load_df``.
    cols : list of column names to read, passed to ``load_df``.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and its integer columns downcast by
    ``downcast_ints``.

    Raises
    ------
    RuntimeError
        If ``load_df`` could not read the file. ``load_df`` prints the
        underlying error and returns ``None``; without this check the failure
        would surface as an unrelated ``AttributeError`` on ``None``.
    """
    df = load_df(path, cols)
    if df is None:
        raise RuntimeError("could not load parquet file: {!r}".format(path))

    # discard whatever index came with the file and renumber the rows
    df = df.reset_index(drop=True)
    return downcast_ints(df)

# Inclusive (low, high) bounds of the numeric version code -> Android codename.
# Built once at definition time instead of on every get_codename() call.
_CODENAME_RANGES = (
    ((100, 100), "No Codename"),
    ((110, 110), "Petit Four"),
    ((150, 150), "Cupcake"),
    ((160, 160), "Donut"),
    ((200, 210), "Eclair"),
    ((220, 223), "Froyo"),
    ((230, 237), "Gingerbread"),
    ((300, 326), "Honeycomb"),
    ((400, 404), "Ice Cream Sandwich"),
    ((410, 432), "Jelly Bean"),
    ((440, 444), "KitKat"),
    ((500, 512), "Lollipop"),
    ((600, 611), "Marshmallow"),
    ((700, 712), "Nougat"),
    ((800, 810), "Oreo"),
    ((900, 900), "Pie"),
    ((1000, 1000), "Android Q"),
)


def get_codename(value):
    """Map a numeric Android version code to its release codename.

    Parameters
    ----------
    value : anything ``int()`` accepts, e.g. ``601`` or ``"601"`` for 6.0.1.

    Returns
    -------
    str
        The codename whose inclusive range contains ``int(value)``;
        ``"Unknown"`` when ``value`` cannot be converted to an integer
        (empty string, non-numeric text, ``None``); ``"No name"`` when the
        number falls outside every known range.
    """
    # parse once up front instead of once per table entry
    try:
        number = int(value)
    except (TypeError, ValueError):
        return "Unknown"

    for (low, high), name in _CODENAME_RANGES:
        if low <= number <= high:
            return name
    return "No name"

In [10]:
cols = ['id','model', 'manufacturer', 'brand', 'product', 'os_version', 'kernel_version', 'is_root']
df = prepare('1-parquet-files/devices.parquet',cols)

# prepare() downcasts integer columns to unsigned types; store ids as signed 32-bit ints instead
df['id'] = df['id'].astype(np.int32)

# normalise the descriptive text columns to upper case
for text_col in ['model', 'manufacturer', 'brand', 'product']:
    df[text_col] = df[text_col].str.upper()


def version_to_num(os_version):
    """Encode an os_version string as the digit string get_codename() expects.

    Dots and every non-digit character are dropped, and short results are
    right-padded with zeros: '6.0.1' -> '601', '7.0' -> '700', '9' -> '900'.
    A two-digit major version keeps both digits and gets two more for the
    rest, so '10' and '10.0' become '1000' instead of colliding with '1.0'
    ('100'). Returns '' when the text contains no digit at all, which
    get_codename() reports as "Unknown".

    The result is always a str, so no column ever mixes ints and strings.
    """
    major, _, rest = os_version.partition('.')
    major = "".join(ch for ch in major if ch.isdigit())
    rest = "".join(ch for ch in rest if ch.isdigit())

    # Major versions 10..99: only an exactly-two-digit, non-zero-led major is
    # treated this way, so dotless codes such as '601' keep their old meaning.
    if len(major) == 2 and major[0] != '0':
        return major + rest.ljust(2, '0')

    digits = major + rest
    return digits.ljust(3, '0') if digits else digits


# derive the codename directly; no temporary version column is left in the frame
df['codename'] = df['os_version'].apply(lambda v: get_codename(version_to_num(v)))

# sanity check: devices per codename, then the resulting schema
dfx = df.groupby(['codename'])['codename'].count()
print(dfx)
df.info()

codename
Ice Cream Sandwich      209
Jelly Bean             2997
KitKat                 7123
Lollipop              22536
Marshmallow           22918
No name                   9
Nougat                19161
Oreo                  10802
Pie                    1442
Unknown                   3
Name: codename, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87200 entries, 0 to 87199
Data columns (total 9 columns):
id                87200 non-null int32
model             87200 non-null object
manufacturer      87200 non-null object
brand             87200 non-null object
product           87200 non-null object
os_version        87200 non-null object
kernel_version    87200 non-null object
is_root           87200 non-null uint8
codename          87200 non-null object
dtypes: int32(1), object(7), uint8(1)
memory usage: 5.1+ MB


In [11]:
# Preview the first 100 rows of the prepared device table (upper-cased text columns plus the derived codename).
df.head(100)

Unnamed: 0,id,model,manufacturer,brand,product,os_version,kernel_version,is_root,codename
0,1,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,3.10.49-gf18313b-00011-g5e5ed63,0,Marshmallow
1,2,AO5510,YU,YU,YUREKA,5.1.1,3.10.49-cyanogenmod-gbca6118,1,Lollipop
2,3,ASUS_X014D,ASUS,ASUS,WW_PHONE,5.1.1,3.4.0-gdf95949-dirty,0,Lollipop
3,4,NEXUS 5,LGE,GOOGLE,HAMMERHEAD,6.0.1,3.4.0-gcf10b7e,0,Marshmallow
4,5,LG-D331,LGE,LGE,LUV80SS_GLOBAL_COM,4.4.2,3.4.67,0,KitKat
...,...,...,...,...,...,...,...,...,...
95,96,SM-N950F,SAMSUNG,SAMSUNG,GREATLTEXX,7.1.1,4.4.13-12007350,0,Nougat
96,97,A0001,ONEPLUS,ONEPLUS,BACON,7.1.2,3.4.113-lineageos,1,Nougat
97,98,ASUS_Z00AD,ASUS,ASUS,WW_Z00A,6.0.1,3.10.72-x86_64_moor-gb6d574d,0,Marshmallow
98,99,HUAWEI VNS-L31,HUAWEI,HUAWEI,VNS-L31,7.0,4.1.18-g436da75,0,Nougat


In [6]:
# Write the prepared device table to the datasets folder as parquet, with compression='none'.
# NOTE(review): the ArrowTypeError recorded under this cell names a `version_num` column, which the
# preparation cell drops before this point, and this cell's execution count (6) is lower than the
# cells above it (9-11). The traceback looks stale; restart the kernel and run all cells to confirm.
df.to_parquet('2-datasets/devices.parquet', compression='none') 

ArrowTypeError: ("Expected a bytes object, got a 'int' object", 'Conversion failed for column version_num with type object')