In [21]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# configure visualizations
# Use seaborn's 'whitegrid' style for every plot drawn after this cell.
sns.set_style('whitegrid')
# Shared (width, height) tuple for figures. No cell visible here reads it —
# TODO confirm it is still needed.
figsize=(8,6)

In [32]:
def load_df(path, columns=None, use_threads=True):
    """Read a Parquet file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the Parquet file, passed straight to ``pq.read_table``.
    columns : list of str, optional
        Subset of columns to read; ``None`` reads every column.
    use_threads : bool, default True
        Forwarded to ``pq.read_table``.

    Returns
    -------
    pandas.DataFrame
        The table converted with ``to_pandas()``.

    Raises
    ------
    Exception
        Any error raised by ``pq.read_table`` or ``to_pandas`` (missing file,
        unknown column, corrupt data, ...) propagates to the caller.
    """
    # Errors are deliberately not caught here: printing the message and
    # returning None only postponed the failure to the first attribute access
    # on the result, where it surfaced as an unrelated AttributeError.
    table = pq.read_table(path, columns=columns, use_threads=use_threads)
    return table.to_pandas()

def downcast_ints(df):
    """Shrink the integer columns of ``df`` to the smallest unsigned dtype.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')`` and written back into ``df``.
    The frame is modified in place and the same object is returned.
    """
    int_part = df.select_dtypes(include=['int'])
    # Column-wise conversion; every other column of df is left untouched.
    shrunk = int_part.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    df[shrunk.columns] = shrunk

    return df
        
def prepare(path,cols):
    """Load selected columns of a Parquet file and tidy the resulting frame.

    The frame returned by ``load_df(path, cols)`` gets a fresh 0-based index
    (the old index is discarded) and is then passed through ``downcast_ints``.
    """
    frame = load_df(path, cols).reset_index(drop=True)
    return downcast_ints(frame)

def get_codename(value):
    """Return the Android codename for a numeric version code.

    ``value`` is converted with ``int()`` and compared against inclusive
    ``(low, high)`` ranges, e.g. 600-611 is "Marshmallow". A code that falls
    in none of the ranges yields "No name". A value that ``int()`` cannot
    convert raises the corresponding ``ValueError`` / ``TypeError``.
    """
    code = int(value)
    # (lowest code, highest code, codename) -- bounds are inclusive.
    ranges = (
        (100, 100, "No Codename"),
        (110, 110, "Petit Four"),
        (150, 150, "Cupcake"),
        (160, 160, "Donut"),
        (200, 210, "Eclair"),
        (220, 223, "Froyo"),
        (230, 237, "Gingerbread"),
        (300, 326, "Honeycomb"),
        (400, 404, "Ice Cream Sandwich"),
        (410, 432, "Jelly Bean"),
        (440, 444, "KitKat"),
        (500, 512, "Lollipop"),
        (600, 611, "Marshmallow"),
        (700, 712, "Nougat"),
        (800, 810, "Oreo"),
        (900, 900, "Pie"),
        (1000, 1000, "Android Q"),
    )
    return next(
        (name for low, high, name in ranges if low <= code <= high),
        "No name",
    )

In [33]:
cols = ['id','model', 'manufacturer', 'brand', 'product', 'os_version', 'kernel_version', 'is_root']
df = prepare('parquet_files/devices.parquet',cols)

# prepare() downcasts integer columns to unsigned types; cast the id back to
# a signed 32-bit integer.
df['id'] = df['id'].astype(np.int32)

# Normalise the free-text device fields to upper case.
for text_col in ['model', 'manufacturer', 'brand', 'product']:
    df[text_col] = df[text_col].str.upper()

# Version strings that are not plain dotted numbers, keyed by their
# dot-stripped form and mapped straight to the numeric code.
SPECIAL_VERSION_NUMS = {
    'Android 60': 600,
    'Marshmallow OS 60': 600,
    '430\\\\n\\\\n\\\\n': 430,
}


def version_to_num(os_version):
    """Turn an ``os_version`` string into the code expected by ``get_codename``.

    The code is ``<major><minor><patch>`` with one digit each for minor and
    patch, so "6.0.1" -> 601, "6.0" -> 600, "6" -> 600 and "10" / "10.0" ->
    1000. Padding is relative to the number of digits in the major component;
    padding by the total string length collapsed "10" and "10.0" onto 100.

    Strings listed in ``SPECIAL_VERSION_NUMS`` return their mapped code.
    Anything else that is not purely numeric is returned unchanged (with dots
    removed) so that ``get_codename`` decides how to handle it.
    """
    digits = os_version.replace('.', '')
    if digits in SPECIAL_VERSION_NUMS:
        return SPECIAL_VERSION_NUMS[digits]
    if not digits.isdecimal():
        return digits
    major, dot, _ = os_version.partition('.')
    if not dot and len(major) > 2:
        # Dotless value with three or more digits: treat it as already encoded.
        return int(digits)
    # Right-pad the missing minor/patch digits with zeros.
    return int(digits.ljust(max(len(major), 1) + 2, '0'))


df['codename'] = df['os_version'].apply(version_to_num).apply(get_codename)

dfx = df.groupby(['codename'])['codename'].count()
print(dfx)
df.info()

codename
Ice Cream Sandwich       19
Jelly Bean              450
KitKat                 1072
Lollipop              10658
Marshmallow           11152
No Codename               1
Nougat                 9786
Oreo                   2546
Pie                      84
Name: codename, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35768 entries, 0 to 35767
Data columns (total 9 columns):
id                35768 non-null int32
model             35768 non-null object
manufacturer      35768 non-null object
brand             35768 non-null object
product           35768 non-null object
os_version        35768 non-null object
kernel_version    35768 non-null object
is_root           35768 non-null object
codename          35768 non-null object
dtypes: int32(1), object(8)
memory usage: 2.3+ MB


In [34]:
# Preview up to the first 100 rows of the cleaned device table.
df.head(100)

Unnamed: 0,id,model,manufacturer,brand,product,os_version,kernel_version,is_root,codename
0,1,VS500PP,LGE,LGE,M1V_PP_VZW,6.0.1,3.10.49-gf18313b-00011-g5e5ed63,0,Marshmallow
1,2,AO5510,YU,YU,YUREKA,5.1.1,3.10.49-cyanogenmod-gbca6118,1,Lollipop
2,3,ASUS_X014D,ASUS,ASUS,WW_PHONE,5.1.1,3.4.0-gdf95949-dirty,0,Lollipop
3,4,NEXUS 5,LGE,GOOGLE,HAMMERHEAD,6.0.1,3.4.0-gcf10b7e,0,Marshmallow
4,5,LG-D331,LGE,LGE,LUV80SS_GLOBAL_COM,4.4.2,3.4.67,0,KitKat
5,6,NEXUS 5,LGE,GOOGLE,HAMMERHEAD,6.0.1,3.4.0-gcf10b7e,0,Marshmallow
6,7,HUAWEI P7-L10,HUAWEI,HUAWEI,P7-L10,5.1.1,3.0.8-00595-gc29cecf,0,Lollipop
7,8,ONE A2003,ONEPLUS,ONEPLUS,ONEPLUS2,6.0.1,3.10.84-perf+,0,Marshmallow
8,9,SM-G903F,SAMSUNG,SAMSUNG,S5NEOLTEXX,6.0.1,3.10.61-10798689,0,Marshmallow
9,10,ALE-L21,HUAWEI,HUAWEI,ALE-L21,6.0,3.10.86-gff4393b,0,Marshmallow


In [35]:
# Persist the cleaned device table as Parquet, requesting no compression.
# NOTE(review): the 'datasets/' directory presumably has to exist already — confirm.
df.to_parquet('datasets/devices.parquet', compression='none') 