In [4]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import timedelta

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Plotting defaults for the notebook.
# `figsize` is presumably a (width, height) tuple meant for matplotlib figures —
# TODO confirm where it is consumed.
figsize = (8, 6)
sns.set_style('whitegrid')

In [5]:
def load_df(path, columns=None, use_threads=True):
    """Read a parquet file and return its contents as a pandas DataFrame.

    Args:
        path: Location of the parquet file; passed to ``pq.read_table``.
        columns: Optional list of column names to read. ``None`` is forwarded
            unchanged to ``pq.read_table``.
        use_threads: Forwarded unchanged to ``pq.read_table``.

    Returns:
        The result of ``Table.to_pandas()`` on the table that was read.

    Raises:
        Exception: Whatever error reading or converting the file raised. The
            message is printed and the original exception is re-raised.
    """
    try:
        return pq.read_table(path, columns=columns, use_threads=use_threads).to_pandas()
    except Exception as e:
        # Keep the printed message, but propagate the failure: silently
        # returning None only resurfaces later as an unrelated AttributeError
        # when the caller tries to use the missing DataFrame.
        print(e)
        raise

def downcast_ints(df):
    """Shrink the integer columns of ``df`` to the smallest unsigned type that fits.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')`` and written back into ``df``.
    The frame is modified in place and the same object is returned.
    """
    int_part = df.select_dtypes(include=['int'])
    # Column-wise conversion; non-integer columns were filtered out above.
    shrunk = int_part.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))
    df[shrunk.columns] = shrunk

    return df
        
def prepare(path,cols):
    """Load the requested columns of a parquet file and tidy the result.

    The frame returned by ``load_df(path, cols)`` gets a fresh default index
    (the old one is dropped) and is then passed through ``downcast_ints``,
    whose return value is handed back to the caller.
    """
    loaded = load_df(path, cols)
    reindexed = loaded.reset_index(drop=True)
    return downcast_ints(reindexed)


In [None]:
# Columns to read from the raw network-details parquet file.
cols = [
    'id', 'sample_id', 'network_type', 'mobile_network_type',
    'mobile_data_status', 'mobile_data_activity', 'roaming_enabled',
    'wifi_status', 'wifi_signal_strength', 'wifi_link_speed',
    'wifi_ap_status', 'network_operator', 'sim_operator', 'mcc', 'mnc',
]
df = prepare('1-parquet-files/network_details.parquet', cols)

# prepare() downcasts integer columns with downcast='unsigned'; these three
# columns are cast to signed 32-bit integers instead.
for int_col in ('id', 'sample_id', 'roaming_enabled'):
    df[int_col] = df[int_col].astype(np.int32)

# Normalise the categorical text columns to upper case. The vectorised
# .str.upper() leaves missing values untouched, whereas calling x.upper() on
# each element raises AttributeError on the first None/NaN it meets.
upper_cols = [
    'network_type', 'mobile_network_type', 'mobile_data_status',
    'mobile_data_activity', 'wifi_status', 'wifi_ap_status',
    'network_operator', 'sim_operator',
]
for text_col in upper_cols:
    df[text_col] = df[text_col].str.upper()

# TODO: a disabled earlier attempt tried to rename what look like numeric codes
# in mobile_network_type: '0' -> 'UNKNOWN', '16' -> 'GSM', '17' -> 'TD_SCDMA',
# '18' -> 'IWLEN', '20' -> 'NR5G'. It tested the bare string literals
# (e.g. `if '0'`), which are always truthy, so it would have labelled every row
# 'UNKNOWN'. Compare against the value itself (or use Series.replace with a
# dict) before enabling such a mapping.

# Give the Bluetooth tethering label a human-readable spelling.
df.loc[df['network_type'] == 'BLUETOOTH_TETHER', 'network_type'] = 'BLUETOOTH TETHERING'

df.info()

In [None]:
# Write the cleaned frame back to disk as parquet.
# NOTE(review): the output directory is spelled '2=datasets' while the input
# directory is '1-parquet-files' — confirm the '=' is intentional.
# NOTE(review): compression is given as the string 'none'; pandas documents
# None for "no compression" — confirm the parquet engine in use accepts this.
df.to_parquet(
    '2=datasets/network_details.parquet',
    compression='none',
)

In [None]:
# Row count per network_type, largest first, limited to the top 20 values.
dfNetworkTypeGB = (
    df.groupby(['network_type'])['network_type']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(20)
)
print(dfNetworkTypeGB)