In [1]:
import os
import time
from itertools import islice

import numpy as np
import pandas as pd

# Folder holding the exported dataset CSVs. Defined once so the location is changed in one place.
# NOTE: this is an absolute local path; adjust it when running on another machine.
DATA_DIR = r'C:\Users\ruben\Desktop\Investigation\updated_dataset_07_3_2018'

# On/off service flags stored with every sample; the same names are the columns of the services table.
SERVICE_FLAG_COLUMNS = ['bluetooth_enabled', 'location_enabled', 'power_saver_enabled', 'flashlight_enabled',
                        'nfc_enabled', 'unknown_sources', 'developer_mode']

# load samples data (compact dtypes keep the frame small)
samples_dtypes = {'sample_id': np.int64, 'device_id': np.int64, 'battery_state': 'category',
                  'battery_level': np.float32}
samples_dtypes.update({flag: np.int8 for flag in SERVICE_FLAG_COLUMNS})

samples_data = pd.read_csv(os.path.join(DATA_DIR, 'samples.csv'),
                           usecols=['sample_id', 'device_id', 'timestamp', 'battery_state',
                                    'battery_level'] + SERVICE_FLAG_COLUMNS,
                           dtype=samples_dtypes,
                           parse_dates=['timestamp'])

# load processes table
processes_table = pd.read_csv(os.path.join(DATA_DIR, 'new_processes.csv'),
                               usecols=['sample_id', 'application_label'],
                               dtype={'sample_id': np.int64, 'application_label': 'category'})

# create an empty services table
new_services_table = pd.DataFrame(columns=['service_id'] + SERVICE_FLAG_COLUMNS)

# create an empty processes table (one 0/1 column per application label is added on demand)
new_process_table = pd.DataFrame(columns=['process_id'])

# create an empty time table
time_table = pd.DataFrame(columns=['time_id', 'year', 'month', 'day'])

# facts table
facts_table = pd.DataFrame(columns=['device_id', 'service_id', 'process_id', 'time_id', 'period_type',
                                    'rate_per_unit', 'reach_full'])

# Normalise label case. `.str.lower()` returns a plain object column, so cast back to
# 'category' to keep the compact dtype that was requested when reading the CSV.
processes_table['application_label'] = processes_table['application_label'].str.lower().astype('category')
samples_data.head(1)

Unnamed: 0,device_id,timestamp,battery_state,battery_level,sample_id,bluetooth_enabled,location_enabled,power_saver_enabled,flashlight_enabled,nfc_enabled,unknown_sources,developer_mode
0,1,2017-10-08 12:50:04,Discharging,0.82,1,0,1,0,0,0,0,0


In [2]:
# Display the first row of the processes table as a quick check of the loaded columns.
processes_table.head(1)

Unnamed: 0,sample_id,application_label
0,1,greenhub


In [None]:
def buildServiceTable(service_comb, new_services_table):
    """Return the service_id of a service-flag combination, adding it to the table if unseen.

    Parameters
    ----------
    service_comb : sequence of 7 flag values, ordered as bluetooth, location, power saver,
        flashlight, nfc, unknown sources, developer mode.
    new_services_table : DataFrame with a 'service_id' column followed by the seven flag
        columns. It is modified in place when a new combination is registered.

    Returns
    -------
    The id of the matching row, or the freshly assigned id (1 for the first row,
    otherwise the current maximum id plus one).
    """
    flag_columns = ('bluetooth_enabled', 'location_enabled', 'power_saver_enabled',
                    'flashlight_enabled', 'nfc_enabled', 'unknown_sources', 'developer_mode')

    def register(new_id):
        # append under a temporary label, then renumber so the index stays 0..n-1
        new_services_table.loc[-1] = [new_id] + list(service_comb)
        new_services_table.reset_index(inplace=True, drop=True)
        return new_id

    # first combination ever seen: nothing to search
    if new_services_table.empty:
        return register(1)

    # look for a row whose seven flags all equal the requested combination
    conditions = [name + ' == ' + str(service_comb[position])
                  for position, name in enumerate(flag_columns)]
    matching_rows = new_services_table.query(' & '.join(conditions))

    if not matching_rows.empty:
        return matching_rows.iloc[0]['service_id']

    return register(new_services_table['service_id'].max() + 1)

        
def buildTimeTable(time_date, time_table):
    """Return the time_id of the calendar day of `time_date`, adding the day if unseen.

    Parameters
    ----------
    time_date : object exposing `.year`, `.month` and `.day` (e.g. a pandas Timestamp).
    time_table : DataFrame with columns 'time_id', 'year', 'month', 'day'. It is modified
        in place when a new day is registered.

    Returns
    -------
    The id of the existing row for that day, or the newly assigned id (1 for the first
    row, otherwise the current maximum id plus one).
    """
    date_parts = [time_date.year, time_date.month, time_date.day]

    if time_table.empty:
        time_id = 1
    else:
        # search for a row describing the same year/month/day
        lookup = ' & '.join(field + ' == ' + str(value)
                            for field, value in zip(('year', 'month', 'day'), date_parts))
        found = time_table.query(lookup)
        if not found.empty:
            return found.iloc[0]['time_id']

        time_id = time_table['time_id'].max() + 1

    # append under a temporary label, then renumber so the index stays 0..n-1
    time_table.loc[-1] = [time_id] + date_parts
    time_table.reset_index(inplace=True, drop=True)

    return time_id
    

def buildProcessTable(process_comb, new_process_table):
    """Return the process_id of a set of application labels, adding it to the table if unseen.

    The table holds one row per distinct set of applications: a 'process_id' column plus
    one 0/1 column per application label, where 1 means the application belongs to the set.

    Parameters
    ----------
    process_comb : iterable of application labels (duplicates are ignored).
    new_process_table : DataFrame with at least a 'process_id' column. It is modified in
        place: missing label columns are added (0 for every existing row) and a row is
        appended when the combination is new.

    Returns
    -------
    The id of the existing row whose active labels are exactly `process_comb`, or the
    newly assigned id (1 for the first row, otherwise the current maximum id plus one).
    """
    labels = list(dict.fromkeys(process_comb))
    wanted = set(labels)
    known_columns = set(new_process_table.columns)
    missing = [label for label in labels if label not in known_columns]

    # A combination can only already exist when every one of its labels has a column.
    if not missing and not new_process_table.empty:
        app_columns = [column for column in new_process_table.columns if column != 'process_id']
        expected = np.array([column in wanted for column in app_columns], dtype=bool)
        active = new_process_table[app_columns].values == 1
        # a row matches when its set of active labels equals the requested set exactly
        matches = (active == expected).all(axis=1)
        if matches.any():
            return new_process_table.loc[matches, 'process_id'].iloc[0]

    if new_process_table.empty:
        process_id = 1
    else:
        process_id = new_process_table['process_id'].max() + 1

    # Only brand-new labels get a column; existing columns are left untouched so the
    # flags of previously registered rows are preserved.
    for label in missing:
        new_process_table[label] = 0

    new_row = [process_id if column == 'process_id' else int(column in wanted)
               for column in new_process_table.columns]

    # append under a temporary label, then renumber so the index stays 0..n-1
    new_process_table.loc[-1] = new_row
    new_process_table.reset_index(inplace=True, drop=True)

    return process_id
        
        
def computePeriodCombinations(period, processes=None):
    """Group the distinct application sets of a period by service-flag combination.

    Parameters
    ----------
    period : DataFrame with a 'sample_id' column and the seven service flag columns;
        one row per sample.
    processes : optional DataFrame with 'sample_id' and 'application_label' columns.
        Defaults to the notebook-level `processes_table`.

    Returns
    -------
    dict mapping a 7-tuple of flag values (bluetooth, location, power saver, flashlight,
    nfc, unknown sources, developer mode) to a list of sorted application-label lists,
    one entry per distinct list observed with that flag combination.
    """
    if processes is None:
        processes = processes_table

    flag_columns = ('bluetooth_enabled', 'location_enabled', 'power_saver_enabled',
                    'flashlight_enabled', 'nfc_enabled', 'unknown_sources', 'developer_mode')

    # Narrow the processes to this period's samples once, instead of scanning the full
    # table again for every sample.
    period_processes = processes[processes['sample_id'].isin(period['sample_id'])]

    combinations_dict = dict()

    for row_index in period.index:
        service_combination = tuple(period.at[row_index, flag] for flag in flag_columns)

        sample_id = period.at[row_index, 'sample_id']
        sample_labels = period_processes.loc[period_processes['sample_id'] == sample_id, 'application_label']
        # sorted so the same set of applications always yields the same list
        processes_combination = sorted(sample_labels)

        seen_combinations = combinations_dict.setdefault(service_combination, [])
        if processes_combination not in seen_combinations:
            seen_combinations.append(processes_combination)

    return combinations_dict


def computePeriodStats(period, type_of_period, devID):
    """Compute the charge/discharge rate of one period and record it in the facts table.

    Parameters
    ----------
    period : DataFrame of one period's samples, ordered in time, with 'timestamp',
        'battery_level', 'battery_state', 'sample_id' and the service flag columns.
    type_of_period : 1 for a charge period; any other value is treated as discharge.
    devID : id of the device the period belongs to.

    Side effects
    ------------
    Registers the period's day, service combinations and process combinations in the
    notebook-level `time_table`, `new_services_table` and `new_process_table`, and
    appends one row to `facts_table` for every (service combination, process
    combination) pair observed in the period.

    Returns None. Periods with fewer than 5 samples, with equal first and last battery
    level, or with no computable rate are skipped without touching any table.
    """
    # too few samples to estimate a rate
    if len(period) < 5:
        return

    levels = period['battery_level']
    # positional access, so the check does not depend on the index labels
    if levels.iloc[0] == levels.iloc[-1]:
        return

    is_charge = type_of_period == 1
    if is_charge:
        reach_full = 1 if period['battery_state'].iloc[-1] == 'Full' else 0
    else:
        # not applicable to discharge periods
        reach_full = -1

    # time needed per 0.01 of battery_level between consecutive samples
    level_change = levels.diff() * 100
    if not is_charge:
        level_change = level_change.abs()
    time_per_unit = period['timestamp'].diff() / level_change

    # keep only the steps at or below the mean to drop noisy (slow) samples, then average
    typical_step = time_per_unit[time_per_unit <= time_per_unit.mean()].mean()
    if pd.isnull(typical_step):
        return
    # total_seconds() keeps whole days; `.seconds` would silently drop them
    average_per_unit = int(typical_step.total_seconds())

    time_id = buildTimeTable(period['timestamp'].iloc[0], time_table)

    for service_comb, pro_comb_list in computePeriodCombinations(period).items():
        service_id = buildServiceTable(service_comb, new_services_table)

        for pro_comb in pro_comb_list:
            pro_id = buildProcessTable(pro_comb, new_process_table)

            # one fact per observed pair; writing after the loops would keep only the last pair
            facts_table.loc[-1] = [devID, service_id, pro_id, time_id, type_of_period, average_per_unit, reach_full]
            facts_table.reset_index(inplace=True, drop=True)
        

def _prepareStateSamples(state_samples):
    """Drop repeated timestamps, order the samples in time and renumber them from 0."""
    return state_samples.drop_duplicates(subset=['timestamp']).sort_values(by='timestamp').reset_index(drop=True)


def _computeMonotonicPeriods(samples, type_of_period, devID):
    """Split time-ordered samples into monotonic periods and compute stats for each one.

    A charge run (type_of_period == 1) is cut where the battery level drops below the
    previous sample; a discharge run is cut where it rises above it.
    """
    if samples.empty:
        return

    def submit(first, stop):
        # within a period keep only the latest sample of each battery level
        period = samples[first:stop].drop_duplicates(subset=['battery_level'], keep='last').reset_index(drop=True)
        computePeriodStats(period, type_of_period, devID)

    levels = samples['battery_level']
    last_position = len(samples) - 1
    period_start = 0

    for position in range(1, len(samples)):
        current, previous = levels.iat[position], levels.iat[position - 1]
        direction_broken = current < previous if type_of_period == 1 else current > previous

        if direction_broken:
            submit(period_start, position)
            period_start = position

        # flush whatever remains once the last sample is reached
        if position == last_position:
            submit(period_start, position + 1)


def computePeriods(devID):
    """Extract the charge and discharge periods of one device and record their stats.

    Reads the notebook-level `samples_data`. Samples in state 'Charging' or 'Full' form
    the charge runs (period type 1) and samples in state 'Discharging' form the discharge
    runs (period type 0). Every period is handed to `computePeriodStats`. Returns None;
    a device without samples is ignored.
    """
    device_samples = samples_data.loc[samples_data['device_id'] == devID]

    # check if the device has samples
    if device_samples.empty:
        return

    device_charge_samples = _prepareStateSamples(
        device_samples.loc[device_samples['battery_state'].isin(['Charging', 'Full'])])
    device_discharge_samples = _prepareStateSamples(
        device_samples.loc[device_samples['battery_state'] == 'Discharging'])

    _computeMonotonicPeriods(device_charge_samples, 1, devID)
    _computeMonotonicPeriods(device_discharge_samples, 0, devID)




# Run the period extraction for every device id present in the samples.
# (To debug a single device, call computePeriods(<device_id>) directly instead of looping.)
device_ids = samples_data['device_id'].unique()

for devID in device_ids:
    # echo the device id as a progress marker before processing it
    print(devID)
    computePeriods(devID)

print('done!')

1
4
8
9
15
16
20
22
24
26
27
28
29
32
35
34
42
48
49
52
54
12
31
55
57
