# Auto-caption
Date: 2018/12/14

Purpose: swarm name matching using the data below

Data source:
auto_caption4.csv
auto_caption5.csv
auto_caption7.csv
auto_caption8.csv
auto_caption9.csv
auto_caption10.csv
auto_caption11.csv

In [1]:
# system
import os
import sys
# 3rd party lib
import pandas as pd
from sklearn.cluster import KMeans
from fuzzywuzzy import fuzz # stirng matching

print('Python verison: {}'.format(sys.version))
print('\n############################')
print('Pandas verison: {}'.format(pd.show_versions()))

Python verison: 3.6.7 (default, Oct 21 2018, 04:56:05) 
[GCC 5.4.0 20160609]

############################

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.7.final.0
python-bits: 64
OS: Linux
OS-release: 4.15.0-36-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.23.4
pytest: None
pip: 18.1
setuptools: 20.7.0
Cython: None
numpy: 1.14.5
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.5.0
sphinx: None
patsy: None
dateutil: 2.7.3
pytz: 2018.5
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 3.0.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: 0.7.3
lxml: None
bs4: None
html5lib: 1.0.1
sqlalchemy: 1.2.10
pymysql: None
psycopg2: None
jinja2: 2.8
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
Pandas verison: None


### Read file 

In [2]:
standard_df = pd.read_csv('auto_caption1.csv')
column_list = ["timestamp", "event", "duration", 
               "deviceId", "copyKind", "payload", 
               "bandwidth", "pkt_src", "pkt_dst",
               "pid", "tid", "function_name", "category", 
               "feature_types", "mem_addr", "quotient",
               "cycles", "instructions", "cache-misses", "branch-misses", 
               "cluster_ID"]
standard_df.columns = column_list
standard_df.head(5)

Unnamed: 0,timestamp,event,duration,deviceId,copyKind,payload,bandwidth,pkt_src,pkt_dst,pid,...,function_name,category,feature_types,mem_addr,quotient,cycles,instructions,cache-misses,branch-misses,cluster_ID
0,1545293000.0,14.624439,0.007788,-1,-1,0,0,-1,-1,32210,...,_int_malloc,0,instructions,7f09330046d6,154529314559,0,0,0,1,0
1,1545293000.0,14.624439,0.195594,-1,-1,0,0,-1,-1,32210,...,_int_free,0,cycles,7f0933002430,154529314560,0,1,0,0,0
2,1545293000.0,14.624439,0.04041,-1,-1,0,0,-1,-1,32210,...,_int_malloc,0,instructions,7f0933003faa,154529314572,0,0,0,1,0
3,1545293000.0,14.624439,0.034722,-1,-1,0,0,-1,-1,32210,...,free,0,cycles,7f093300651a,154529314572,0,1,0,0,0
4,1545293000.0,14.624439,9.3e-05,-1,-1,0,0,-1,-1,32210,...,malloc,0,branch-misses,7f093300619c,154529314573,0,0,1,0,0


In [3]:
print('There are {} clusters in standard_df\n'.format(len(standard_df['cluster_ID'].unique())))

There are 15 clusters in standard_df



In [4]:
# default is axis=0
standard_df_groupby = standard_df.groupby(['cluster_ID','function_name']).agg({'function_name':['count']})
standard_df_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,function_name
Unnamed: 0_level_1,Unnamed: 1_level_1,count
cluster_ID,function_name,Unnamed: 2_level_2
0,_IO_default_xsputn,1
0,_IO_sputbackc,1
0,_IO_vfscanf,2
0,_IO_vsnprintf,1
0,__libc_calloc,9
0,__memcpy_sse2,1
0,__memset_sse2,1
0,__strchr_sse2,1
0,__strcmp_sse2_unaligned,29
0,__strcpy_sse2_unaligned,1


#### Access data of multiIndex dataframe
* [pandas, how to access multiIndex dataframe?](https://stackoverflow.com/questions/36806517/pandas-how-to-access-multiindex-dataframe)

In [5]:
# get column names
df = standard_df_groupby.loc[0].reset_index() 
flat_column_names = []
for level in df.columns:        
    # tuple to list    
    flat_column_names.extend(list(level)) # extend(): in-place
    
# remove duplicate and empty
flat_column_names = filter(None, flat_column_names) # filter empty
flat_column_names = list(set(flat_column_names)) # deduplicate
print('original order: {}'.format(flat_column_names))

# change member order of list due to set is a random order
if flat_column_names[0] == 'count':
    myorder = [1,0]
    flat_column_names = [flat_column_names[i] for i in myorder]
    print('New order: {}'.format(flat_column_names))

original order: ['count', 'function_name']
New order: ['function_name', 'count']


In [6]:
standard_df_dict = {}

# Transform multi-index to single index, and update string to dict standard_df_dict
for id_of_cluster in standard_df['cluster_ID'].unique():
    print('\n# of cluster: {}'.format(id_of_cluster))
    df = standard_df_groupby.loc[id_of_cluster].reset_index()
    df.columns = flat_column_names
    print(df.sort_values(by=['count'], ascending=False)) # pd.DataFrame.sort_values() return a DataFrame
    
    standard_df_dict.update({id_of_cluster: df.function_name.str.cat(sep='  ', na_rep='?')})    

print('################################')
print('\nDictionary of swarm data: \n{}'.format(standard_df_dict))


# of cluster: 0
               function_name  count
13               _int_malloc    148
19                    malloc    133
20        malloc_consolidate    105
12                 _int_free     87
8    __strcmp_sse2_unaligned     29
16                      free     12
23                    strlen     10
4              __libc_calloc      9
21                    memchr      3
11   __strstr_sse2_unaligned      3
15         arena_get2.part.3      2
2                _IO_vfscanf      2
22                   realloc      1
3              _IO_vsnprintf      1
18                    getpid      1
17             get_free_list      1
7              __strchr_sse2      1
6              __memset_sse2      1
14              _int_realloc      1
5              __memcpy_sse2      1
1              _IO_sputbackc      1
10  __strncpy_sse2_unaligned      1
9    __strcpy_sse2_unaligned      1
0         _IO_default_xsputn      1

# of cluster: 1
                                         function_name  count
0   

### Dataframe that i want to match

In [7]:
matching_df1 = pd.read_csv('auto_caption4.csv', names=column_list)
matching_df1.head(5)

Unnamed: 0,timestamp,event,duration,deviceId,copyKind,payload,bandwidth,pkt_src,pkt_dst,pid,...,function_name,category,feature_types,mem_addr,quotient,cycles,instructions,cache-misses,branch-misses,cluster_ID
0,1545293000.0,7.352002,0.000101,-1,-1,0,0,-1,-1,966,...,_PyEval_EvalFrameDefault,0,branch-misses,572e18,154529326054,0,1,0,0,0
1,1545293000.0,7.352006,0.026575,-1,-1,0,0,-1,-1,966,...,_PyEval_EvalFrameDefault,0,cycles,572eef,154529326057,0,0,0,1,0
2,1545293000.0,7.352012,0.042297,-1,-1,0,0,-1,-1,966,...,_PyEval_EvalFrameDefault,0,instructions,573007,154529326060,0,0,1,0,0
3,1545293000.0,7.352011,7.3e-05,-1,-1,0,0,-1,-1,966,...,_PyEval_EvalFrameDefault,0,branch-misses,572fd9,154529326073,0,1,0,0,0
4,1545293000.0,7.352009,9.5e-05,-1,-1,0,0,-1,-1,966,...,_PyEval_EvalFrameDefault,0,branch-misses,572f82,154529326073,0,1,0,0,0


In [8]:
# default is axis=0
matching_df1_groupby = matching_df1.groupby(['cluster_ID','function_name']).agg({'function_name':['count']})
print(matching_df1.groupby(['cluster_ID','function_name']).agg({'function_name':['count']}))

                                               function_name
                                                       count
cluster_ID function_name                                    
0          _PyEval_EvalFrameDefault                      210
1          PyEval_EvalCodeEx                               3
           [unknown](/usr/bin/python3.6                   53
2          __accumulate_pelt_segments                      1
           __calc_delta                                   10
           __update_idle_core                              1
           __update_load_avg_se.isra.38                    2
           account_entity_dequeue                          2
           account_entity_enqueue                          1
           account_system_index_time                       1
           check_cfs_rq_runtime                            1
           check_preempt_curr                              1
           clear_buddies                                   1
           cpuacct_charg

In [9]:
# get column names
df = matching_df1_groupby.loc[0].reset_index() 
flat_column_names = []
for level in df.columns:        
    # tuple to list    
    flat_column_names.extend(list(level)) # extend(): in-place

# remove duplicate and empty
flat_column_names = filter(None, flat_column_names) # filter empty
flat_column_names = list(set(flat_column_names)) # deduplicate
print(flat_column_names)

# change member order of list due to set is a random order
if flat_column_names[0] == 'count':
    myorder = [1,0]
    flat_column_names = [flat_column_names[i] for i in myorder]
    print('New order: {}'.format(flat_column_names))

['count', 'function_name']
New order: ['function_name', 'count']


In [10]:
matching_df1_dict = {}

# Transform multi-index to single index, and update string to dict standard_df_dict
for id_of_cluster in matching_df1['cluster_ID'].unique():
    print('\n# of cluster: {}'.format(id_of_cluster))
    df = matching_df1_groupby.loc[id_of_cluster].reset_index() 
    df.columns = flat_column_names
    print(df.sort_values(by=['count'], ascending=False))
    
    matching_df1_dict.update({id_of_cluster: df.function_name.str.cat(sep='  ', na_rep='?')})    

print('################################')
print('\nDictionary of swarm data: \n{}'.format(matching_df1_dict))


# of cluster: 0
              function_name  count
0  _PyEval_EvalFrameDefault    210

# of cluster: 1
                  function_name  count
1  [unknown](/usr/bin/python3.6     53
0             PyEval_EvalCodeEx      3

# of cluster: 2
                   function_name  count
17            finish_task_switch     94
22           pick_next_task_fair     26
36               sys_sched_yield     18
43                   update_curr     13
1                   __calc_delta     10
30                 schedule_tail      9
29             sched_setaffinity      9
10                cpuacct_charge      8
21              pick_next_entity      8
48               yield_task_fair      8
44               update_load_avg      4
45           update_min_vruntime      4
40       update_blocked_averages      3
11                dequeue_entity      3
38                try_to_wake_up      3
41              update_cfs_group      3
12             dequeue_task_fair      3
32           select_task_rq_fair      2
19

### string matching funciton
* 1-to-1 matching (or mapping)
* Github of fuzzywuzzy: [link](https://github.com/seatgeek/fuzzywuzzy)
* Search keyword: You can try 'fuzzywuzzy' + 'pandas'

In [11]:
def matching_two_dicts_of_swarm(standard_dict, matching_dict, res_dict):    
    """    
    match two dictoinaries with same amount of key-value pairs
    and return matching result, a dict of dict called res_dict.
    
    * standard_dict: The standard of dict
    * matching_dict: The dict that i want to match
    * res_dict: the result, a dict of dict
    """
    key = 0 # key: number, no string    
    pop_list = [k for k,v in matching_dict.items()]
    print(pop_list)
    for i in standard_dict.keys(): # control access index of standard_dict. a more pythonic way
        threshold = 0
        for j in pop_list: # control access index of matching_dict
            f_ratio = fuzz.ratio(standard_dict[i], matching_dict[j])
            if f_ratio > threshold: # update matching result only when the fuzz ratio is greater
                print('New matching fuzz ratio {} is higher than threshold {}'\
                      .format(f_ratio, threshold))
                key = j # update key
                threshold = f_ratio # update threshold value
                print('Update new threshold {}'\
                      .format(threshold))                
                res_dict.update({i: {j: matching_dict[i]}}) #         
        # pop out matched key-value pair of matching dict
        if pop_list:
            pop_list.remove(key) # remove specific value. remove() fails when no elements remains
        print(res_dict)
    return res_dict

In [12]:
res_dict = {}
res_dict = matching_two_dicts_of_swarm(standard_df_dict, matching_df1_dict, res_dict)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
New matching fuzz ratio 8 is higher than threshold 0
Update new threshold 8
New matching fuzz ratio 12 is higher than threshold 8
Update new threshold 12
New matching fuzz ratio 33 is higher than threshold 12
Update new threshold 33
New matching fuzz ratio 35 is higher than threshold 33
Update new threshold 35
New matching fuzz ratio 44 is higher than threshold 35
Update new threshold 44
{0: {6: '_PyEval_EvalFrameDefault'}}
New matching fuzz ratio 7 is higher than threshold 0
Update new threshold 7
New matching fuzz ratio 38 is higher than threshold 7
Update new threshold 38
{0: {6: '_PyEval_EvalFrameDefault'}, 1: {4: 'PyEval_EvalCodeEx  [unknown](/usr/bin/python3.6'}}
New matching fuzz ratio 5 is higher than threshold 0
Update new threshold 5
New matching fuzz ratio 29 is higher than threshold 5
Update new threshold 29
{0: {6: '_PyEval_EvalFrameDefault'}, 1: {4: 'PyEval_EvalCodeEx  [unknown](/usr/bin/python3.6'}, 2: {11: '__accumulate

In [13]:
print(res_dict)

{0: {6: '_PyEval_EvalFrameDefault'}, 1: {4: 'PyEval_EvalCodeEx  [unknown](/usr/bin/python3.6'}, 2: {11: '__accumulate_pelt_segments  __calc_delta  __update_idle_core  __update_load_avg_se.isra.38  account_entity_dequeue  account_entity_enqueue  account_system_index_time  check_cfs_rq_runtime  check_preempt_curr  clear_buddies  cpuacct_charge  dequeue_entity  dequeue_task_fair  down_read_trylock  enqueue_entity  enqueue_task_fair  exit_signals  finish_task_switch  get_nohz_timer_target  idle_cpu  load_balance  pick_next_entity  pick_next_task_fair  pick_next_task_idle  prepare_creds  put_prev_entity  put_prev_task_fair  reweight_entity  rwsem_spin_on_owner  sched_setaffinity  schedule_tail  select_idle_sibling  select_task_rq_fair  set_cpus_allowed_common  set_curr_task_fair  set_next_entity  sys_sched_yield  trigger_load_balance  try_to_wake_up  up_read  update_blocked_averages  update_cfs_group  update_cfs_rq_h_load  update_curr  update_load_avg  update_min_vruntime  update_rq_clock  

### show all stats (Ans) and matching results (algorithm)

In [14]:
std_dict_to_df = pd.DataFrame.from_dict(standard_df_dict, orient='index', columns=['Before: function_name'])
std_dict_to_df['std_cluster_ID'] = std_dict_to_df.index
std_dict_to_df = std_dict_to_df[['std_cluster_ID', 'Before: function_name']]
std_dict_to_df

Unnamed: 0,std_cluster_ID,Before: function_name
0,0,_IO_default_xsputn _IO_sputbackc _IO_vfscanf...
1,1,decode_mcu decompress_onepass get_sof googl...
2,2,Eigen::NonBlockingThreadPoolTempl<tensorflow::...
3,3,[unknown](/usr/bin/python3.6 _PyEval_EvalFram...
4,4,PyDict_New PyDict_SetDefault PyDict_SetItem ...
5,5,__lock_text_start __schedule _cond_resched ...
6,6,__calc_delta __enqueue_entity __init_waitque...
7,7,apic_timer_interrupt entry_SYSCALL_64_after_h...
8,8,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...
9,9,Eigen::internal::TensorExecutor<Eigen::TensorA...


In [15]:
mtch_df1_dict_to_df = pd.DataFrame.from_dict(matching_df1_dict, orient='index', columns=['Matching function_name'])
mtch_df1_dict_to_df

Unnamed: 0,Matching function_name
0,_PyEval_EvalFrameDefault
1,PyEval_EvalCodeEx [unknown](/usr/bin/python3.6
2,__accumulate_pelt_segments __calc_delta __up...
3,_IO_link_in _IO_vsprintf _int_free _int_mal...
4,decode_mcu decompress_onepass jpeg_make_d_de...
5,PyDict_New PyDict_SetDefault PyDict_SetItem ...
6,__lock_text_start __schedule _cond_resched ...
7,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...
8,call_function_interrupt entry_SYSCALL_64_afte...
9,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...


In [16]:
res_dict_to_df = pd.DataFrame() # create an empty frame
res_dict_to_df

In [17]:
res_list = [k for k,v in res_dict.items()]
for key in res_list:
    df = pd.DataFrame.from_dict(res_dict[key], orient='index', columns=['After: funciton name']) # res_dict[key]: a dict
    df['mtch_cluster_ID'] = df.index
    #print(df)
    res_dict_to_df = res_dict_to_df.append(df, ignore_index=True) # df.append(): not in-place

res_dict_to_df = res_dict_to_df[['mtch_cluster_ID', 'After: funciton name']]
print(res_dict_to_df.head(5))

   mtch_cluster_ID                               After: funciton name
0                6                           _PyEval_EvalFrameDefault
1                4    PyEval_EvalCodeEx  [unknown](/usr/bin/python3.6
2               11  __accumulate_pelt_segments  __calc_delta  __up...
3                0  _IO_link_in  _IO_vsprintf  _int_free  _int_mal...
4                5  decode_mcu  decompress_onepass  jpeg_make_d_de...


In [18]:
final_df = pd.concat([std_dict_to_df, res_dict_to_df], axis=1)
final_df

Unnamed: 0,std_cluster_ID,Before: function_name,mtch_cluster_ID,After: funciton name
0,0,_IO_default_xsputn _IO_sputbackc _IO_vfscanf...,6,_PyEval_EvalFrameDefault
1,1,decode_mcu decompress_onepass get_sof googl...,4,PyEval_EvalCodeEx [unknown](/usr/bin/python3.6
2,2,Eigen::NonBlockingThreadPoolTempl<tensorflow::...,11,__accumulate_pelt_segments __calc_delta __up...
3,3,[unknown](/usr/bin/python3.6 _PyEval_EvalFram...,0,_IO_link_in _IO_vsprintf _int_free _int_mal...
4,4,PyDict_New PyDict_SetDefault PyDict_SetItem ...,5,decode_mcu decompress_onepass jpeg_make_d_de...
5,5,__lock_text_start __schedule _cond_resched ...,14,PyDict_New PyDict_SetDefault PyDict_SetItem ...
6,6,__calc_delta __enqueue_entity __init_waitque...,2,__lock_text_start __schedule _cond_resched ...
7,7,apic_timer_interrupt entry_SYSCALL_64_after_h...,8,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...
8,8,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...,7,call_function_interrupt entry_SYSCALL_64_afte...
9,9,Eigen::internal::TensorExecutor<Eigen::TensorA...,3,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...


### Evaluation
Evaluate matching result of single run, then all different runs

data source:
* standard dataframe: standard_df
* dataframe that I want to match: matching_df1

Note:
pandas.DataFrame.merge returns DataFrame

In [19]:
def evaluation_of_matching_result(standard_df, matching_df1, final_df, eval_list, tmp_dict):
    """
    calculate intersection rate of two dataframe        
    intersection rate = num_t_stdswarm  / total_num_t_mtchswarm 
    num_t_stdswarm: traces in standard swarm
    total_num_t_mtchswarm: total traces number in matching swarm
    """        
    std_duration_list = []
    mtch_duration_list = [] 
    diff_list = []
    
    # calculate num_t_stdswarm & total_num_t_mtchswarm
    for id_of_cluster in final_df.index:                                        
        std_id = final_df['std_cluster_ID'].loc[id_of_cluster]
        print('\n# of cluster in standard cluster: {}\n'.format(std_id))             
        
        std_df = standard_df.groupby(['cluster_ID','function_name'])\
                        .agg({'function_name':['count']})\
                        .loc[std_id]\
                        .reset_index()                
        std_flat_column_names = ['std_func_name', 'count']
        std_df.columns = std_flat_column_names
        # sum up duration time
        std_total_duration = standard_df['duration'].loc[standard_df['cluster_ID'] == id_of_cluster].sum()        
        print('std_total_duration = {} sec'.format(std_total_duration))
        print('Function name in cluster: \n{}\n'.format(std_df.sort_values(by=['count'], ascending=False)))                
        
        # total_num_t_mtchswarm
        mtch_id = final_df['mtch_cluster_ID'].loc[id_of_cluster]            
        print('\n# of cluster in matching cluster: {}'.format(mtch_id))
        
        mtch_df = matching_df1.groupby(['cluster_ID','function_name'])\
                        .agg({'function_name':['count']})\
                        .loc[mtch_id]\
                        .reset_index() 
        mtch_flat_column_names = ['mtch_func_name', 'count']
        mtch_df.columns = mtch_flat_column_names
        # sum up duration time
        mtch_total_duration = matching_df1['duration'].loc[matching_df1['cluster_ID'] == id_of_cluster].sum()
        
        total_num_t_mtchswarm = mtch_df['count'].sum()
        print('mtch_total_duration = {} sec'.format(mtch_total_duration))
        print('Function name in cluster: \n{}\n'.format(mtch_df.sort_values(by=['count'], ascending=False)))
        print('---------------------------------------------------------')
        print('Total number of function name in cluster: {}'.format(total_num_t_mtchswarm))                
        
        # add total duration of each cluster
        std_duration_list.append(std_total_duration)
        mtch_duration_list.append(mtch_total_duration)
        diff_list.append(abs(std_total_duration - mtch_total_duration))
        
        # To calculate num_t_stdswarm, get intersection of two cluster first        
        intersected_df = std_df.merge(mtch_df, left_on='std_func_name', right_on='mtch_func_name', how='outer')        
        intersected_df.dropna(inplace=True) # drop row with NaN value and inplace
        intersected_df['min_value'] = intersected_df.min(axis=1)
        num_t_stdswarm = intersected_df['min_value'].sum()
        intersect_percent = num_t_stdswarm * 100 / float(total_num_t_mtchswarm) # float number                
        
        if(intersect_percent != 0.0): 
            eval_list.append(intersect_percent)
               
        print('merge frame:\n {}\n'.format(intersected_df))                        
        print('num_t_stdswarm = {}'.format(num_t_stdswarm))        
        print('intersection rate = (num_t_stdswarm / total_num_t_mtchswarm) x 100% = {}%'.format(intersect_percent))        
        print('---------------------------------------------------------')
        #break; # test only one cluster                    
        
    # How many cluster match correctly
    intersect_percent = len(eval_list) * 100.0 / len(standard_df['cluster_ID'].unique())
    print('Number of intersection rate > 0% percent: {}%'.format(intersect_percent)) #    
    
    # deal with duration time of each cluster among two dataframes
    tmp_dict = {'std_duration(sec)': std_duration_list, 'mtch_duration(sec)': mtch_duration_list, 'cluster_diff(sec)': diff_list}
    tmp_df = pd.DataFrame.from_dict(tmp_dict) # dummy dataframe, just for concatenation
    final_df = pd.concat([final_df, tmp_df], axis=1, sort=False)  # axis=1: horizontal direction
    print('final_df: \n{}'.format(final_df))
    return final_df # return final_df in case information lost

In [20]:
eval_list = []
tmp_dict = {}
final_df = evaluation_of_matching_result(standard_df, matching_df1, final_df, eval_list, tmp_dict)


# of cluster in standard cluster: 0

std_total_duration = 8.42884693378495 sec
Function name in cluster: 
               std_func_name  count
13               _int_malloc    148
19                    malloc    133
20        malloc_consolidate    105
12                 _int_free     87
8    __strcmp_sse2_unaligned     29
16                      free     12
23                    strlen     10
4              __libc_calloc      9
21                    memchr      3
11   __strstr_sse2_unaligned      3
15         arena_get2.part.3      2
2                _IO_vfscanf      2
22                   realloc      1
3              _IO_vsnprintf      1
18                    getpid      1
17             get_free_list      1
7              __strchr_sse2      1
6              __memset_sse2      1
14              _int_realloc      1
5              __memcpy_sse2      1
1              _IO_sputbackc      1
10  __strncpy_sse2_unaligned      1
9    __strcpy_sse2_unaligned      1
0         _IO_default_xsputn 

mtch_total_duration = 3.2679893032042013 sec
Function name in cluster: 
                  mtch_func_name  count
17            finish_task_switch     94
22           pick_next_task_fair     26
36               sys_sched_yield     18
43                   update_curr     13
1                   __calc_delta     10
30                 schedule_tail      9
29             sched_setaffinity      9
10                cpuacct_charge      8
21              pick_next_entity      8
48               yield_task_fair      8
44               update_load_avg      4
45           update_min_vruntime      4
40       update_blocked_averages      3
11                dequeue_entity      3
38                try_to_wake_up      3
41              update_cfs_group      3
12             dequeue_task_fair      3
32           select_task_rq_fair      2
19                      idle_cpu      2
13             down_read_trylock      2
46               update_rq_clock      2
4         account_entity_dequeue      2
3   __up

std_total_duration = 2.526085828702353 sec
Function name in cluster: 
                                       std_func_name  count
0  [unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...    157


# of cluster in matching cluster: 9
mtch_total_duration = 2.184302488996746 sec
Function name in cluster: 
                                      mtch_func_name  count
0  [unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...    163

---------------------------------------------------------
Total number of function name in cluster: 163
merge frame:
                                        std_func_name  count_x  \
0  [unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...      157   

                                      mtch_func_name  count_y  min_value  
0  [unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...      163        157  

num_t_stdswarm = 157
intersection rate = (num_t_stdswarm / total_num_t_mtchswarm) x 100% = 96.31901840490798%
---------------------------------------------------------

# of cluste

In [21]:
final_df

Unnamed: 0,std_cluster_ID,Before: function_name,mtch_cluster_ID,After: funciton name,std_duration(sec),mtch_duration(sec),cluster_diff(sec)
0,0,_IO_default_xsputn _IO_sputbackc _IO_vfscanf...,6,_PyEval_EvalFrameDefault,8.428847,6.324499,2.104348
1,1,decode_mcu decompress_onepass get_sof googl...,4,PyEval_EvalCodeEx [unknown](/usr/bin/python3.6,7.456726,4.066413,3.390313
2,2,Eigen::NonBlockingThreadPoolTempl<tensorflow::...,11,__accumulate_pelt_segments __calc_delta __up...,6.212124,3.784796,2.427328
3,3,[unknown](/usr/bin/python3.6 _PyEval_EvalFram...,0,_IO_link_in _IO_vsprintf _int_free _int_mal...,6.178297,3.552647,2.62565
4,4,PyDict_New PyDict_SetDefault PyDict_SetItem ...,5,decode_mcu decompress_onepass jpeg_make_d_de...,4.032436,3.467329,0.565107
5,5,__lock_text_start __schedule _cond_resched ...,14,PyDict_New PyDict_SetDefault PyDict_SetItem ...,3.503117,3.464203,0.038914
6,6,__calc_delta __enqueue_entity __init_waitque...,2,__lock_text_start __schedule _cond_resched ...,3.297108,3.267989,0.029119
7,7,apic_timer_interrupt entry_SYSCALL_64_after_h...,8,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...,3.149724,2.730261,0.419462
8,8,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...,7,call_function_interrupt entry_SYSCALL_64_afte...,2.8001,2.72149,0.07861
9,9,Eigen::internal::TensorExecutor<Eigen::TensorA...,3,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...,2.740506,2.55471,0.185796


###  output to csv 
output to csv under folder sofalog 

In [25]:
logdir = 'sofalog/' # please adjust the output directory path to fit your need 
final_df.to_csv(logdir + 'swarm_diff.csv') 