# Auto-caption
Date: 2018/11/14
Purpose: swarm name matching using the data below

Data source:
auto_caption4.csv
auto_caption5.csv
auto_caption7.csv
auto_caption8.csv
auto_caption9.csv
auto_caption10.csv
auto_caption11.csv

In [1]:
# system
import os
import sys
# 3rd party lib
import pandas as pd
from sklearn.cluster import KMeans
from fuzzywuzzy import fuzz # stirng matching

print('Python verison: {}'.format(sys.version))
print('\n############################')
print('Pandas verison: {}'.format(pd.show_versions()))

Python verison: 3.6.7 (default, Oct 21 2018, 04:56:05) 
[GCC 5.4.0 20160609]

############################

INSTALLED VERSIONS
------------------
commit: None
python: 3.6.7.final.0
python-bits: 64
OS: Linux
OS-release: 4.15.0-36-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.23.4
pytest: None
pip: 18.1
setuptools: 20.7.0
Cython: None
numpy: 1.14.5
scipy: 1.0.0
pyarrow: None
xarray: None
IPython: 6.5.0
sphinx: None
patsy: None
dateutil: 2.7.3
pytz: 2018.5
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 3.0.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: 0.7.3
lxml: None
bs4: None
html5lib: 1.0.1
sqlalchemy: 1.2.10
pymysql: None
psycopg2: None
jinja2: 2.8
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
Pandas verison: None


### Read file 

In [2]:
standard_df = pd.read_csv('auto_caption4.csv', names=['cluster_ID','timestamp','event','name'])
print('There are {} clusters in standard_df\n'.format(len(standard_df['cluster_ID'].unique())))
print(standard_df.head(5))

There are 15 clusters in standard_df

   cluster_ID     timestamp      event         name
0           0  1.544755e+09  14.624682  _int_malloc
1           0  1.544755e+09  14.624682       malloc
2           0  1.544755e+09  14.624682  _int_malloc
3           0  1.544755e+09  14.624682  _int_malloc
4           0  1.544755e+09  14.624682       malloc


In [3]:
# default is axis=0
standard_df_groupby = standard_df.groupby(['cluster_ID','name']).agg({'name':['count']})
print(standard_df.groupby(['cluster_ID','name']).agg({'name':['count']}))

                                                               name
                                                              count
cluster_ID name                                                    
0          _int_malloc                                           10
           free                                                   2
           malloc                                                 8
1          _int_free                                              9
           malloc_consolidate                                     7
2          PyDict_Next                                            1
           [unknown](/usr/bin/python3.6                           6
           _PyDict_LoadGlobal                                     5
3          [unknown](/usr/bin/python3.6                           7
4          entry_SYSCALL_64_after_hwframe                         4
           page_fault                                             7
           swapgs_restore_regs_and_return_to_use

#### Access data of multiIndex dataframe
* [pandas, how to access multiIndex dataframe?](https://stackoverflow.com/questions/36806517/pandas-how-to-access-multiindex-dataframe)

In [4]:
# get column names
df = standard_df_groupby.loc[0].reset_index() 
flat_column_names = []
for level in df.columns:        
    # tuple to list    
    flat_column_names.extend(list(level)) # extend(): in-place
    
# remove duplicate and empty
flat_column_names = filter(None, flat_column_names) # filter empty
flat_column_names = list(set(flat_column_names)) # deduplicate
print('original order: {}'.format(flat_column_names))

# change member order of list due to set is a random order
if flat_column_names[0] == 'count':
    myorder = [1,0]
    flat_column_names = [flat_column_names[i] for i in myorder]
    print('New order: {}'.format(flat_column_names))

original order: ['count', 'name']
New order: ['name', 'count']


In [5]:
standard_df_dict = {}

# Transform multi-index to single index, and update string to dict standard_df_dict
for id_of_cluster in standard_df['cluster_ID'].unique():
    print('\n# of cluster: {}'.format(id_of_cluster))
    df = standard_df_groupby.loc[id_of_cluster].reset_index() 
    df.columns = flat_column_names
    print(df.sort_values(by=['count'], ascending=False))
    
    standard_df_dict.update({id_of_cluster: df.name.str.cat(sep='  ', na_rep='?')})    

print('################################')
print('\nDictionary of swarm data: \n{}'.format(standard_df_dict))


# of cluster: 0
          name  count
0  _int_malloc     10
2       malloc      8
1         free      2

# of cluster: 1
                 name  count
0           _int_free      9
1  malloc_consolidate      7

# of cluster: 2
                           name  count
1  [unknown](/usr/bin/python3.6      6
2            _PyDict_LoadGlobal      5
0                   PyDict_Next      1

# of cluster: 3
                           name  count
0  [unknown](/usr/bin/python3.6      7

# of cluster: 4
                                         name  count
1                                  page_fault      7
3                   syscall_return_via_sysret      5
0              entry_SYSCALL_64_after_hwframe      4
2  swapgs_restore_regs_and_return_to_usermode      4

# of cluster: 5
                       name  count
0  _PyEval_EvalFrameDefault      8

# of cluster: 6
                                                name  count
0  [unknown](/usr/local/cuda-9.2/targets/x86_64-l...      1

# of cluster: 7


### Dataframe that i want to match

In [6]:
matching_df1 = pd.read_csv('auto_caption5.csv', names=['cluster_ID','timestamp','event','name'])
print('There are {} clusters in standard_df\n'.format(len(matching_df1['cluster_ID'].unique())))
print(matching_df1.head(5))

There are 15 clusters in standard_df

   cluster_ID     timestamp      event  \
0           0  1.544755e+09  14.625481   
1           0  1.544755e+09  14.625481   
2           0  1.544755e+09  14.625481   
3           0  1.544755e+09  14.625481   
4           0  1.544755e+09  14.625481   

                                                name  
0  tensorflow::grappler::ModifiesInputsInPlace(te...  
1                                         decode_mcu  
2                                         decode_mcu  
3                                         decode_mcu  
4                                         decode_mcu  


In [7]:
# default is axis=0
matching_df1_groupby = matching_df1.groupby(['cluster_ID','name']).agg({'name':['count']})
print(matching_df1.groupby(['cluster_ID','name']).agg({'name':['count']}))

                                                               name
                                                              count
cluster_ID name                                                    
0          decode_mcu                                            23
           jsimd_ycc_rgb_convert_sse2                             1
           tensorflow::grappler::ModifiesInputsInPlace(ten...     1
1          Eigen::internal::TensorExecutor<Eigen::TensorAs...     3
           float __vector Eigen::TensorEvaluator<Eigen::Te...     1
           float __vector Eigen::TensorEvaluator<Eigen::Te...     1
           tensorflow::AdjustContrastOpv2<Eigen::ThreadPoo...     2
           tensorflow::DatasetIterator<tensorflow::(anonym...     1
           void Eigen::TensorEvaluator<Eigen::TensorSlicin...     1
           void tensorflow::(anonymous namespace)::resize_...    15
2          __update_load_avg_blocked_se.isra.37                   1
           __update_load_avg_se.isra.38         

In [8]:
# get column names
df = matching_df1_groupby.loc[0].reset_index() 
flat_column_names = []
for level in df.columns:        
    # tuple to list    
    flat_column_names.extend(list(level)) # extend(): in-place

# remove duplicate and empty
flat_column_names = filter(None, flat_column_names) # filter empty
flat_column_names = list(set(flat_column_names)) # deduplicate
print(flat_column_names)

# change member order of list due to set is a random order
if flat_column_names[0] == 'count':
    myorder = [1,0]
    flat_column_names = [flat_column_names[i] for i in myorder]
    print('New order: {}'.format(flat_column_names))

['count', 'name']
New order: ['name', 'count']


In [9]:
matching_df1_dict = {}

# Transform multi-index to single index, and update string to dict standard_df_dict
for id_of_cluster in matching_df1['cluster_ID'].unique():
    print('\n# of cluster: {}'.format(id_of_cluster))
    df = matching_df1_groupby.loc[id_of_cluster].reset_index() 
    df.columns = flat_column_names
    print(df.sort_values(by=['count'], ascending=False))
    
    matching_df1_dict.update({id_of_cluster: df.name.str.cat(sep='  ', na_rep='?')})    

print('################################')
print('\nDictionary of swarm data: \n{}'.format(matching_df1_dict))


# of cluster: 0
                                                name  count
0                                         decode_mcu     23
1                         jsimd_ycc_rgb_convert_sse2      1
2  tensorflow::grappler::ModifiesInputsInPlace(te...      1

# of cluster: 1
                                                name  count
6  void tensorflow::(anonymous namespace)::resize...     15
0  Eigen::internal::TensorExecutor<Eigen::TensorA...      3
3  tensorflow::AdjustContrastOpv2<Eigen::ThreadPo...      2
1  float __vector Eigen::TensorEvaluator<Eigen::T...      1
2  float __vector Eigen::TensorEvaluator<Eigen::T...      1
4  tensorflow::DatasetIterator<tensorflow::(anony...      1
5  void Eigen::TensorEvaluator<Eigen::TensorSlici...      1

# of cluster: 2
                                    name  count
6                     finish_task_switch     28
10                   pick_next_task_fair      8
21                           update_curr      3
23                       yield_task_f

### string matching funciton
* 1-to-1 matching (or mapping)
* Github of fuzzywuzzy: [link](https://github.com/seatgeek/fuzzywuzzy)
* Search keyword: You can try 'fuzzywuzzy' + 'pandas'

In [19]:
def matching_two_dicts_of_swarm(standard_dict, matching_dict, res_dict):    
    """    
    match two dictoinaries with same amount of key-value pairs
    and return matching result, a dict of dict called res_dict.
    
    * standard_dict: The standard of dict
    * matching_dict: The dict that i want to match
    * res_dict: the result, a dict of dict
    """
    key = 0 # key: number, no string    
    pop_list = [k for k,v in matching_dict.items()]
    print(pop_list)
    for i in standard_dict.keys(): # control access index of standard_dict. a more pythonic way
        threshold = 0
        for j in pop_list: # control access index of matching_dict
            f_ratio = fuzz.ratio(standard_dict[i], matching_dict[j])
            if f_ratio > threshold: # update matching result only when the fuzz ratio is greater
                print('New matching fuzz ratio {} is higher than threshold {}'\
                      .format(f_ratio, threshold))
                key = j # update key
                threshold = f_ratio # update threshold value
                print('Update new threshold {}'\
                      .format(threshold))                
                res_dict.update({i: {j: matching_dict[i]}}) #         
        # pop out matched key-value pair of matching dict
        if pop_list:
            pop_list.remove(key) # remove specific value. remove() fails when no elements remains
        print(res_dict)
    return res_dict

In [20]:
res_dict = {}
res_dict = matching_two_dicts_of_swarm(standard_df_dict, matching_df1_dict, res_dict)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
New matching fuzz ratio 22 is higher than threshold 0
Update new threshold 22
New matching fuzz ratio 100 is higher than threshold 22
Update new threshold 100
{0: {4: 'decode_mcu  jsimd_ycc_rgb_convert_sse2  tensorflow::grappler::ModifiesInputsInPlace(tensorflow::NodeDef const&)'}}
New matching fuzz ratio 26 is higher than threshold 0
Update new threshold 26
New matching fuzz ratio 30 is higher than threshold 26
Update new threshold 30
New matching fuzz ratio 75 is higher than threshold 30
Update new threshold 75
{0: {4: 'decode_mcu  jsimd_ycc_rgb_convert_sse2  tensorflow::grappler::ModifiesInputsInPlace(tensorflow::NodeDef const&)'}, 1: {7: 'Eigen::internal::TensorExecutor<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>

In [21]:
print(res_dict)

{0: {4: 'decode_mcu  jsimd_ycc_rgb_convert_sse2  tensorflow::grappler::ModifiesInputsInPlace(tensorflow::NodeDef const&)'}, 1: {7: 'Eigen::internal::TensorExecutor<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer> > >, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float const, float const>, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorSlicingOp<Eigen::array<long, 1ul> const, Eigen::array<long, 1ul> const, Eigen::TensorMap<Eigen::Tensor<float 

### show all stats (Ans) and matching results (algorithm)

In [22]:
std_dict_to_df = pd.DataFrame.from_dict(standard_df_dict, orient='index', columns=['Before: function_name'])
std_dict_to_df['std_cluster_ID'] = std_dict_to_df.index
std_dict_to_df = std_dict_to_df[['std_cluster_ID', 'Before: function_name']]
std_dict_to_df

Unnamed: 0,std_cluster_ID,Before: function_name
0,0,_int_malloc free malloc
1,1,_int_free malloc_consolidate
2,2,PyDict_Next [unknown](/usr/bin/python3.6 _Py...
3,3,[unknown](/usr/bin/python3.6
4,4,entry_SYSCALL_64_after_hwframe page_fault sw...
5,5,_PyEval_EvalFrameDefault
6,6,[unknown](/usr/local/cuda-9.2/targets/x86_64-l...
7,7,decode_mcu tensorflow::grappler::IsConcat(ten...
8,8,_PyEval_EvalFrameDefault
9,9,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...


In [23]:
mtch_df1_dict_to_df = pd.DataFrame.from_dict(matching_df1_dict, orient='index', columns=['Matching function_name'])
mtch_df1_dict_to_df

Unnamed: 0,Matching function_name
0,decode_mcu jsimd_ycc_rgb_convert_sse2 tensor...
1,Eigen::internal::TensorExecutor<Eigen::TensorA...
2,__update_load_avg_blocked_se.isra.37 __update...
3,Eigen::NonBlockingThreadPoolTempl<tensorflow::...
4,_int_malloc free malloc
5,_PyEval_EvalFrameDefault
6,[unknown](/usr/bin/python3.6
7,_int_free arena_get2.part.3 malloc_consolidate
8,"std::_Function_handler<void (long, long), Eige..."
9,"std::_Hashtable<tensorflow::NodeDef const*, st..."


In [24]:
res_dict_to_df = pd.DataFrame()
res_dict_to_df

In [25]:
res_list = [k for k,v in res_dict.items()]
for key in res_list:
    df = pd.DataFrame.from_dict(res_dict[key], orient='index', columns=['After: funciton name']) # res_dict[key]: a dict
    df['mtch_cluster_ID'] = df.index
    #print(df)
    res_dict_to_df = res_dict_to_df.append(df, ignore_index=True) # df.append(): not in-place

res_dict_to_df = res_dict_to_df[['mtch_cluster_ID', 'After: funciton name']]
print(res_dict_to_df.head(5))

   mtch_cluster_ID                               After: funciton name
0                4  decode_mcu  jsimd_ycc_rgb_convert_sse2  tensor...
1                7  Eigen::internal::TensorExecutor<Eigen::TensorA...
2                6  __update_load_avg_blocked_se.isra.37  __update...
3                0  Eigen::NonBlockingThreadPoolTempl<tensorflow::...
4               12                          _int_malloc  free  malloc


In [26]:
final_df = pd.concat([std_dict_to_df, res_dict_to_df], axis=1)
final_df

Unnamed: 0,std_cluster_ID,Before: function_name,mtch_cluster_ID,After: funciton name
0,0,_int_malloc free malloc,4,decode_mcu jsimd_ycc_rgb_convert_sse2 tensor...
1,1,_int_free malloc_consolidate,7,Eigen::internal::TensorExecutor<Eigen::TensorA...
2,2,PyDict_Next [unknown](/usr/bin/python3.6 _Py...,6,__update_load_avg_blocked_se.isra.37 __update...
3,3,[unknown](/usr/bin/python3.6,0,Eigen::NonBlockingThreadPoolTempl<tensorflow::...
4,4,entry_SYSCALL_64_after_hwframe page_fault sw...,12,_int_malloc free malloc
5,5,_PyEval_EvalFrameDefault,5,_PyEval_EvalFrameDefault
6,6,[unknown](/usr/local/cuda-9.2/targets/x86_64-l...,13,[unknown](/usr/bin/python3.6
7,7,decode_mcu tensorflow::grappler::IsConcat(ten...,14,_int_free arena_get2.part.3 malloc_consolidate
8,8,_PyEval_EvalFrameDefault,10,"std::_Function_handler<void (long, long), Eige..."
9,9,[unknown](/usr/lib/x86_64-linux-gnu/libcuda.so...,2,"std::_Hashtable<tensorflow::NodeDef const*, st..."
