## Steps prior to running the experiments
1. Download all files (they are in `tar.gz` format) and decompress them.
2. Create two new directories, one for all files from 2021 and one for 2022.
3. Each new directory should contain four sub-directories: `output-naive-accurate`, `output-naive-rpcid`, `output-partial`, and `output-rebuild`.
All experiments run on one year's data.
4. Each sub-directory contains trace file(s) in `csv.gz` format.

In [None]:
# Set path to the downloaded data. The top-level directory should contain four sub-directories:
# output-naive-accurate, output-naive-rpcid, output-partial, output-rebuild.
# End the path with '/' (e.g. '/data/traces-2021/'): sub-directory names are appended to it later.
path = 'SET_YOUR_PATH' 
year = '2021' # should be '2021' or '2022'; selects which CSV columns are read

## Required packages

In [None]:
# Import required packages
import csv
import glob
import json
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns

## Compare three topological characteristics of output traces
Maximum depth, maximum fanout, and size

In [None]:
class ModeCharacterizer:
    """Collect per-trace topology metrics for one output mode.

    Every ``.csv.gz`` file found under ``path`` is read, rows are grouped by
    ``traceid``, and for each trace three numbers are appended to the result
    lists: size, maximum depth and maximum fanout.

    Parameters
    ----------
    path : directory that is walked recursively for ``.csv.gz`` trace files.
    the_year : '2021' selects the 2021 column set; any other value selects
        the 2022 column set.
    mode : label of the mode being characterized; stored but not otherwise
        used by this class.
    """

    def __init__(self, path, the_year, mode):
        self.num_traces_processed = 0
        self.dir = path
        self.year = the_year
        self.size_list = []
        self.depth_list = []
        self.width_list = []
        # Never filled by this class; kept so existing attribute access keeps working.
        self.max_fanout_list = []
        self.mode = mode

    def process_all_files(self):
        """Read every ``.csv.gz`` file under ``self.dir`` and process its traces."""
        if self.year == '2021':
            columns_to_read = ['traceid', 'timestamp', 'rpcid', 'um', 'rpctype', 'dm', 'interface', 'rt']
        else:
            columns_to_read = ['traceid', 'rpcid', 'rpctype', 'um', 'interface', 'dm', 'rt']

        for root, _dirs, files in os.walk(self.dir):
            for file in files:
                if not file.endswith('.csv.gz'):
                    continue
                file_path = os.path.join(root, file)

                # Read rpcid as text. With dtype inference, a file whose rpcids
                # all look numeric would be parsed as floats and '0.10' would
                # collapse into '0.1' before the string conversion below.
                df = pd.read_csv(
                    file_path,
                    compression='gzip',
                    usecols=columns_to_read,
                    dtype={'rpcid': str},
                )
                # Missing rpcids become the string 'nan', as before.
                df['rpcid'] = df['rpcid'].astype(str)
                self.process_one_file(df)

                del df  # release the frame before the next file is loaded

    # used to sort rpcids
    def key(self, rpcid):
        """Return the number of '.' separators in ``rpcid`` (its nesting level)."""
        return rpcid.count('.')

    def sort_rpcids(self, rpcids):
        """Return ``rpcids`` ordered by number of periods.

        The sort is stable, so rpcids with the same number of periods keep
        their input order. Parents therefore always precede their children,
        which ``find_all_roots`` relies on.
        """
        return sorted(rpcids, key=self.key)

    def process_one_file(self, df):
        """Compute size, depth and fanout for every trace in ``df``.

        ``df`` must have ``traceid`` and string-valued ``rpcid`` columns.
        """
        for _tid, trace_df in df.groupby('traceid'):
            rpcids = self.sort_rpcids(trace_df['rpcid'].tolist())
            rpcid_to_root = self.find_all_roots(rpcids)
            root_to_rpcids = self.get_trees(rpcid_to_root)

            self.size_list.append(self.calculate_size(root_to_rpcids))

            depth = self.calculate_max_depth(root_to_rpcids)
            roots = list(root_to_rpcids.keys())
            width = self.calculate_max_fanout(roots, rpcids)

            self.depth_list.append(depth)
            self.width_list.append(width)
            self.num_traces_processed += 1

    # the number of fanout is the number of children the parent node calls in a trace
    def calculate_max_fanout(self, roots, rpcids):
        """Return the largest number of child rpcids under a single parent.

        Root rpcids are not counted as anyone's child. Returns 0 when the
        trace has no non-root rpcid.
        """
        root_set = set(roots)  # O(1) membership instead of scanning a list
        children_per_parent = {}

        for rpcid in rpcids:
            if rpcid in root_set:
                continue
            parent = rpcid.rsplit('.', 1)[0]
            children_per_parent[parent] = children_per_parent.get(parent, 0) + 1

        return max(children_per_parent.values(), default=0)

    # return all subtrees in a call graph
    def get_trees(self, rpcid_to_root):
        """Invert ``rpcid_to_root`` into ``{root: [rpcids of that subtree]}``.

        Because a root maps to itself, each list also contains its own root.
        """
        root_to_rpcids = {}
        for rpcid, root in rpcid_to_root.items():
            root_to_rpcids.setdefault(root, []).append(rpcid)
        return root_to_rpcids

    # the size of a trace is the number of nodes in the call graph
    # the same microservice could repeat in a call graph, and each occurrence adds 1 to the size
    def calculate_size(self, root_to_rpcids):
        """Return the trace size: one per rpcid plus one extra per subtree.

        NOTE(review): the extra node per subtree presumably stands for the
        caller of the root call (each rpcid row being a call) -- confirm.
        """
        return sum(len(members) + 1 for members in root_to_rpcids.values())

    # root is of depth 1
    def calculate_max_depth(self, root_to_rpcids):
        """Return the maximum depth over all subtrees (1 for an empty mapping).

        An rpcid's depth is its number of periods relative to its root, plus 2,
        so the root rpcid itself sits at depth 2.
        NOTE(review): depth 1 presumably belongs to the caller of the root
        call -- confirm.
        """
        max_depth = 1
        for root, members in root_to_rpcids.items():
            root_level = root.count('.')
            for rpcid in members:
                max_depth = max(max_depth, rpcid.count('.') - root_level + 2)
        return max_depth

    # A root is a rpcid whose parent rpcid does not exist in the trace
    def find_all_roots(self, rpcids):
        """Map every rpcid to the root of the subtree it belongs to.

        ``rpcids`` must be ordered so that a parent precedes its children
        (see ``sort_rpcids``); otherwise the parent's root lookup raises
        ``KeyError``.
        """
        known = set(rpcids)  # O(1) parent lookup; the list scan was quadratic
        rpcid_to_root = {}

        for rpcid in rpcids:
            # An rpcid without a period has no parent, so it is a root
            # (covers '0' in the 2022 data and arbitrary ids such as '25').
            if '.' not in rpcid:
                rpcid_to_root[rpcid] = rpcid
                continue

            parent = rpcid.rsplit('.', 1)[0]
            if parent in known:
                rpcid_to_root[rpcid] = rpcid_to_root[parent]
            else:
                rpcid_to_root[rpcid] = rpcid

        return rpcid_to_root

    def get_num_traces(self):
        """Return the number of traces processed so far."""
        return self.num_traces_processed

    def get_trace_sizes(self):
        """Return the list of per-trace sizes."""
        return self.size_list

    def get_trace_widths(self):
        """Return the list of per-trace maximum fanouts."""
        return self.width_list

    def get_trace_depths(self):
        """Return the list of per-trace maximum depths."""
        return self.depth_list
            

In [None]:
# create a ModeCharacterizer for each of the four modes
def init_trace_characterizer(path, year, mode):
    """Build a ModeCharacterizer for ``mode`` and process every trace file under ``path``."""
    characterizer = ModeCharacterizer(path, year, mode)
    characterizer.process_all_files()
    return characterizer

# Path for each mode. os.path.join inserts the separator when `path` does not
# end with one, so both '/data/2021' and '/data/2021/' work.
naive_rpcid_path = os.path.join(path, 'output-naive-rpcid/')
naive_accurate_path = os.path.join(path, 'output-naive-accurate/')
partial_path = os.path.join(path, 'output-partial/')
rebuild_path = os.path.join(path, 'output-rebuild/')

# One characterizer per mode; each call reads every trace file of that mode.
naive_rpcid_char = init_trace_characterizer(naive_rpcid_path, year, 'naive_rpcid')
naive_accurate_char = init_trace_characterizer(naive_accurate_path, year, 'naive_accurate')
partial_char = init_trace_characterizer(partial_path, year, 'partial')
rebuild_char = init_trace_characterizer(rebuild_path, year, 'rebuild')

## Plot the CDFs to compare trace size, maximum trace depth, and maximum trace fanout (width) for the four modes

In [None]:
# Figure style shared by all plots below.
sns.set(font_scale=1.8)
sns.set_style("whitegrid")
width = 5
legend_font = 25
# One color per mode, in the order the modes are plotted.
c1, c2, c3, c4 = "#d7191c", "#fdae61", "#abd9e9", "#2c7bb6"

In [None]:
# Compare trace sizes: one ECDF curve per mode on a log-scaled x axis.
_size_curves = (
    (naive_rpcid_char, "Naive-rpcid", c1),
    (naive_accurate_char, "Naive-accurate", c2),
    (partial_char, "Partial", c3),
    (rebuild_char, "Casper", c4),
)
for _char, _label, _color in _size_curves:
    sns.ecdfplot(data=_char.get_trace_sizes(), label=_label, color=_color, linewidth=2)

plt.xlabel("Trace Size")
plt.xscale('log')
plt.gca().set_ylabel('')
plt.legend()
plt.show()

In [None]:
# Compare maximum depths: one ECDF curve per mode on a log-scaled x axis.
_depth_curves = (
    (naive_rpcid_char, "Naive-rpcid", c1),
    (naive_accurate_char, "Naive-accurate", c2),
    (partial_char, "Partial", c3),
    (rebuild_char, "Casper", c4),
)
for _char, _label, _color in _depth_curves:
    sns.ecdfplot(data=_char.get_trace_depths(), label=_label, color=_color, linewidth=2)

plt.xlabel("Max Depth")
plt.gca().set_ylabel('')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Compare maximum fanouts: one ECDF curve per mode on a log-scaled x axis.
_fanout_curves = (
    (naive_rpcid_char, "Naive-rpcid", c1),
    (naive_accurate_char, "Naive-accurate", c2),
    (partial_char, "Partial", c3),
    (rebuild_char, "Casper", c4),
)
for _char, _label, _color in _fanout_curves:
    sns.ecdfplot(data=_char.get_trace_widths(), label=_label, color=_color, linewidth=2)

plt.xlabel("Max Width")
plt.gca().set_ylabel('')
plt.xscale('log')
plt.legend()
plt.show()

## Impact of recovery mechanisms in CASPER algorithm

In [None]:
# Helper Classes for next part
class FilledMissValCharacterizer:
    """Measure how many missing um/dm values of one naive-rpcid file were filled
    in the matching rebuild file.

    A value counts as missing when it equals the placeholder ``'(?)'``.
    The per-trace results are written as two intermediate CSV files
    (``<name>-um.csv`` and ``<name>-dm.csv``) into ``homepath``.

    Parameters
    ----------
    homepath : directory that receives the intermediate CSV files.
    rpcid_file : path of the naive-rpcid ``.csv.gz`` file.
    rebuild_file : path of the corresponding rebuild ``.csv.gz`` file.
    the_year : '2021' selects the 2021 column set; any other value selects
        the 2022 column set.
    """

    # Placeholder the trace files use for an unknown um/dm value.
    _MISSING = '(?)'

    def __init__(self, homepath, rpcid_file, rebuild_file, the_year):
        self.homepath = homepath
        self.num_traces_processed = 0
        self.rpcid_file = rpcid_file
        self.rebuild_file = rebuild_file
        # traceid -> {'um_rpcid_list': [...], 'dm_rpcid_list': [...]}
        self.tid_to_lists = {}

        self.output_lists = {'um_num': [],
                             'um_perc': [],
                             'dm_num': [],
                             'dm_perc': []}

        self.year = the_year

    def process_file(self):
        """Compare the two files trace by trace and save the intermediate CSVs.

        Raises ``KeyError`` if the rebuild file contains a traceid that is
        absent from the naive-rpcid file.
        """
        if self.year == '2021':
            columns_to_read = ['traceid', 'timestamp', 'rpcid', 'um', 'rpctype', 'dm', 'interface', 'rt']
        else:
            columns_to_read = ['traceid', 'rpcid', 'rpctype', 'um', 'interface', 'dm', 'rt']

        # rpcid is read as text in both files so that the isin() match below
        # compares like with like even if one file's rpcids all look numeric.
        rpcid_df = pd.read_csv(self.rpcid_file, compression='gzip',
                               usecols=columns_to_read, dtype={'rpcid': str})
        rebuild_df = pd.read_csv(self.rebuild_file, compression='gzip',
                                 usecols=columns_to_read, dtype={'rpcid': str})
        self.populate_lists(rpcid_df)

        for tid, trace_df in rebuild_df.groupby('traceid'):
            missing = self.tid_to_lists[tid]
            self._record_filled(trace_df, missing['um_rpcid_list'], 'um')
            self._record_filled(trace_df, missing['dm_rpcid_list'], 'dm')
            self.num_traces_processed += 1

        self.save_lists()
        del rpcid_df, rebuild_df

    def _record_filled(self, trace_df, rpcids, column):
        """Append the count and percentage of ``rpcids`` whose ``column`` is now filled.

        Nothing is recorded when ``rpcids`` is empty or none of them occurs
        in ``trace_df``.
        """
        if not rpcids:
            return
        rows = trace_df[trace_df['rpcid'].isin(rpcids)]
        if len(rows) == 0:
            return
        filled = int((rows[column] != self._MISSING).sum())
        percentage = round(filled / len(rows) * 100, 2)
        self.output_lists[column + '_num'].append(filled)
        self.output_lists[column + '_perc'].append(percentage)

    # save intermediate data
    def save_lists(self):
        """Write the collected um and dm results to ``homepath`` as CSV files.

        The file names are derived from the rebuild file name with its last
        extension removed, e.g. ``x.csv.gz`` -> ``x.csv-um.csv``.
        """
        stem = os.path.basename(self.rebuild_file).rsplit('.', 1)[0]

        for kind in ('um', 'dm'):
            num_key, perc_key = kind + '_num', kind + '_perc'
            frame = pd.DataFrame({num_key: self.output_lists[num_key],
                                  perc_key: self.output_lists[perc_key]})
            # os.path.join keeps the file inside homepath even when homepath
            # has no trailing separator.
            frame.to_csv(os.path.join(self.homepath, stem + '-' + kind + '.csv'), index=False)

    # use rpcid_df to create two lists for each tid
    def populate_lists(self, df):
        """Record, per traceid, the rpcids whose um (resp. dm) is missing in ``df``."""
        for tid, trace_df in df.groupby('traceid'):
            # only keep the rows that have missing values
            um_missing = trace_df[trace_df['um'] == self._MISSING]
            dm_missing = trace_df[trace_df['dm'] == self._MISSING]
            self.tid_to_lists[tid] = {
                'um_rpcid_list': um_missing['rpcid'].tolist(),
                'dm_rpcid_list': dm_missing['rpcid'].tolist(),
            }


# Helper class to synthesize intermediate data
class FilledMissValHelper:
    """Merge the intermediate ``-um.csv`` / ``-dm.csv`` files into flat lists.

    Parameters
    ----------
    path : directory that is walked recursively for intermediate files and
        whose top level is cleaned afterwards.
    the_year : stored but not otherwise used by this class.
    um_or_dm : 'um' reads the um files; any other value reads the dm files.
    """

    def __init__(self, path, the_year, um_or_dm):
        self.num_traces_processed = 0
        self.dir = path
        self.year = the_year
        self.um_or_dm = um_or_dm
        self.um_num = []
        self.um_perc = []
        self.dm_num = []
        self.dm_perc = []

    def process_all_files(self):
        """Load the intermediate files, delete them, and return ``(num, perc)`` lists.

        The lists are instance state and are extended, not reset, so a second
        call after the files were deleted returns the same data again.
        Returns the dm lists when ``um_or_dm == 'dm'``, otherwise the um lists.
        """
        if self.um_or_dm == 'um':
            suffix, num_col, perc_col = '-um.csv', 'um_num', 'um_perc'
            num_list, perc_list = self.um_num, self.um_perc
        else:
            suffix, num_col, perc_col = '-dm.csv', 'dm_num', 'dm_perc'
            num_list, perc_list = self.dm_num, self.dm_perc

        for root, _dirs, files in os.walk(self.dir):
            for file in files:
                if not file.endswith(suffix):
                    continue
                df = pd.read_csv(os.path.join(root, file))
                num_list.extend(df[num_col].tolist())
                perc_list.extend(df[perc_col].tolist())
                del df

        # delete intermediate data, the csv files stored at the top-level directory
        self.clean_intermediate_files()

        if self.um_or_dm == 'dm':
            return self.dm_num, self.dm_perc
        return self.um_num, self.um_perc

    def clean_intermediate_files(self):
        """Delete the ``*-um.csv`` and ``*-dm.csv`` files at the top level of ``self.dir``.

        Only the intermediate files are matched, so any other ``.csv`` file a
        user keeps in the data directory is left untouched. Deletion errors
        are reported and do not stop the cleanup.
        """
        for pattern in ('*-um.csv', '*-dm.csv'):
            for file in glob.glob(os.path.join(self.dir, pattern)):
                try:
                    os.remove(file)
                except OSError as e:
                    print(f"Error: {file} : {e.strerror}")

In [None]:
class ImpactCharacterizer:
    """Quantify the effect of the recovery mechanisms on the rebuild traces.

    ``path`` is the top-level data directory; the class reads its
    ``output-rebuild``, ``output-naive-rpcid`` and ``output-naive-accurate``
    sub-directories. ``the_year`` selects the CSV column set ('2021' or the
    2022 layout for any other value).
    """

    def __init__(self, path, the_year):
        self.num_traces_processed = 0
        self.dir = path
        self.year = the_year

        # Recovery mechanism #1: adding missing calls
        self.num_miss_rpcid_list = []

        # Recovery mechanism #2: filled-in missing DMs
        self.num_filled_dm_list = []
        self.perc_filled_dm_list = []

        # Per CPE source: modified rpcids at the source, modified rpcids
        # downstream of it, and number of distinct downstream CPEs.
        self.num_rpcids_at_source_list = []
        self.num_rpcids_downstream_list = []
        self.num_downstream_cpes_list = []

        # Additional complete traces
        self.num_unrecoverable_rpcid_list = []

    @staticmethod
    def _percentage(part, whole):
        """Return ``part / whole`` as a percentage rounded to 2 decimals (0.0 if ``whole`` is 0)."""
        if whole == 0:
            return 0.0
        return round(part / whole * 100, 2)

    @staticmethod
    def _describe(data):
        """Return rounded summary statistics of ``data``; also works for an empty list."""
        # A float dtype keeps describe() numeric when the list is empty.
        return pd.Series(data, dtype='float64').describe().round(2)

    def process_files_for_adding_mising_calls(self):
        """Collect missing / unrecoverable rpcid counts from the error-stats JSON files.

        Every file in ``output-rebuild/error-stats`` whose name contains
        "errors" is parsed; each top-level entry contributes its
        ``METADATA.missing_rpcids`` and ``METADATA.unrecoverable_rpcids``.
        """
        directory_path = os.path.join(self.dir, "output-rebuild", "error-stats")

        for error_file in os.listdir(directory_path):
            if "errors" not in error_file:
                continue
            with open(os.path.join(directory_path, error_file), 'r') as file:
                json_data = json.load(file)
            for entry in json_data.values():
                metadata = entry["METADATA"]
                self.num_miss_rpcid_list.append(metadata["missing_rpcids"])
                self.num_unrecoverable_rpcid_list.append(metadata["unrecoverable_rpcids"])

    # use the helper classes defined above to compare the missing dms in
    # naive-rpcid traces against those in rebuild/casper traces
    def process_files_for_filled_umdm(self):
        """Fill ``num_filled_dm_list`` and ``perc_filled_dm_list``.

        Each naive-rpcid file is paired with the rebuild file of the same
        name (with 'naive-rpcid' replaced by 'rebuild').
        """
        naive_rpcid_path = os.path.join(self.dir, 'output-naive-rpcid')
        rebuild_path = os.path.join(self.dir, 'output-rebuild')
        # Trailing separator guaranteed, so the home path is safe to use as a string prefix.
        home = os.path.join(self.dir, '')

        for name in os.listdir(naive_rpcid_path):
            if not name.endswith('.csv.gz'):
                continue
            rpcid_file = os.path.join(naive_rpcid_path, name)
            rebuild_file = os.path.join(rebuild_path, name.replace('naive-rpcid', 'rebuild'))
            FilledMissValCharacterizer(home, rpcid_file, rebuild_file, self.year).process_file()

        # change "dm" to "um" to get "um" stats
        helper = FilledMissValHelper(self.dir, self.year, 'dm')
        # Called once: the helper deletes the intermediate files after reading them.
        self.num_filled_dm_list, self.perc_filled_dm_list = helper.process_all_files()

    def process_all_files(self):
        """Run every processing pass: error stats, filled DMs, then the rebuild traces."""
        self.process_files_for_adding_mising_calls()  # processing all the error json files
        self.process_files_for_filled_umdm()

        if self.year == '2021':
            columns_to_read = ['traceid', 'timestamp', 'rpcid', 'um', 'rpctype', 'dm', 'interface', 'rt']
        else:
            columns_to_read = ['traceid', 'rpcid', 'rpctype', 'um', 'interface', 'dm', 'rt']

        for root, _dirs, files in os.walk(os.path.join(self.dir, "output-rebuild")):
            for file in files:
                if not file.endswith('.csv.gz'):
                    continue
                # rpcid is read as text so numeric-looking ids are not reformatted.
                df = pd.read_csv(os.path.join(root, file), compression='gzip',
                                 usecols=columns_to_read, dtype={'rpcid': str})
                df['rpcid'] = df['rpcid'].astype(str)
                self.process_one_file(df)
                del df

    # used to sort rpcids
    def key(self, rpcid):
        """Return the number of '.' separators in ``rpcid``."""
        return rpcid.count('.')

    def sort_rpcids(self, rpcids):
        """Return ``rpcids`` ordered by number of periods (stable for ties)."""
        return sorted(rpcids, key=self.key)

    def process_one_file(self, df):
        """Accumulate per-CPE-source statistics for every trace in ``df``.

        Only rpcids containing '-' (i.e. modified ones) are considered. The
        text before the first '-' identifies the source. Raises ``KeyError``
        if a downstream rpcid is seen before an rpcid that registers its source.
        """
        for _tid, trace_df in df.groupby('traceid'):
            rpcids = self.sort_rpcids(trace_df['rpcid'].tolist())
            sources_to_nums = {}  # per-source counters for this trace

            for rpcid in rpcids:
                last_hyphen_index = rpcid.rfind('-')
                if last_hyphen_index == -1:
                    continue  # skip if we didn't modify

                source = rpcid.split('-', 1)[0]
                ends_at_cpe = '.' not in rpcid[last_hyphen_index:]

                if ends_at_cpe and rpcid.count('-') == 2:
                    # a cpe source; there may be several modified rpcids per source
                    if source not in sources_to_nums:
                        sources_to_nums[source] = {"num_rpcid_modified": 1,
                                                   "num_downstream_rpcid": 0,
                                                   "downstream_cpes": set()}
                    else:
                        sources_to_nums[source]["num_rpcid_modified"] += 1
                elif ends_at_cpe:
                    # a downstream cpe, identified by the rpcid up to its second-last hyphen
                    second_last_hyphen_index = rpcid.rfind("-", 0, last_hyphen_index)
                    sources_to_nums[source]["downstream_cpes"].add(rpcid[:second_last_hyphen_index])
                    sources_to_nums[source]["num_downstream_rpcid"] += 1
                else:
                    # any other modified downstream rpcid
                    sources_to_nums[source]["num_downstream_rpcid"] += 1

            # Each entry of the global lists describes one CPE source,
            # regardless of which trace it came from.
            for source_data in sources_to_nums.values():
                self.num_rpcids_at_source_list.append(source_data["num_rpcid_modified"])
                self.num_rpcids_downstream_list.append(source_data["num_downstream_rpcid"])
                self.num_downstream_cpes_list.append(len(source_data["downstream_cpes"]))

            self.num_traces_processed += 1

    def print_stats(self, cur_list):
        """Print how many entries of ``cur_list`` are positive and summarize them.

        An empty list, or one without positive entries, is reported without
        raising.
        """
        num_total = len(cur_list)
        pos_list = [num for num in cur_list if num > 0]
        num_affected = len(pos_list)
        percentage = self._percentage(num_affected, num_total)

        print("total_traces : " + (str)(num_total))
        print("affected_traces : " + (str)(num_affected))
        print("percentage of traces affected: " + (str) (percentage))

        if not pos_list:
            print("no affected traces; summary statistics skipped")
            return

        stats = pd.Series(pos_list).describe().round(2)
        print(stats)
        self.print_p99(pos_list)

    def print_p99(self, data):
        """Print the 99th percentile of ``data`` rounded to 2 decimals ('n/a' if empty)."""
        if len(data) == 0:
            print("99th percentile (P99):", "n/a")
            return
        rounded_p99 = round(np.percentile(data, 99), 2)
        print("99th percentile (P99):", rounded_p99)

    # get the total number of traces in all csv files in a directory
    def count_num_trace_id(self, directory_path):
        """Return the sum, over every ``.csv.gz`` file, of its distinct traceids."""
        count = 0
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                if not file.endswith('.csv.gz'):
                    continue
                # Only the traceid column is needed for counting.
                df = pd.read_csv(os.path.join(root, file), compression='gzip', usecols=['traceid'])
                count += df['traceid'].nunique(dropna=False)
                del df
        return count

    def stats_adding_missing_calls(self):
        """Print statistics for recovery mechanism #1 (added missing calls)."""
        print("============== YEAR: " + self.year + " ==============")
        print("~~~~~~~~~~~~~~~~~~~~ adding_missing_calls ~~~~~~~~~~~~~")
        self.print_stats(self.num_miss_rpcid_list)

    def stats_filled_in_missing_DMs(self):
        """Print statistics for recovery mechanism #2 (filled-in missing DMs)."""
        print("~~~~~~~~~~~~~~~~ ####### num_filled_in_missing_DMs #######~~~~~~~~~~~~~~~~")
        self.print_stats(self.num_filled_dm_list)
        print("")
        print("~~~~~~~~~~~~~~~~ ####### perc_filled_in_missing_DMs #######~~~~~~~~~~~~~~~~")
        self.print_stats(self.perc_filled_dm_list)
        print("")

    def stats_updated_rpcids_CPE_source(self):
        """Print statistics of the number of modified rpcids per CPE source."""
        print("~~~~~~~~~~~~~~~~ ####### num_modified_rpcids_at_source_cpe ####### ~~~~~~~~~~~~~~~~")
        print(self._describe(self.num_rpcids_at_source_list))
        self.print_p99(self.num_rpcids_at_source_list)
        print("")

    def stats_fixed_downstream_CPEs(self):
        """Print statistics of the number of distinct downstream CPEs per source."""
        print("~~~~~~~~~~~~~~~~ ####### num_downstream_CPEs ####### ~~~~~~~~~~~~~~~~")
        print(self._describe(self.num_downstream_cpes_list))
        self.print_p99(self.num_downstream_cpes_list)
        print("")

    def stats_recovered_rpcids_CPE_downstream(self):
        """Print statistics of the number of modified downstream rpcids per source."""
        print("~~~~~~~~~~~~~~~~ ####### num_modified_downstream_rpcid ####### ~~~~~~~~~~~~~~~~")
        print(self._describe(self.num_rpcids_downstream_list))
        self.print_p99(self.num_rpcids_downstream_list)
        print("")

    def stats_additional_complete_traces(self):
        """Print how many rebuild traces are complete and how many of those are additional.

        A rebuild trace counts as complete when its unrecoverable-rpcid count
        is 0. The additional count is the complete rebuild traces minus the
        naive-accurate traces, reported relative to the naive-rpcid traces.
        """
        num_naive_rpcid = self.count_num_trace_id(os.path.join(self.dir, "output-naive-rpcid"))
        num_naive_accurate = self.count_num_trace_id(os.path.join(self.dir, "output-naive-accurate"))

        print("============== YEAR: " + self.year + " ==============")
        print("~~~~~~~~~~~~~~~~~~~~ COMPLETE TRACES ~~~~~~~~~~~~~")
        num_total = len(self.num_unrecoverable_rpcid_list)
        num_zero = sum(1 for num in self.num_unrecoverable_rpcid_list if num == 0)
        percentage = self._percentage(num_zero, num_total)
        print("num_total_rebuild_traces : " + (str)(num_total))
        print("num_complete_rebuild_traces : " + (str)(num_zero))
        print("percentage of complete traces: " + (str) (percentage))

        print("~~~~~~~~~~~~~~~~~~~~ ADDITIONAL COMPLETE TRACES ~~~~~~~~~~~~~")
        print("num_total_traces, i.e. num_naive_rpcid_traces: " + (str)(num_naive_rpcid) )
        print("num_naive_accurate_traces: " + (str)(num_naive_accurate) )
        print("num_complete_rebuild_traces : " + (str)(num_zero))
        num_additional_complete = num_zero - num_naive_accurate
        print("num_additional_complete_traces: " + (str)(num_additional_complete))
        percentage_2 = self._percentage(num_additional_complete, num_naive_rpcid)
        print("perc_additional_complete_traces: "  + (str)(percentage_2))

In [None]:
# Build the impact characterizer for the selected data directory and year, then run
# all of its passes (error-stats JSON files, filled-in DMs, rebuild trace files).
impact_char = ImpactCharacterizer(path, year)
impact_char.process_all_files() 

In [None]:
# Recovery mechanism #1: statistics on the missing calls that were added.
impact_char.stats_adding_missing_calls()

In [None]:
# Recovery mechanism #2: number and percentage of missing DMs that were filled in.
impact_char.stats_filled_in_missing_DMs()

In [None]:
# Number of modified rpcids at each CPE source.
impact_char.stats_updated_rpcids_CPE_source()

In [None]:
# Number of modified rpcids downstream of each CPE source.
impact_char.stats_recovered_rpcids_CPE_downstream()

## Additional complete traces
A trace is complete if there are no unrecoverable rpcids (explained in the paper Section 4.3).

In [None]:
# Count the complete rebuild traces and how many of them are additional to the naive-accurate traces.
impact_char.stats_additional_complete_traces()