In [1]:
import itertools
import pandas as pd
import numpy as np
from IPython.display import display

# each input-output-program in single row

In [35]:
def apply_func(x, func):
    d = {"x": x}
    exec(func, d)
    return d["x"]

def apply_func_on_group(group, func):
    group_2 = group.copy()
    group_2["input"] = group["input"].apply(apply_func, args=(func,))
    group_2["program"] += "\n" + func
    return group_2

def apply_meta_func_on_group(group, meta_func):
    func_list = meta_func(group)
    print(meta_func.__name__, len(func_list))
    updated_group = pd.concat([apply_func_on_group(group, func) for func in func_list])
    return updated_group

def apply_meta_func(input_output_data_frame, meta_func):
    updated_input_output_data_frame = input_output_data_frame.groupby("program").apply(apply_meta_func_on_group, meta_func).reset_index(drop=True)
    return updated_input_output_data_frame

In [36]:
def get_all_substrings(input_string):
    length = len(input_string)
    return set([input_string[i:j+1] for i in range(length) for j in range(i,length)])

In [37]:
def get_intersection_of_list_of_sets(list_of_sets):
    set_0 = list_of_sets[0]
    for a_set in list_of_sets[1:]:
        set_0 = set_0.intersection(a_set)
    return set_0

In [38]:
def split_string(group):
    input_string_list = group["input"].values.tolist()
    list_of_set_of_separators = [get_all_substrings(string) for string in input_string_list]
    set_of_separators = get_intersection_of_list_of_sets(list_of_set_of_separators)
    candidate_functions_list = list(set(["x = x.split(\"{}\")".format(sep) for sep in set_of_separators]))
    return candidate_functions_list

In [39]:
def get_select_combinations(group):
    min_length = group["input"].apply(len).min()
    list_of_list = [list(itertools.combinations(range(min_length), k)) for k in range(1, min_length + 1)]
    reverse_list_of_list = [list(itertools.combinations(np.linspace(-min_length, -1, min_length).astype(int), k)) for k in range(1, min_length + 1)]
    select_list = list(itertools.chain.from_iterable(list_of_list + reverse_list_of_list))
    func_str_list = ["x = [x[i] for i in {}]".format(selected) for selected in select_list if len(selected) <= 3]
    return func_str_list

In [40]:
def get_permutations(group):
    min_length = group["input"].apply(len).min()
    permutations_list = list(itertools.permutations(range(min_length)))
    func_str_list = ["x = [x[i] for i in {}]".format(permutation) for permutation in permutations_list]
    return func_str_list

In [41]:
def join_list_to_string(group):
    input_string_list = group["output"].values.tolist()
    list_of_set_of_separators = [get_all_substrings(string) for string in input_string_list]
    set_of_separators = get_intersection_of_list_of_sets(list_of_set_of_separators)
    candidate_functions_list = ["x = \"{}\".join(x)".format(sep) for sep in list(set_of_separators) + [""] if len(sep) <= 2]
    return candidate_functions_list

In [42]:
def code(input_output_data_frame, is_log=False):
    input_output_data_frame["program"] = ""
    if is_log:
        meta_func_list = [split_string, get_select_combinations, join_list_to_string]
    else:
        meta_func_list = [split_string, get_select_combinations, get_permutations, join_list_to_string]
    for meta_func in meta_func_list:
        input_output_data_frame = apply_meta_func(input_output_data_frame, meta_func)
    correct_data_frame = input_output_data_frame[input_output_data_frame["input"] == input_output_data_frame["output"]]    
    return correct_data_frame

In [43]:
input_output_examples_list = [("aa bb cc", "bbmmccmmaa"), ("ll dd ff", "ddmmffmmll")]
input_output_data_frame = pd.DataFrame(input_output_examples_list, columns=["input", "output"])
input_output_data_frame

Unnamed: 0,input,output
0,aa bb cc,bbmmccmmaa
1,ll dd ff,ddmmffmmll


In [44]:
correct_data_frame = code(input_output_data_frame)
correct_data_frame["program"].drop_duplicates().apply(print)
correct_data_frame

split_string 1
split_string 1
get_select_combinations 14
get_select_combinations 14
get_permutations 1
get_permutations 1
get_permutations 2
get_permutations 1
get_permutations 2
get_permutations 2
get_permutations 6
get_permutations 1
get_permutations 2
get_permutations 6
get_permutations 2
get_permutations 1
get_permutations 2
get_permutations 1
get_permutations 1
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_string 3
join_list_to_st

Unnamed: 0,input,output,program
66,bbmmccmmaa,bbmmccmmaa,"\nx = x.split("" "")\nx = [x[i] for i in (-3, -2..."
67,ddmmffmmll,ddmmffmmll,"\nx = x.split("" "")\nx = [x[i] for i in (-3, -2..."
120,bbmmccmmaa,bbmmccmmaa,"\nx = x.split("" "")\nx = [x[i] for i in (0, 1, ..."
121,ddmmffmmll,ddmmffmmll,"\nx = x.split("" "")\nx = [x[i] for i in (0, 1, ..."


# logs

In [45]:
def get_max_length_common_string(string_1, string_2):
    union_list = [substring for substring in get_all_substrings(string_1) if substring in string_2]
    common_string = union_list[pd.Series(union_list).apply(len).argmax()]
    return common_string

In [46]:
def get_info_from_log(log_lines, output_strings_list):
    log_lines = [line.replace("\n", "") for line in log_lines]
    interesting_lines = []
    for line in log_lines:
        for output_string in output_strings_list:
            if output_string in line:        
                interesting_lines.append((line, output_string))
                
    common_string = get_max_length_common_string(interesting_lines[0][0], interesting_lines[1][0])
    
    common_string_no_numeric_trail = common_string[:(-pd.Series(list(common_string)).str.isnumeric().values[::-1].argmin())]
    
    input_output_data_frame = pd.DataFrame(interesting_lines, columns=["input", "output"])
    correct_data_frame = code(input_output_data_frame, True)
    
    possible_code_series = correct_data_frame["program"]
    
    possible_code_series.apply(print)
    
    inner_func = "\n".join(["\t" + line for line in possible_code_series.iloc[0].split("\n") if line != ""] )
    
    output_list = []
    for line in log_lines:
        if common_string_no_numeric_trail in line:
            d = {"x": line}
            exec(possible_code_series.iloc[0], d)            
            output_list.append(d["x"])
            
    func = """
output_list = []
for line in log_lines:
    if \"{}\" in line:
        x = line
{}
        output_list.append(x)
""".format(common_string_no_numeric_trail, inner_func)
    
    return output_list, func

In [47]:
file_path = r".\log_example.txt"
log_lines = open(file_path, "r").readlines()
log_lines

['Aug  1 18:27:45 knight sshd[20325]: Illegal user test from 218.49.183.17\n',
 'Aug  1 18:27:46 knight sshd[20325]: Failed password for illegal user test from 218.49.183.17 port 48849 ssh2\n',
 'Aug  1 18:27:46 knight sshd[20325]: error: Could not get shadow information for NOUSER\n',
 'Aug  1 18:27:48 knight sshd[20327]: Illegal user guest from 218.49.183.17\n',
 'Aug  1 18:27:49 knight sshd[20327]: Failed password for illegal user guest from 218.49.183.17 port 49090 ssh2\n',
 'Aug  1 18:27:49 knight sshd[20327]: error: Could not get shadow information for NOUSER\n',
 'Aug  1 18:27:52 knight sshd[20329]: Failed password for admin from 218.49.183.17 port 49266 ssh2\n',
 'Aug  1 18:27:56 knight sshd[20331]: Failed password for admin from 218.49.183.17 port 49468 ssh2\n',
 'Aug  1 18:27:58 knight sshd[20334]: Illegal user user from 218.49.183.17\n',
 'Aug  1 18:27:59 knight sshd[20334]: Failed password for illegal user user from 218.49.183.17 port 49680 ssh2\n',
 'Aug  1 18:27:59 knight

In [48]:
output_strings_list = ["port 40009", "port 48849"]

output_list, func = get_info_from_log(log_lines, output_strings_list)

display(output_list)
print(func)

split_string 760
split_string 760
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 1150
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 14
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6
get_select_combinations 6


KeyboardInterrupt: 