In [1]:
import itertools
import pandas as pd
import numpy as np
from IPython.display import display

# each input-output-program in single row

In [3]:
def apply_func(x, func):
    d = {"x": x}
    exec(func, d)
    return d["x"]

def apply_func_on_group(group, func):
    group_2 = group.copy()
    group_2["input"] = group["input"].apply(apply_func, args=(func,))
    group_2["program"] += "\n" + func
    return group_2

def apply_meta_func_on_group(group, meta_func):
    func_list = meta_func(group)
#     print(meta_func.__name__, len(func_list))
    updated_group = pd.concat([apply_func_on_group(group, func) for func in func_list])
    return updated_group

def apply_meta_func(input_output_data_frame, meta_func):
    updated_input_output_data_frame = input_output_data_frame.groupby("program").apply(apply_meta_func_on_group, meta_func).reset_index(drop=True)
    return updated_input_output_data_frame

In [4]:
def get_all_substrings(input_string):
    length = len(input_string)
    return set([input_string[i:j+1] for i in range(length) for j in range(i,length)])

In [5]:
def get_intersection_of_list_of_sets(list_of_sets):
    set_0 = list_of_sets[0]
    for a_set in list_of_sets[1:]:
        set_0 = set_0.intersection(a_set)
    return set_0

In [6]:
def split_string(group):
    input_string_list = group["input"].values.tolist()
    list_of_set_of_separators = [get_all_substrings(string) for string in input_string_list]
    set_of_separators = get_intersection_of_list_of_sets(list_of_set_of_separators)
    candidate_functions_list = list(set(["x = x.split(\"{}\")".format(sep) for sep in set_of_separators]))
    return candidate_functions_list

In [69]:
def get_select_combinations(group):
    min_length = group["input"].apply(len).min()
    list_of_list = [list(itertools.combinations(range(min_length), k)) for k in range(1, min_length + 1)]
    reverse_list_of_list = [list(itertools.combinations(np.linspace(-min_length, -1, min_length).astype(int), k)) for k in range(1, min_length + 1)]
    select_list = list(itertools.chain.from_iterable(list_of_list + reverse_list_of_list))
    func_str_list = []
    for selected in select_list:
        if len(selected) <= 3 and np.ptp(selected) < 3:
            if len(selected) > -1:
                func = "x = [x[i] for i in {}]".format(selected)
            else:
                func = "x = x[{}]".format(selected[0])
            func_str_list.append(func)
    return func_str_list

In [70]:
def get_permutations(group):
    min_length = group["input"].apply(len).min()
    permutations_list = list(itertools.permutations(range(min_length)))
    func_str_list = ["x = [x[i] for i in {}]".format(permutation) for permutation in permutations_list]
    return func_str_list

In [71]:
def join_list_to_string(group):
    input_string_list = group["output"].values.tolist()
    list_of_set_of_separators = [get_all_substrings(string) for string in input_string_list]
    set_of_separators = get_intersection_of_list_of_sets(list_of_set_of_separators)
    candidate_functions_list = ["x = \"{}\".join(x)".format(sep) for sep in list(set_of_separators) + [""] if len(sep) <= 2]
    return candidate_functions_list

In [72]:
def is_program_correct_on_all_examples(data_frame):
    return (data_frame["input"] == data_frame["output"]).all()

def get_correct_programs(data_frame):
    correctness = pd.DataFrame(data_frame.groupby("program").apply(get_correct_on_all_examples)).reset_index()
    correct_programs = correctness[correctness[0]]["program"]
    return correct_programs

def code(input_output_data_frame, is_log=False):
    input_output_data_frame["program"] = ""
    if is_log:
        meta_func_list = [split_string, get_select_combinations, join_list_to_string]
    else:
        meta_func_list = [split_string, get_select_combinations, get_permutations, join_list_to_string]
    for meta_func in meta_func_list:
        input_output_data_frame = apply_meta_func(input_output_data_frame, meta_func)
        
    correct_programs = get_correct_programs(input_output_data_frame)
    return correct_programs

In [73]:
input_output_examples_list = [("aa bb cc", "bbmmccmmaa"), ("ll dd ff", "ddmmffmmll")]
input_output_data_frame = pd.DataFrame(input_output_examples_list, columns=["input", "output"])
input_output_data_frame

Unnamed: 0,input,output
0,aa bb cc,bbmmccmmaa
1,ll dd ff,ddmmffmmll


In [74]:
correct_series = code(input_output_data_frame)
correct_data_frame.apply(print)
correct_series


x = x.split(" ")
x = [x[i] for i in (-3, -2, -1)]
x = [x[i] for i in (1, 2, 0)]
x = "mm".join(x)

x = x.split(" ")
x = [x[i] for i in (0, 1, 2)]
x = [x[i] for i in (1, 2, 0)]
x = "mm".join(x)


35    \nx = x.split(" ")\nx = [x[i] for i in (-3, -2...
62    \nx = x.split(" ")\nx = [x[i] for i in (0, 1, ...
Name: program, dtype: object

# logs

In [139]:
def get_max_length_common_string(string_1, string_2):
    union_list = [substring for substring in get_all_substrings(string_1) if substring in string_2]
    common_string = union_list[pd.Series(union_list).apply(len).argmax()]
    return common_string

def get_max_length_common_string_of_list(string_list):
    common_string = string_list[0]
    for string_2 in string_list[1:]:
        common_string = get_max_length_common_string(common_string, string_2)
    return common_string

def get_function(input_output_data_frame):
    prefix_list = []
    suffix_list = []
    for index, row in input_output_data_frame.iterrows():
        sub_string = row["output"]
        line = row["input"]
        start_index = line.find(sub_string)
        end_index = start_index + len(sub_string)
        prefix = line[:start_index][-15:]
        suffix = line[end_index:][:15]
        prefix_list.append(prefix)
        suffix_list.append(suffix)
    common_prefix = get_max_length_common_string_of_list(prefix_list)
    common_suffix = get_max_length_common_string_of_list(suffix_list)
    func_str = "x = x.split(\"{}\")[1].split(\"{}\")[0]".format(common_prefix, common_suffix)
    return func_str

In [140]:
def get_info_from_log(log_lines, output_strings_list):
    log_lines = [line.replace("\n", "") for line in log_lines]
    interesting_lines = []
    for line in log_lines:
        for output_string in output_strings_list:
            if output_string in line:        
                interesting_lines.append((line, output_string))
                
    common_string = get_max_length_common_string(interesting_lines[0][0], interesting_lines[1][0])
    
    common_string_no_numeric_trail = common_string[:(-pd.Series(list(common_string)).str.isnumeric().values[::-1].argmin())]
    
    input_output_data_frame = pd.DataFrame(interesting_lines, columns=["input", "output"])

    func_str = get_function(input_output_data_frame)
    
#     inner_func = "\n".join(["\t" + line for line in possible_code_series.iloc[0].split("\n") if line != ""] )
    
    output_list = []
    for line in log_lines:
        if common_string_no_numeric_trail in line:
            d = {"x": line}
            exec(func_str, d)            
            output_list.append(d["x"])
            
    func = """
output_list = []
for line in log_lines:
    if \"{}\" in line:
        x = line
        {}
        output_list.append(x)
""".format(common_string_no_numeric_trail, func_str)
    
    return output_list, func

In [141]:
file_path = r".\log_example.txt"
log_lines = open(file_path, "r").readlines()
log_lines

['Aug  1 18:27:45 knight sshd[20325]: Illegal user test from 218.49.183.17\n',
 'Aug  1 18:27:46 knight sshd[20325]: Failed password for illegal user test from 218.49.183.17 port 48849 ssh2\n',
 'Aug  1 18:27:46 knight sshd[20325]: error: Could not get shadow information for NOUSER\n',
 'Aug  1 18:27:48 knight sshd[20327]: Illegal user guest from 218.49.183.17\n',
 'Aug  1 18:27:49 knight sshd[20327]: Failed password for illegal user guest from 218.49.183.17 port 49090 ssh2\n',
 'Aug  1 18:27:49 knight sshd[20327]: error: Could not get shadow information for NOUSER\n',
 'Aug  1 18:27:52 knight sshd[20329]: Failed password for admin from 218.49.183.17 port 49266 ssh2\n',
 'Aug  1 18:27:56 knight sshd[20331]: Failed password for admin from 218.49.183.17 port 49468 ssh2\n',
 'Aug  1 18:27:58 knight sshd[20334]: Illegal user user from 218.49.183.17\n',
 'Aug  1 18:27:59 knight sshd[20334]: Failed password for illegal user user from 218.49.183.17 port 49680 ssh2\n',
 'Aug  1 18:27:59 knight

In [147]:
output_strings_list = ["port 40009", "port 48849"]

output_list, func_str = get_info_from_log(log_lines, output_strings_list)

display(output_list)
print(func_str)

['port 48849',
 'port 49090',
 'port 49266',
 'port 49468',
 'port 49680',
 'port 49869',
 'port 50063',
 'port 50245',
 'port 50671',
 'port 52244',
 'port 52416',
 'port 52558',
 'port 52818',
 'port 52851',
 'port 53014',
 'port 53040',
 'port 53192',
 'port 53230',
 'port 53404',
 'port 53425',
 'port 53571',
 'port 53615',
 'port 54033',
 'port 54078',
 'port 54243',
 'port 54285',
 'port 54423',
 'port 39604',
 'port 39811',
 'port 40009',
 'port 40217',
 'port 40470',
 'port 40973',
 'port 41159',
 'port 41541',
 'port 41630 ssh']


output_list = []
for line in log_lines:
    if " from 218.49.183.17 port " in line:
        x = line
        x = x.split(" 218.49.183.17 ")[1].split(" ssh2")[0]
        output_list.append(x)



In [150]:
def apply_func_on_log_lines(log_lines, func_str):
    log_lines = [line.replace("\n", "") for line in log_lines]
    d = {"log_lines": log_lines}
    exec(func_str, d)
    return d["output_list"]

In [151]:
apply_func_on_log_lines(log_lines, func_str)

['port 48849',
 'port 49090',
 'port 49266',
 'port 49468',
 'port 49680',
 'port 49869',
 'port 50063',
 'port 50245',
 'port 50671',
 'port 52244',
 'port 52416',
 'port 52558',
 'port 52818',
 'port 52851',
 'port 53014',
 'port 53040',
 'port 53192',
 'port 53230',
 'port 53404',
 'port 53425',
 'port 53571',
 'port 53615',
 'port 54033',
 'port 54078',
 'port 54243',
 'port 54285',
 'port 54423',
 'port 39604',
 'port 39811',
 'port 40009',
 'port 40217',
 'port 40470',
 'port 40973',
 'port 41159',
 'port 41541',
 'port 41630 ssh']

In [188]:
output_script = \
"""import sys

input_file_path = sys.argv[1]
output_file_path = sys.argv[2]

log_lines = open(input_file_path, "r").readlines()
{}
open(output_file_path, "w").write("\\n".join(output_list))
print(\"done writing\", output_file_path)""".format(func_str)

In [189]:
print(output_script)

import sys

input_file_path = sys.argv[1]
output_file_path = sys.argv[2]

log_lines = open(input_file_path, "r").readlines()

output_list = []
for line in log_lines:
    if " from 218.49.183.17 port " in line:
        x = line
        x = x.split(" 218.49.183.17 ")[1].split(" ssh2")[0]
        output_list.append(x)

open(output_file_path, "w").write("\n".join(output_list))
print("done writing", output_file_path)


In [190]:
output_script_file_path = r".\example_output_script.py"

In [191]:
open(output_script_file_path, "w").write(output_script)

415

In [192]:
%run -i .\example_output_script .\log_example.txt .\log_output_example.txt

done writing .\log_output_example.txt
