In [24]:
import string
import base64
import pandas as pd
import numpy as np

In [5]:
def pad(s):
    ans = []
    for i in range(4):
        t = 'b'*i + s.strip('=') 
        t1 = t + (4 - len(t)%4)*'b'
        ans.append(t1)
    return ans, len(t1)*3/4

In [6]:
def max_plain(s):
    padded_seq, length = pad(s)
    M = 0
    ans = None
    for p in padded_seq:
        r = base64.b64decode(p)
        num_plain_chars = length - str(r).count('\\x')
        if num_plain_chars>M:
            M = num_plain_chars
            ans = (p, r, M/length)
    return ans

In [7]:
def get_b64_segments(s, min_len=16):
    b64_chars = set([c for c in string.ascii_lowercase+string.ascii_uppercase+string.digits+'/'+'+'])
    mask = ''
    for i in range(len(s)):
        if s[i] in b64_chars:
            mask=mask+s[i]
        else:
            mask=mask+'_'
    ans = mask.split('_')
    ans = [e for e in ans if len(e)>min_len]
    return ans

In [8]:
def batch_max_plain(list_of_strings):
    perc=[0]
    for el in list_of_strings:
        perc.append(max_plain(el)[2])

    return max(perc)

In [9]:
def count_chars(s):
    import numpy as np
    c_u = 0
    c_l = 0
    c_s = 0
    count = 0
    for char in s:
        if char.isupper():
            c_u += 1
        elif char.islower():
            c_l+=1
        else:
            c_s+=1
    l = len(s)
    return np.array([c_u, c_l, c_s])/l

In [10]:
def cosine_similarity(a, b):
    return np.dot(a, b)/np.linalg.norm(a)/np.linalg.norm(b)

In [12]:
def is_base64(s):
    norm = np.array([0.40625, 0.40625, 0.1875])
    if cosine_similarity(count_chars(s), norm)>0.75:
        return True
    return False

In [13]:
def padded_decode(s):
    padded_seq, length = pad(s)
    ans = []
    for p in padded_seq:
        r = base64.b64decode(p)
        num_plain_chars = length - str(r).count('\\x')
        ans.append((p, r, num_plain_chars/length))
    return ans

In [14]:
memo=[]

def recursive_decoder(list_of_strings,parent=-1, level=0, tr = 0.8, memo=[]):
    if len(list_of_strings) ==0:
        return False
    if parent==-1:
        root = True
    else:
        root = False
    for ind, string in enumerate(list_of_strings):
        for segment in get_b64_segments(string):
            if is_base64(segment):
                #print('BASE64 DETECTED!:', segment)
                tmp = padded_decode(segment)
                new_strings = []
                if root:
                    parent = ind
                for t in tmp:
                    if t[2]>tr:
                        print('Found encoded text at level:', level)
                        memo.append({'encoded':str(t[0]),'decoded':str(t[1]), 'plain':t[2],  'level':level, 'parent':parent})
                    new_strings.append(str(t[1]))

                    
                recursive_decoder(list_of_strings = new_strings, parent=parent, level=level+1, tr=tr, memo = memo)
    return memo

In [30]:
# functions to add noise to b64 encoding

def char_remove(s):
    l = len(s)
    ind = np.random.randint(l)
    return s[0:ind]+s[ind+1:]


def char_add(s):
    l = len(s)
    ind = np.random.randint(l)
    chars = [c for c in string.ascii_lowercase+string.ascii_uppercase+string.digits+'/'+'+']
    return s[0:ind]+np.random.choice(chars)+s[ind:]


def char_replace(s):
    l = len(s)
    ind = np.random.randint(l)
    chars = [c for c in string.ascii_lowercase+string.ascii_uppercase+string.digits+'/'+'+']
    return s[0:ind]+np.random.choice(chars)+s[ind+1:]

def add_noise(s, p=0.1):
    s1 = s
    n = int(len(s)*p)
    actions = ['a','d', 'r']
    for i in range(n):
        a = np.random.choice(actions)
        if a=='a':
            s1 = char_add(s1)
        if a=='d':
            s1 = char_remove(s1)
        else:
            s1 = char_remove(s1)
    return s

In [31]:
wdal = '''In the dawn an angel was dancing
Surrounded by an aura of light
But in the shadows something was watching
And with patience awaiting the night
Angel whispers: "Mournful night, attractive night
Your dark beauty obsesses me"
An angel bewitched by the shadows
Seduced by the whispering lies
A spell was cast an the sky turned red
The angel's heart froze to ice
The blackness that falls is coming to stay
Under the snow lies angels so cold
Dusk has passed and a cold morning breeze
Is sweeping all over the plain
On the ground lies an angel with skin so pale
On her face an image of pain
Snow is now falling to the frozen ground
The angel is covered by white
Frost is spreading across the plain
To welcome the eternal night
The dress is white with crystals of ice
And frozen roses so red
Roses of blood from an innocent soul
On the plain lies an angel dead'''

In [32]:
wdal_enc = """SW4gdGhlIGRhd24gYW4gYW5nZWwgd2FzIGRhbmNpbmcKU3Vycm91bmRlZCBieSBhbiBhdXJhIG9mIGxpZ2h0CkJ1dCBpbiB0aGUgc2hhZG93cyBzb21ldGhpbmcgd2FzIHdhdGNoaW5nCkFuZCB3aXRoIHBhdGllbmNlIGF3YWl0aW5nIHRoZSBuaWdodApBbmdlbCB3aGlzcGVyczogIk1vdXJuZnVsIG5pZ2h0LCBhdHRyYWN0aXZlIG5pZ2h0CllvdXIgZGFyayBiZWF1dHkgb2JzZXNzZXMgbWUiCkFuIGFuZ2VsIGJld2l0Y2hlZCBieSB0aGUgc2hhZG93cwpTZWR1Y2VkIGJ5IHRoZSB3aGlzcGVyaW5nIGxpZXMKQSBzcGVsbCB3YXMgY2FzdCBhbiB0aGUgc2t5IHR1cm5lZCByZWQKVGhlIGFuZ2VsJ3MgaGVhcnQgZnJvemUgdG8gaWNlClRoZSBibGFja25lc3MgdGhhdCBmYWxscyBpcyBjb21pbmcgdG8gc3RheQpVbmRlciB0aGUgc25vdyBsaWVzIGFuZ2VscyBzbyBjb2xkCkR1c2sgaGFzIHBhc3NlZCBhbmQgYSBjb2xkIG1vcm5pbmcgYnJlZXplCklzIHN3ZWVwaW5nIGFsbCBvdmVyIHRoZSBwbGFpbgpPbiB0aGUgZ3JvdW5kIGxpZXMgYW4gYW5nZWwgd2l0aCBza2luIHNvIHBhbGUKT24gaGVyIGZhY2UgYW4gaW1hZ2Ugb2YgcGFpbgpTbm93IGlzIG5vdyBmYWxsaW5nIHRvIHRoZSBmcm96ZW4gZ3JvdW5kClRoZSBhbmdlbCBpcyBjb3ZlcmVkIGJ5IHdoaXRlCkZyb3N0IGlzIHNwcmVhZGluZyBhY3Jvc3MgdGhlIHBsYWluClRvIHdlbGNvbWUgdGhlIGV0ZXJuYWwgbmlnaHQKVGhlIGRyZXNzIGlzIHdoaXRlIHdpdGggY3J5c3RhbHMgb2YgaWNlCkFuZCBmcm96ZW4gcm9zZXMgc28gcmVkClJvc2VzIG9mIGJsb29kIGZyb20gYW4gaW5ub2NlbnQgc291bApPbiB0aGUgcGxhaW4gbGllcyBhbiBhbmdlbCBkZWFk"""

In [33]:
B = pd.DataFrame(recursive_decoder([wdal_enc]))

Found encoded text at level: 0


In [34]:
recursive_decoder([add_noise(s = wdal_enc, p=0.1)])

Found encoded text at level: 0


[{'encoded': 'SW4gdGhlIGRhd24gYW4gYW5nZWwgd2FzIGRhbmNpbmcKU3Vycm91bmRlZCBieSBhbiBhdXJhIG9mIGxpZ2h0CkJ1dCBpbiB0aGUgc2hhZG93cyBzb21ldGhpbmcgd2FzIHdhdGNoaW5nCkFuZCB3aXRoIHBhdGllbmNlIGF3YWl0aW5nIHRoZSBuaWdodApBbmdlbCB3aGlzcGVyczogIk1vdXJuZnVsIG5pZ2h0LCBhdHRyYWN0aXZlIG5pZ2h0CllvdXIgZGFyayBiZWF1dHkgb2JzZXNzZXMgbWUiCkFuIGFuZ2VsIGJld2l0Y2hlZCBieSB0aGUgc2hhZG93cwpTZWR1Y2VkIGJ5IHRoZSB3aGlzcGVyaW5nIGxpZXMKQSBzcGVsbCB3YXMgY2FzdCBhbiB0aGUgc2t5IHR1cm5lZCByZWQKVGhlIGFuZ2VsJ3MgaGVhcnQgZnJvemUgdG8gaWNlClRoZSBibGFja25lc3MgdGhhdCBmYWxscyBpcyBjb21pbmcgdG8gc3RheQpVbmRlciB0aGUgc25vdyBsaWVzIGFuZ2VscyBzbyBjb2xkCkR1c2sgaGFzIHBhc3NlZCBhbmQgYSBjb2xkIG1vcm5pbmcgYnJlZXplCklzIHN3ZWVwaW5nIGFsbCBvdmVyIHRoZSBwbGFpbgpPbiB0aGUgZ3JvdW5kIGxpZXMgYW4gYW5nZWwgd2l0aCBza2luIHNvIHBhbGUKT24gaGVyIGZhY2UgYW4gaW1hZ2Ugb2YgcGFpbgpTbm93IGlzIG5vdyBmYWxsaW5nIHRvIHRoZSBmcm96ZW4gZ3JvdW5kClRoZSBhbmdlbCBpcyBjb3ZlcmVkIGJ5IHdoaXRlCkZyb3N0IGlzIHNwcmVhZGluZyBhY3Jvc3MgdGhlIHBsYWluClRvIHdlbGNvbWUgdGhlIGV0ZXJuYWwgbmlnaHQKVGhlIGRyZXNzIGlzIHdoaXRlIH

In [35]:
add_noise(s = wdal_enc, p=0.1)

'SW4gdGhlIGRhd24gYW4gYW5nZWwgd2FzIGRhbmNpbmcKU3Vycm91bmRlZCBieSBhbiBhdXJhIG9mIGxpZ2h0CkJ1dCBpbiB0aGUgc2hhZG93cyBzb21ldGhpbmcgd2FzIHdhdGNoaW5nCkFuZCB3aXRoIHBhdGllbmNlIGF3YWl0aW5nIHRoZSBuaWdodApBbmdlbCB3aGlzcGVyczogIk1vdXJuZnVsIG5pZ2h0LCBhdHRyYWN0aXZlIG5pZ2h0CllvdXIgZGFyayBiZWF1dHkgb2JzZXNzZXMgbWUiCkFuIGFuZ2VsIGJld2l0Y2hlZCBieSB0aGUgc2hhZG93cwpTZWR1Y2VkIGJ5IHRoZSB3aGlzcGVyaW5nIGxpZXMKQSBzcGVsbCB3YXMgY2FzdCBhbiB0aGUgc2t5IHR1cm5lZCByZWQKVGhlIGFuZ2VsJ3MgaGVhcnQgZnJvemUgdG8gaWNlClRoZSBibGFja25lc3MgdGhhdCBmYWxscyBpcyBjb21pbmcgdG8gc3RheQpVbmRlciB0aGUgc25vdyBsaWVzIGFuZ2VscyBzbyBjb2xkCkR1c2sgaGFzIHBhc3NlZCBhbmQgYSBjb2xkIG1vcm5pbmcgYnJlZXplCklzIHN3ZWVwaW5nIGFsbCBvdmVyIHRoZSBwbGFpbgpPbiB0aGUgZ3JvdW5kIGxpZXMgYW4gYW5nZWwgd2l0aCBza2luIHNvIHBhbGUKT24gaGVyIGZhY2UgYW4gaW1hZ2Ugb2YgcGFpbgpTbm93IGlzIG5vdyBmYWxsaW5nIHRvIHRoZSBmcm96ZW4gZ3JvdW5kClRoZSBhbmdlbCBpcyBjb3ZlcmVkIGJ5IHdoaXRlCkZyb3N0IGlzIHNwcmVhZGluZyBhY3Jvc3MgdGhlIHBsYWluClRvIHdlbGNvbWUgdGhlIGV0ZXJuYWwgbmlnaHQKVGhlIGRyZXNzIGlzIHdoaXRlIHdpdGggY3J5c3R

In [36]:
add_noise(s = wdal_enc, p=0.1)

'SW4gdGhlIGRhd24gYW4gYW5nZWwgd2FzIGRhbmNpbmcKU3Vycm91bmRlZCBieSBhbiBhdXJhIG9mIGxpZ2h0CkJ1dCBpbiB0aGUgc2hhZG93cyBzb21ldGhpbmcgd2FzIHdhdGNoaW5nCkFuZCB3aXRoIHBhdGllbmNlIGF3YWl0aW5nIHRoZSBuaWdodApBbmdlbCB3aGlzcGVyczogIk1vdXJuZnVsIG5pZ2h0LCBhdHRyYWN0aXZlIG5pZ2h0CllvdXIgZGFyayBiZWF1dHkgb2JzZXNzZXMgbWUiCkFuIGFuZ2VsIGJld2l0Y2hlZCBieSB0aGUgc2hhZG93cwpTZWR1Y2VkIGJ5IHRoZSB3aGlzcGVyaW5nIGxpZXMKQSBzcGVsbCB3YXMgY2FzdCBhbiB0aGUgc2t5IHR1cm5lZCByZWQKVGhlIGFuZ2VsJ3MgaGVhcnQgZnJvemUgdG8gaWNlClRoZSBibGFja25lc3MgdGhhdCBmYWxscyBpcyBjb21pbmcgdG8gc3RheQpVbmRlciB0aGUgc25vdyBsaWVzIGFuZ2VscyBzbyBjb2xkCkR1c2sgaGFzIHBhc3NlZCBhbmQgYSBjb2xkIG1vcm5pbmcgYnJlZXplCklzIHN3ZWVwaW5nIGFsbCBvdmVyIHRoZSBwbGFpbgpPbiB0aGUgZ3JvdW5kIGxpZXMgYW4gYW5nZWwgd2l0aCBza2luIHNvIHBhbGUKT24gaGVyIGZhY2UgYW4gaW1hZ2Ugb2YgcGFpbgpTbm93IGlzIG5vdyBmYWxsaW5nIHRvIHRoZSBmcm96ZW4gZ3JvdW5kClRoZSBhbmdlbCBpcyBjb3ZlcmVkIGJ5IHdoaXRlCkZyb3N0IGlzIHNwcmVhZGluZyBhY3Jvc3MgdGhlIHBsYWluClRvIHdlbGNvbWUgdGhlIGV0ZXJuYWwgbmlnaHQKVGhlIGRyZXNzIGlzIHdoaXRlIHdpdGggY3J5c3R

In [37]:
wdal_noisy_enc='''U1c0Z2RHaGxJR1JoZDI0Z1lXNGdZVzVuWld3Z2QyRnpJR1JoYm1OcGJtY0tVM1Z5Y205MWJtUmxaQ0JpZVNCaGJpQmhkWEpoSUc5bUlHeHBaMmgwQ2tKMWRDQnBiaUIwYUdVZ2MyaGhaRzkzY3lCemIyMWxkR2hwYm1jZ2QyRnpJSGRoZEdOb2FXNW5Da0Z1WkNCM2FYUm9JSEJoZEdsbGJtTmxJR0YzWVdsMGFXNW5JSFJvWlNCdWFXZG9kQXBCYm1kbGJDQjNhR2x6Y0dWeWN6b2dJazF2ZFhKdVpuVnNJRzVwWjJoMExDQmhkSFJ5WVdOMGFYWmxJRzVwWjJoMENsbHZkWElnWkdGeWF5QmlaV0YxZEhrZ2IySnpaWE56WlhNZ2JXVWlDa0Z1SUdGdVoyVnNJR0psZDJsMFkyaGxaQ0JpZVNCMGFHVWdjMmhoWkc5M2N3cFRaV1IxWTJWa0lHSjVJSFJvWlNCM2FHbHpjR1Z5YVc1bklHeHBaWE1LUVNCemNHVnNiQ0IzWVhNZ1kyRnpkQ0JoYmlCMGFHVWdjMnQ1SUhSMWNtNWxaQ0J5WldRS1ZHaGxJR0Z1WjJWc0ozTWdhR1ZoY25RZ1puSnZlbVVnZEc4Z2FXTmxDbFJvWlNCaWJHRmphMjVsYzNNZ2RHaGhkQ0JtWVd4c2N5QnBjeUJqYjIxcGJtY2dkRzhnYzNSaGVRcFZibVJsY2lCMGFHVWdjMjV2ZHlCc2FXVnpJR0Z1WjJWc2N5QnpieUJqYjJ4a0NrUjFjMnNnYUdGeklIQmhjM05sWkNCaGJtUWdZU0JqYjJ4a0lHMXZjbTVwYm1jZ1luSmxaWHBsQ2tseklITjNaV1Z3YVc1bklHRnNiQ0J2ZG1WeUlIUm9aU0J3YkdGcGJncFBiaUIwYUdVZ1ozSnZkVzVrSUd4cFpYTWdZVzRnWVc1blpXd2dkMmwwYUNCemEybHVJSE52SUhCaGJHVUtUMjRnYUdWeUlHWmhZMlVnWVc0Z2FXMWhaMlVnYjJZZ2NHRnBiZ3BUYm05M0lHbHpJRzV2ZHlCbVlXeHNhVzVuSUhSdklIUm9aU0JtY205NlpXNGdaM0p2ZFc1a0NsUm9aU0JoYm1kbGJDQnBjeUJqYjNabGNtVmtJR0o1SUhkb2FYUmxDa1p5YjNOMElHbHpJSE53Y21WaFpHbHVaeUJoWTNKdmMzTWdkR2hsSUhCc1lXbHVDbFJ2SUhkbGJHTnZiV1VnZEdobElHVjBaWEp1WVd3Z2JtbG5hSFFLVkdobElHUnlaWE56SUdseklIZG9hWFJsSUhkcGRHZ2dZM0o1YzNSaGJITWdiMllnYVdObENrRnVaQ0JtY205NlpXNGdjbTl6WlhNZ2MyOGdjbVZrQ2xKdmMyVnpJRzltSUdKc2IyOWtJR1p5YjIwZ1lXNGdhVzV1YjJObGJuUWdjMjkxYkFwUGJpQjBhR1VnY0d4aGFXNGdiR2xsY3lCaGJpQmhibWRsYkNCa1pXRms='''

In [38]:
recursive_decoder([add_noise(s=wdal_noisy_enc, p=0.1)])

Found encoded text at level: 0
Found encoded text at level: 1


[{'encoded': 'SW4gdGhlIGRhd24gYW4gYW5nZWwgd2FzIGRhbmNpbmcKU3Vycm91bmRlZCBieSBhbiBhdXJhIG9mIGxpZ2h0CkJ1dCBpbiB0aGUgc2hhZG93cyBzb21ldGhpbmcgd2FzIHdhdGNoaW5nCkFuZCB3aXRoIHBhdGllbmNlIGF3YWl0aW5nIHRoZSBuaWdodApBbmdlbCB3aGlzcGVyczogIk1vdXJuZnVsIG5pZ2h0LCBhdHRyYWN0aXZlIG5pZ2h0CllvdXIgZGFyayBiZWF1dHkgb2JzZXNzZXMgbWUiCkFuIGFuZ2VsIGJld2l0Y2hlZCBieSB0aGUgc2hhZG93cwpTZWR1Y2VkIGJ5IHRoZSB3aGlzcGVyaW5nIGxpZXMKQSBzcGVsbCB3YXMgY2FzdCBhbiB0aGUgc2t5IHR1cm5lZCByZWQKVGhlIGFuZ2VsJ3MgaGVhcnQgZnJvemUgdG8gaWNlClRoZSBibGFja25lc3MgdGhhdCBmYWxscyBpcyBjb21pbmcgdG8gc3RheQpVbmRlciB0aGUgc25vdyBsaWVzIGFuZ2VscyBzbyBjb2xkCkR1c2sgaGFzIHBhc3NlZCBhbmQgYSBjb2xkIG1vcm5pbmcgYnJlZXplCklzIHN3ZWVwaW5nIGFsbCBvdmVyIHRoZSBwbGFpbgpPbiB0aGUgZ3JvdW5kIGxpZXMgYW4gYW5nZWwgd2l0aCBza2luIHNvIHBhbGUKT24gaGVyIGZhY2UgYW4gaW1hZ2Ugb2YgcGFpbgpTbm93IGlzIG5vdyBmYWxsaW5nIHRvIHRoZSBmcm96ZW4gZ3JvdW5kClRoZSBhbmdlbCBpcyBjb3ZlcmVkIGJ5IHdoaXRlCkZyb3N0IGlzIHNwcmVhZGluZyBhY3Jvc3MgdGhlIHBsYWluClRvIHdlbGNvbWUgdGhlIGV0ZXJuYWwgbmlnaHQKVGhlIGRyZXNzIGlzIHdoaXRlIH