In [1]:
# This class runs random texts through itself and "normalizes" them
# according to certain ad hoc parsing rules that make ngram-statistics
# more robust.
#
# The goal is obviously to modify the text as little as possible.

class TextNormalizer:
    """Reduce a text to letters, single spaces and lone punctuation marks.

    The rules are deliberately ad hoc; they exist to make ngram statistics
    more robust while changing the text as little as possible.
    """

    def __init__(self, text: str):
        # The raw text; it is only read by normalize(), never modified.
        self.text = text

    def normalize(self):
        """Return the normalized form of ``self.text``.

        Rules, applied character by character:

        * a punctuation mark from ``.,:;-'`` is kept unless the character
          right before it in the raw text was also one of those marks;
        * any other character is kept only if ``str.isalpha()`` is true
          for it (so digits, whitespace and other symbols are dropped);
        * every dropped character becomes a space, runs of spaces are then
          collapsed to one, and the ends are trimmed.
        """
        punctuation = set(".,:;-'")

        pieces = [ ]
        previous = None

        for ch in self.text:
            if ch in punctuation:
                keep = previous not in punctuation
            else:
                keep = ch.isalpha()

            pieces.append(ch if keep else " ")
            # Compare against the raw predecessor, not the emitted one.
            previous = ch

        # At this point a space is the only separator left in the text, so
        # dropping the empty fields both collapses runs and trims the ends.
        words = [ word for word in "".join(pieces).split(" ") if word ]

        return " ".join(words)

In [2]:
# These are 100,000 sentences grabbed from news media in 2023.
# Each line is prefixed with a line number. Those prefixes are removed
# automatically, because TextNormalizer replaces every non-letter
# character (digits included) with a space.

# NOTE(review): no encoding is passed to open(), so the platform default
# is used; confirm it matches the encoding of the corpus file.
with open("100K_sentences.txt") as f:
    text = f.readlines()

# Join the lines back into one string and normalize the whole corpus at once.
normalized_corpus = TextNormalizer("".join(text)).normalize()

In [3]:
# This cell does the ngram statistics. It's really not hard.
# It can do any ngram, but I'm only using uni- and bigrams.

from collections import Counter

def grams(text: str, N: int, keep=None):
    """Count every n-gram of length 1 through ``N`` that occurs in ``text``.

    Each gram length is counted over *all* start positions of the text, so
    the first and last characters contribute to the statistics as well.

    Parameters
    ----------
    text : str
        The text to collect statistics from.
    N : int
        The longest gram length to count.
    keep : callable, optional
        Predicate called with each gram; only grams for which it returns a
        true value are counted (for example ``str.isalpha``). With the
        default ``None`` every gram is counted.

    Returns
    -------
    list of collections.Counter
        ``N`` counters. Entry ``i`` holds the grams of length ``N - i``, so
        index 0 is the longest gram and index ``N - 1`` the single characters.
    """
    grams_tables = [ ]

    for i in range(N):
        size = N - i

        # A text shorter than `size` gives an empty range and thus an
        # empty counter.
        candidates = (text[start:start + size]
                      for start in range(len(text) - size + 1))

        if keep is not None:
            candidates = filter(keep, candidates)

        grams_tables.append(Counter(candidates))

    return grams_tables

In [4]:
# Here's where I process the raw ngram frequency data.
# One of my model parameters is the CDF threshold. I don't
# care about the long-tail ngrams, because there's too
# much noise in there.

MAX_GRAM = 2
CDF_THRESHOLD = .75

gt = grams(normalized_corpus, N=MAX_GRAM)

# grams_tmp[i]: (gram, relative frequency, running cumulative frequency)
# for every gram kept from table i.
# grams_list:   (gram, relative frequency) pairs of all tables, flattened.
grams_tmp = [ ]
grams_list = [ ]

for counter in gt:
    total = counter.total()
    acc = 0
    kept = [ ]

    # Walk the grams from most to least common and keep them until the
    # cumulative relative frequency has reached the threshold. The gram
    # that crosses the threshold is still included.
    for k, c in counter.most_common():
        if acc >= CDF_THRESHOLD:
            break

        acc = acc + c / total
        kept.append((k, c / total, acc))

    grams_tmp.append(kept)
    grams_list.extend((k, f) for k, f, _ in kept)

# Despite the name, this maps gram -> relative frequency within its table.
grams_cumulative = dict(grams_list)

In [5]:
# So this is the first 9000+ characters from the Wikipedia article about
# Taylor Swift. This was copied using mouse select of text in Firefox.
# No other processing.

plain_text_formatted = """
Taylor Alison Swift (born December 13, 1989) is an American singer-songwriter. A subject of widespread public interest with a vast fanbase, she has influenced the music industry, popular culture and politics through her songwriting, artistry, entrepreneurship, and advocacy.

Swift began professional songwriting at age 14. She signed with Big Machine Records in 2005 and achieved prominence as a country pop singer with the albums Taylor Swift (2006) and Fearless (2008). Their singles "Teardrops on My Guitar", "Love Story", and "You Belong with Me" were crossover successes on country and pop radio formats and brought Swift mainstream fame. She experimented with rock and electronic styles on her next albums, Speak Now (2010) and Red (2012), respectively; Red featured her first Billboard Hot 100 number-one single, "We Are Never Ever Getting Back Together". Swift recalibrated her image from country to pop with 1989 (2014), a synth-pop album containing the chart-topping songs "Shake It Off", "Blank Space", and "Bad Blood". Media scrutiny inspired the hip-hop-influenced Reputation (2017) and its number-one single "Look What You Made Me Do".

After signing with Republic Records in 2018, Swift released the eclectic pop album Lover (2019) and the autobiographical documentary Miss Americana (2020). She explored indie folk styles on the 2020 albums Folklore and Evermore, subdued electropop on Midnights (2022), and re-recorded four albums subtitled Taylor's Version[a] after a dispute with Big Machine. These albums spawned the number-one songs "Cruel Summer", "Cardigan", "Willow", "Anti-Hero", "All Too Well", and "Is It Over Now?". Her Eras Tour (2023–2024) and its accompanying concert film became the highest-grossing tour and concert film of all time, respectively. Her 2024 record, The Tortured Poets Department, was a double album. Swift has directed videos and films such as Folklore: The Long Pond Studio Sessions (2020) and All Too Well: The Short Film (2021), and has also acted in films.

Swift is one of the world's best-selling artists, with 200 million records sold worldwide as of 2019. She is the most-streamed artist on Spotify, the highest-grossing female touring act, and the first billionaire with music as the main source of income. Seven of her albums have opened with over one million sales in a week. The 2023 Time Person of the Year, Swift has appeared on lists such as Rolling Stone's 100 Greatest Songwriters of All Time, Billboard's Greatest of All Time Artists, and Forbes' World's 100 Most Powerful Women. Her accolades include 14 Grammy Awards, a Primetime Emmy Award, 40 American Music Awards, 39 Billboard Music Awards, and 23 MTV Video Music Awards; she has won the Grammy Award for Album of the Year, the MTV Video Music Award for Video of the Year, and the IFPI Global Recording Artist of the Year a record four times each.
Life and career
Early life

Taylor Alison Swift was born on December 13, 1989, in West Reading, Pennsylvania.[1] She is named after the singer-songwriter James Taylor.[2] Her father, Scott Kingsley Swift, is a former stockbroker for Merrill Lynch; her mother, Andrea Gardner Swift (née Finlay), worked for a time as a mutual fund marketing executive.[3] Her younger brother, Austin, is an actor.[4] Swift's maternal grandmother, Marjorie Finlay (née Moehlenkamp), was an opera singer,[5] whose singing in church became one of Swift's earliest memories of music that shaped her career.[3] Swift's mother is of Scottish and German descent, and her father is of Scottish and English descent with distant Italian ancestry.[6][7]

Swift spent her early years on a Christmas tree farm in Pennsylvania that her father had purchased from one of his clients,[8] and she spent her summers at her family's vacation home in Stone Harbor, New Jersey, where she occasionally performed acoustic songs at a local coffee shop.[9] She is a Christian[10] and attended preschool and kindergarten at a Montessori school run by the Bernardine Sisters of St. Francis before transferring to the Wyndcroft School.[11][12] When her family moved to Wyomissing, Pennsylvania, she attended Wyomissing Area Junior/Senior High School.[13][14] As a child, she performed in Berks Youth Theatre Academy productions[15] and traveled regularly to New York City for vocal and acting lessons.[16] Her early love for country music was influenced by Shania Twain, Patsy Cline, LeAnn Rimes, and the Dixie Chicks,[12] and she spent weekends performing at local festivals and events.[17][18] After watching a documentary about Faith Hill, she became determined to pursue a country-music career in Nashville, Tennessee.[19]

At 11, Swift traveled to Nashville with her mother to visit record labels and submit demo tapes of Dolly Parton and Dixie Chicks karaoke covers.[20] She was rejected by all the labels, which led her to focus on songwriting.[21] She started learning the guitar at 12 with the help of Ronnie Cremer, a computer repairman and local musician who also assisted Swift with writing an original song.[22] In 2003, Swift and her parents started working with the talent manager Dan Dymtrow. With his help, Swift modeled for Abercrombie & Fitch and had an original song included on a Maybelline compilation CD.[23] After performing original songs at an RCA Records showcase, 13-year-old Swift was given an artist development deal and began to travel regularly to Nashville with her mother.[24][25] To help Swift break into the country music scene, her father transferred to Merrill Lynch's Nashville office when she was 14 years old, and the family relocated to Hendersonville, Tennessee.[26][27] Swift attended Hendersonville High School[28] before transferring to Aaron Academy after two years, which better accommodated her touring schedule through homeschooling. She graduated one year early.[29][30]
2004–2008: Career beginnings and first album

In Nashville, Swift worked with experienced Music Row songwriters such as Troy Verges, Brett Beavers, Brett James, Mac McAnally, and the Warren Brothers[31][32] and formed a lasting working relationship with Liz Rose.[33] They began meeting for two-hour writing sessions every Tuesday afternoon after school.[34] Rose called the sessions "some of the easiest I've ever done. Basically, I was just her editor. She'd write about what happened in school that day. She had such a clear vision of what she was trying to say. And she'd come in with the most incredible hooks." Swift became the youngest artist signed by the Sony/ATV Tree publishing house,[35] but left then BMG-owned RCA Records (later bought by Sony Music) at the age of 14 due to the label's lack of care and them "cut[ting] other people's stuff". She was also concerned that development deals can shelve artists[25][18] and recalled: "I genuinely felt that I was running out of time. I wanted to capture these years of my life on an album while they still represented what I was going through."[36]
Taylor Swift singing on a microphone and playing a guitar
Swift opening for Brad Paisley in 2007. To promote her first album, she opened tours for other country musicians in 2007 and 2008.[37]

At an industry showcase at Nashville's Bluebird Cafe in 2005, Swift caught the attention of Scott Borchetta, a DreamWorks Records executive who was preparing to form an independent record label, Big Machine Records. She had first met Borchetta in 2004.[38] She was one of Big Machine's first signings,[25] and her father purchased a three-percent stake in the company for an estimated $120,000.[39][40] She began working on her eponymous debut album with Nathan Chapman.[18] Swift wrote or co-wrote all album tracks, and co-writers included Rose, Robert Ellis Orrall, Brian Maher, and Angelo Petraglia.[41] Released in October 2006, Taylor Swift peaked at number five on the US Billboard 200, on which it spent 157 weeks—the longest stay on the chart by any release in the US in the 2000s decade.[42][43] Swift became the first female country music artist to write or co-write every track on a US platinum-certified debut album.[44]

Big Machine Records was still in its infancy during the June 2006 release of the lead single, "Tim McGraw", which Swift and her mother helped promote by packaging and sending copies of the CD single to country radio stations.[45] She spent much of 2006 promoting Taylor Swift with a radio tour and television appearances; she opened for Rascal Flatts on select dates during their 2006 tour,[46] as a replacement for Eric Church.[47] Borchetta said that although record industry peers initially disapproved of his signing a 15-year-old singer-songwriter, Swift tapped into a previously unknown market—teenage girls who listen to country music.[45][26]

Following "Tim McGraw", four more singles were released throughout 2007 and 2008: "Teardrops on My Guitar", "Our Song", "Picture to Burn" and "Should've Said No". All appeared on Billboard's Hot Country Songs, with "Our Song" and "Should've Said No" reaching number one. "Our Song" made Swift the youngest person to single-handedly write and sing a Hot Country Songs number-one single,[48] and "Teardrops on My Guitar" was Swift's breakthrough single on mainstream radio and charts.[49][50][51] Swift released two EPs, The Taylor Swift Holiday Collection in October 2007 and Beautiful Eyes in July 2008.[52][53] She promoted her debut album extensively as the opening act for other country musicians' tours in 2006 and 2007, including those by George Strait,[54] Brad Paisley,[55] and Tim McGraw and Faith Hill.[56]

Swift won multiple accolades for Taylor Swift. She was one of the recipients of the Nashville Songwriters Association's Songwriter/Artist of the Year in 2007, becoming the youngest person given the title.[57] She also won the Country Music Association's Horizon Award for Best New Artist,[58] the Academy of Country Music Awards' Top New Female Vocalist,[59] and the American Music Awards' Favorite Country Female Artist honor.[60] She was also nominated for Best New Artist at the 50th Annual Grammy Awards.[61] In 2008, she opened for Rascal Flatts again[62] and briefly dated the singer Joe Jonas.[63] 
"""

# We could run the plain text through TextNormalizer first, but that would
# be cheating: the text would then contain only the characters that the
# ngram statistics were collected from.

#plain_text = TextNormalizer(plain_text_formatted).normalize()
# Instead, only the leading/trailing whitespace of the literal is removed.
plain_text = plain_text_formatted.strip()

In [6]:
# Here's where I encrypt the plain text with a "secret_key".
# The random.seed() call is there to make the run deterministic,
# as well as to provide evidence that my RNG isn't cherry picked.

# NOTE(review): `string` is not referenced in the cells visible here.
import random, string

random.seed(202404302259) # date and time for when I ran this

secret_key = "this is a thirtynine character long key"

# Key length; the later cells use it for the size of the guessed key and
# for the number of bytes scored per key evaluation.
KL = len(secret_key)

# Presumably this executes the xorcipher notebook so that XorCipher is
# defined in this namespace -- TODO confirm against xorcipher.ipynb.
%run xorcipher.ipynb import XorCipher

# encrypt() is consumed as an iterable here; its items are presumably
# byte values (ints in 0..255), which is what bytearray() requires.
cipher_text = bytearray(list(XorCipher(secret_key).encrypt(plain_text)))

In [7]:
# This is where I evaluate the fitness of a particular key.
# The fitness score is used below for crude hill climbing. It turns
# out that by relying on 1-gram and 2-gram statistics alone, we can usually
# solve for the key characters individually. I did not anticipate
# this. :)
#
# There are some model parameter choices here.
# How do we score 1-grams vs. 2-grams?
# How do we score raw relative frequencies?
# How do we score "cribs"?

def evaluate_key(cipher_text, key_str, grams_cumulative, crib=None, NB=12, OFFSET=0):
    """Score how plausible the text decrypted with ``key_str`` looks.

    A slice of ``KL * NB`` bytes, starting at ``OFFSET``, is decrypted and
    scanned with a sliding window of ``MAX_GRAM`` bytes. Every suffix of the
    window that is a known gram adds ``weight * frequency ** 0.25`` to the
    score, where the weight doubles for each extra character of gram length.
    The window at the very end of the slice is not scored.

    Relies on the notebook globals ``KL``, ``MAX_GRAM`` and ``XorCipher``
    (the latter is expected to come from the ``%run xorcipher.ipynb`` cell).

    Parameters
    ----------
    cipher_text : bytes-like
        The encrypted text.
    key_str : str
        The candidate key.
    grams_cumulative : dict
        Maps a gram (str) to its relative frequency.
    crib : str, optional
        A word expected to occur in the plain text. When given, the score is
        multiplied by ``(1 + occurrences) ** 1.25``.
    NB : int
        Number of key-length blocks to decrypt and score.
    OFFSET : int
        Index of the first decrypted byte to score.

    Returns
    -------
    tuple of (int, int)
        ``(score, crib_count)``; ``crib_count`` is 0 when no crib is given.
    """
    # Imported locally so that the function does not depend on some other
    # cell (or notebook) having imported itertools.
    import itertools

    decrypted_bytes = bytearray(itertools.islice(XorCipher(key_str).crypt(cipher_text), OFFSET, OFFSET + KL * NB))

    # latin-1 maps every byte value b to the character chr(b), so this never
    # fails, whatever garbage a wrong key produces.
    decrypted_text = decrypted_bytes.decode("latin-1")

    total_score = 0

    for start in range(len(decrypted_text) - MAX_GRAM):
        window = decrypted_text[start:start + MAX_GRAM]

        for i in range(MAX_GRAM):
            gram = window[i:]

            if gram in grams_cumulative:
                total_score += 2**(MAX_GRAM - i - 1) * grams_cumulative[gram]**.25

    crib_cnt = 0

    if crib is not None:
        crib_cnt = decrypted_bytes.count(bytearray(crib, "utf-8"))
        mult = (1 + crib_cnt) ** 1.25
        total_score *= mult

    return int(100 * total_score), crib_cnt

In [8]:
# This is the actual code that cracks the XorCipher
#
# Each generation gives a fitness for the key that is guaranteed non-decreasing.
# The key is fuzzed by selecting a random index and simply flipping some bits at
# that character.

# The current best guess for the key: KL copies of "@" to start with.
key_str = "".join([ "@" ] * KL) # starting guess

NB=12 # Number of decrypted blocks of key length size to run statistics on
OFFSET=2000 # Offset into the encrypted text

# Known-plaintext hint handed to evaluate_key(). The first assignment is
# immediately overwritten; delete the second line to run without a crib.
crib = None
crib = "Swift"

gen = 0          # generation counter of the search loop
last_score = 0   # score of the current key_str; updated on every improvement
# NOTE(review): last_key is not referenced again in the cells visible here.
last_key = key_str

# The heat map is an attempt at visualizing what's happening
# when fuzzing the key string. It's purely diagnostic.

def heat_map_str(heat_map):
    """Summarize a heat map as a short ``T=NNN, i=NN`` status string.

    Parameters
    ----------
    heat_map : sequence of numbers
        One heat value per key position.

    Returns
    -------
    str
        ``T`` is four times the mean heat, truncated to an integer and
        capped at 999. ``i`` is the index of the largest positive heat value
        (the first one on ties), or ``--`` when the map is empty or holds no
        positive value.
    """
    size = len(heat_map)
    temperature = 0
    hottest_value = 0
    hottest_index = None

    for index, heat in enumerate(heat_map):
        if heat > hottest_value:
            hottest_index = index
            hottest_value = heat

        temperature += heat * 4 / size

    if temperature > 999:
        temperature = 999

    # Without any positive heat there is no hottest position to report;
    # formatting None with "{:02d}" would raise a TypeError.
    if hottest_index is None:
        index_field = "--"
    else:
        index_field = "{:02d}".format(hottest_index)

    return "T={:03d}, i={}".format(int(temperature), index_field)

# One heat value per key position. A position gains heat when fuzzing it
# improves the score, and every position cools down once per generation.
heat_map = [ 0 ] * KL
cooling_factor = .95  # each heat value is multiplied by this every generation

# Number of printable candidate keys to try per generation.
N_FUZZ = 4

found_key = False        # set once key_str equals secret_key
evaluate_key_calls = 0   # total evaluate_key() calls, reported as the cost

# Main hill-climbing loop. One generation picks a single random key
# position, tries N_FUZZ printable variations of the character there and
# keeps the best one, provided it beats the current score.
#
# NOTE(review): the loop only ends when key_str equals secret_key, and there
# is no cap on the number of generations. If the search gets stuck in a local
# maximum it runs forever (the recorded run ends in a KeyboardInterrupt).
while not found_key:
    # Let the whole heat map cool down a little.
    for i in range(KL):
        heat_map[i] *= cooling_factor

    # The key position to fuzz in this generation.
    fuzz_index = random.randint(0, KL-1)

    # A candidate must be strictly better than the current key to be kept.
    best_score = last_score
    best_crib_cnt = 0
    best_fuzzed_key = None

    j = 0
    while j < N_FUZZ:
        # Compute key fuzz byte
        #
        # The first bitwise AND should give an avg of 4 bits
        # OR:ing with the second bitwise AND should give ~ 6 bits
        # Honestly, this is maybe a bit much?
        #
        # I should plot the PDF for number of bits flipped.
        #
        # Edit: I did plot the distribution and it looks like the
        # bitwise OR adds just one more bit to the mean.
        #
        # NOTE(review): for independent uniform bytes, each bit of an AND
        # is set with probability 1/4 (mean 2 bits), and each bit of the OR
        # of two such ANDs with probability 7/16 (mean 3.5 bits).
        #
        # In the end it doesn't matter much as long as we don't
        # fuzz too few or too many bits.
        #
        # We do need the longshots to get us out of local maxima
        # though. In these cases, we might need six or more bits to
        # be flipped.
        
        fuzz_byte  = random.randint(0, 255) & random.randint(0, 255)
        fuzz_byte |= random.randint(0, 255) & random.randint(0, 255)
        # Flip the chosen bits of the character at fuzz_index only.
        fuzzed_key_str = key_str[:fuzz_index] + chr(ord(key_str[fuzz_index]) ^ fuzz_byte) + key_str[fuzz_index+1:]
        # The candidate is scored before the printable check below, so
        # non-printable candidates still count towards evaluate_key_calls.
        key_score, crib_cnt = evaluate_key(cipher_text, fuzzed_key_str,
                                           grams_cumulative, crib=crib, NB=NB, OFFSET=OFFSET)
        evaluate_key_calls += 1

        # Only printable keys are accepted, and only they use up one of
        # the N_FUZZ tries (see the j += 1 at the end of this branch).
        if fuzzed_key_str.isprintable():
            if key_score > best_score:
                best_fuzzed_key = fuzzed_key_str
                best_score = key_score
                best_crib_cnt = crib_cnt
            elif False and fuzzed_key_str == secret_key:
                # Disabled by the leading `False and`.
                #
                # This is the sad case when the correct key is just a local maximum.
                # Sometimes the algorithm fails to find the very last character of the
                # key, room for improvement I guess!
                
                key_str = best_fuzzed_key = fuzzed_key_str
                best_score = key_score
                best_crib_cnt = crib_cnt
                print("found the secret key, but it's not a global max", key_score, last_score)
                found_key = True
                break
            j += 1

    # Accept the improvement, if there was one, and log the new state.
    if not found_key and best_fuzzed_key:
        # The score gain is added as heat to the position that produced it.
        win = best_score - last_score
        heat_map[fuzz_index] += win
        last_score = best_score
        key_str = best_fuzzed_key

        print("{:05d}".format(gen), key_str, heat_map_str(heat_map), last_score, best_crib_cnt)

    # The stop condition compares against the known secret key.
    if key_str == secret_key:
        found_key = True

    gen += 1

# Summary. best_score and best_crib_cnt hold the values of the last generation.
print("Key broken after {} generations".format(gen))
print("Key: \"{}\", score: {}, cribs: {}".format(key_str, best_score, best_crib_cnt))
# Cost estimate: every evaluate_key() call is counted as NB * KL bytes
# (8 bits each), reported in units of 2**10 bits.
print("cost: {} partial decryptions of total size {} Kbits".format(evaluate_key_calls,
                                                                   int(evaluate_key_calls * 8 * NB * KL / 2**10)))


00000 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@r@@@@@ T=045, i=33 441 0
00001 @@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@@@r@@@@@ T=056, i=33 570 0
00002 ô@@@@@@@@@@@@@@@@@@@@@@@@@@ @@@@@r@@@@@ T=099, i=00 1015 0
00003 ô@@@@@@@@@@@@@@@@@@@@@@@@@1 @@@@@r@@@@@ T=099, i=00 1068 0
00004 ô@@@@@@@@@@@@@@@@y@@@@@@@@1 @@@@@r@@@@@ T=115, i=00 1269 0
00005 ô@@@@@@@@@@@@@@@@y@@@/@@@@1 @@@@@r@@@@@ T=118, i=00 1361 0
00006 ô@@@@@@@@@l@@@@@@y@@@/@@@@1 @@@@@r@@@@@ T=133, i=00 1558 0
00007 ô@@@@@@@@@l@@@@@@y@@@/@@@@1 @@@@@r@@ @@ T=154, i=00 1832 0
00009 ô@@@@@@@@@l@@@@@@y@@@/@@@@1 @@@@@r|@ @@ T=161, i=00 2048 0
00010 ô@@@@@@@@hl@@@@@@y@@@/@@@@1 @@@@@r|@ @@ T=184, i=09 2354 0
00011 ô@@@@@@@@hl@@@@@@y@@@/@@@@1 @~@@@r|@ @@ T=207, i=29 2662 0
00012 ô@@@@@@@@hl@@s@@@y@@@/@@@@1 @~@@@r|@ @@ T=215, i=29 2845 0
00013 ô@@@@@@@@hl@@s@@@y@l@/@@@@1 @~@@@r|@ @@ T=224, i=29 3036 0
00014 ô@@@@@@@@hl@@s@@@y@u@/@@@@1 @~@@@r|@ @@ T=218, i=29 3086 0
00016 ô@@@@@@@@hl@@s@@@yiu@/@@@@1 @~@@@r|@ @@ T=230, i=18 3407 0
00018 ô@@"@@@@@hl@@s@@@yiu@

KeyboardInterrupt: 

In [None]:
# Upper bound on the number of decrypted bytes scored per evaluate_key() call.
NB * KL

In [None]:
# Length of the secret key in characters (the value stored in KL).
len(secret_key)