In [None]:
%%sql -r dataframe_1
-- Make SNOWBEARAIR_DB.PUBLIC the active database and schema for this SQL session.
-- NOTE(review): the Python cells visible in this notebook read capone.csv with
-- pandas and never reference dataframe_1; confirm this cell is still required.
USE DATABASE SNOWBEARAIR_DB;
USE SCHEMA PUBLIC;

### Riddle me this (extra credit)…

- In the 1930s, famous gangster Al Capone was bootlegging alcohol out of several secret distilleries, and FBI agent Eliot Ness was on his heels, trying to sneak in and gather intel.

- Capone had so many bodyguards, he was hard to tail, so Eliot followed Big Al’s brothers around instead.

- One day, he tailed Frank to a distillery and got close enough to hear the exchange between Frank and the guard. They had a secret password exchange. The guard said “78M” and Frank said “13”.

- Another time, Ness was able to hear Vinny’s exchange with a guard: “132V” and a response of “22”.

- Being quick with numbers, Ness felt like he had the key to this passcode response now and decided to try it out. He went to another Capone joint and approached the door.  The guard said “276F”. Ness did some quick thinking and replied “46”… and nearly got shot in response, except he heard the gun cock and spun out of the way too quickly for the mobsters.

- What should his response have been?  The answer is in the last cell.  Skip to the end if you'd like to avoid the ugly path.

In searching the offices and hideouts, they found a ledger with 1,000 possible challenges and responses, but it never made any difference. Capone was eventually convicted of tax evasion.


I start with the assumption that a mostly uneducated gangster cannot possibly memorize 1,000 responses to the same number of codes.  I also assume they would not carry around, in writing, a list of code/response pairs, nor the written formula for determining the pair.  Ironically, in my very first attempt to build a password encryptor, I went through a similar algorithmic approach and ensured there was an "unusual" component to the formula.  I begin by exploring the data and creating some additional values from various looks at the characters and numbers, knowing that, in this example, it couldn't be too complicated or the gangsters would never have been able to deliver a proper response.

In [None]:
# Build the working frame of challenge codes and their recorded responses.
import pandas as pd

# The pairs are read from a CSV that sits next to the notebook.
raw_pairs = pd.read_csv('capone.csv')
challenge = raw_pairs['number']

# Assemble only the four columns the analysis uses, already in display order.
capone_df = pd.DataFrame({
    'code': challenge,
    'response': raw_pairs['response_code'],
    # First run of digits found in the code (still a string at this point).
    'code_digits': challenge.str.extract(r'(\d+)', expand=False),
    # First uppercase letter found in the code.
    'code_letter': challenge.str.extract(r'([A-Z])', expand=False),
})

# Preview of the parsed frame plus basic shape/type information.
print("Dataset loaded and parsed:")
print(capone_df.head(20))
print(f"\nTotal rows: {len(capone_df)}")
print(f"\nData types:\n{capone_df.dtypes}")

# Parsing sanity check: a non-zero count means some code did not match the
# corresponding pattern above.
missing_digits = capone_df['code_digits'].isna().sum()
missing_letters = capone_df['code_letter'].isna().sum()
print(f"\nAny missing digits? {missing_digits}")
print(f"Any missing letters? {missing_letters}")

In [None]:

# Exploratory checks on the parsed codes: which letters and digits occur, and
# how long the numeric part of each code is.
#
# Rows whose letter or digits failed to parse (NaN) are excluded from the
# coverage checks: sorting a mix of str and NaN raises TypeError, and
# stringifying NaN would leak the characters of "nan" into the digit set.

print("="*80)
print("ALPHABET COVERAGE")
print("="*80)

# Check which letters are used (missing values dropped before sorting)
unique_letters = sorted(capone_df['code_letter'].dropna().unique())
print(f"Unique letters used: {unique_letters}")
print(f"Number of unique letters: {len(unique_letters)}")
print(f"All 26 letters used? {len(unique_letters) == 26}")

# Show which letters are missing (if any)
all_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
missing_letters = sorted(all_letters - set(unique_letters))
if missing_letters:
    print(f"Missing letters: {missing_letters}")
else:
    print("All letters A-Z are present!")

print("\n" + "="*80)
print("DIGIT COVERAGE")
print("="*80)

# Check which digits are used (across all digit positions); unparsed rows are
# skipped so only real digit characters end up in the set.
all_digits_used = set(''.join(capone_df['code_digits'].dropna().astype(str)))
print(f"Digits used: {sorted(all_digits_used)}")
print(f"All digits 0-9 used? {all_digits_used == set('0123456789')}")

print("\n" + "="*80)
print("CODE LENGTH ANALYSIS")
print("="*80)

# Check the length of the digit portions (NaN for rows with no digits, which
# then show up under "unusual lengths" below)
capone_df['digit_length'] = capone_df['code_digits'].str.len()
length_distribution = capone_df['digit_length'].value_counts().sort_index()

print(f"Distribution of digit lengths:")
print(length_distribution)

# Check if all are 3 or 4 digits
all_3_or_4_digits = capone_df['digit_length'].isin([3, 4]).all()
print(f"\nAll codes are 3 or 4 digits plus a letter? {all_3_or_4_digits}")

# Show examples of any unusual lengths
if not all_3_or_4_digits:
    print("\nCodes with unusual lengths:")
    unusual = capone_df[~capone_df['digit_length'].isin([3, 4])]
    print(unusual[['code', 'digit_length']].head(10))

print("\n" + "="*80)
print("SAMPLE DATA")
print("="*80)
print(capone_df.head(10))

In [None]:
# Translate each code letter into its 1-based alphabet position
# (A=1, B=2, ..., Z=26) by shifting its code point.
alphabet_offset = ord('A') - 1
capone_df['letter_number'] = capone_df['code_letter'].map(ord) - alphabet_offset

# Preview the frame with the new column alongside the parsed pieces.
banner = "="*80
preview_cols = ['code', 'response', 'code_digits', 'code_letter', 'letter_number']
print("\n" + banner)
print("SAMPLE DATA WITH LETTER_NUMBER")
print(banner)
print(capone_df[preview_cols].head(20))

In [None]:
def is_prime(n):
    """Return True when n is a prime number, False otherwise.

    Values below 2 are never prime and 2 is the only even prime. Every other
    candidate is tested by trial division against the odd numbers from 3 up
    to the integer part of its square root.
    """
    if n == 2:
        return True
    if n < 2 or n % 2 == 0:
        return False
    # A composite number has a divisor no larger than its square root.
    upper = int(n**0.5) + 1
    return all(n % divisor != 0 for divisor in range(3, upper, 2))

# Flag each code whose numeric part, read as an integer, is prime.
capone_df['digits_are_prime'] = capone_df['code_digits'].astype(int).map(is_prime)

# Overall verdict: does every code carry a prime number?
all_prime = capone_df['digits_are_prime'].all()
print(f"Are all code_digits prime numbers? {all_prime}")

# How many codes fall on each side of the flag.
prime_counts = capone_df['digits_are_prime'].value_counts()
print("\nPrime distribution:")
print(prime_counts)

# When at least one code is not prime, list a few of them for inspection.
if not all_prime:
    non_prime_rows = capone_df.loc[
        ~capone_df['digits_are_prime'],
        ['code', 'code_digits', 'response'],
    ]
    print("\nExamples of non-prime codes:")
    print(non_prime_rows.head(10))

In [None]:
# Try the basic arithmetic combinations of the numeric part of the code and the
# letter's alphabet position, and count how often each equals the response.

# Integer view of the digits, used by every candidate below.
capone_df['digits_int'] = capone_df['code_digits'].astype(int)

number = capone_df['digits_int']
letter = capone_df['letter_number']

# Candidate columns, added to the frame in this order.
candidate_columns = {
    'sum_digits_letter': number + letter,
    'product_digits_letter': number * letter,
    'digits_minus_letter': number - letter,
    'letter_minus_digits': letter - number,
    'digits_mod_letter': number % letter,
    'digits_div_letter': number // letter,
}
for column_name, values in candidate_columns.items():
    capone_df[column_name] = values

banner = "="*80
print("Testing various mathematical relationships:")
print(banner)

# Human-readable label -> candidate column holding that formula's result.
formulas = {
    'sum (digits + letter)': 'sum_digits_letter',
    'product (digits * letter)': 'product_digits_letter',
    'digits - letter': 'digits_minus_letter',
    'letter - digits': 'letter_minus_digits',
    'digits % letter': 'digits_mod_letter',
    'digits // letter': 'digits_div_letter'
}

total_rows = len(capone_df)
for label, column_name in formulas.items():
    matches = capone_df[column_name].eq(capone_df['response']).sum()
    pct = (matches / total_rows) * 100
    print(f"{label:30} matches: {matches:4} / {total_rows} ({pct:.1f}%)")

# A slice of rows with a couple of the candidates for eyeballing.
cols_to_show = ['code', 'response', 'digits_int', 'letter_number', 'sum_digits_letter', 'product_digits_letter']
print("\n" + banner)
print("Sample rows (showing various calculated columns):")
print(banner)
print(capone_df[cols_to_show].head(20))

In [None]:
# Character length of each response when written out as text.
capone_df['response_length'] = capone_df['response'].astype(str).str.len()

# Tally of responses per length, ordered by length.
response_length_dist = capone_df['response_length'].value_counts().sort_index()

banner = "="*80
print("Distribution of response lengths (number of digits):")
print(banner)
for length, count in response_length_dist.items():
    print(f"Responses with {length} digit(s): {count}")

print("\n" + banner)
print("Sample responses by length:")
print(banner)

# The distribution's index is already the sorted set of lengths, so walk it to
# print up to five example rows for each length.
example_cols = ['code', 'letter_number', 'response']
for length in response_length_dist.index:
    print(f"\n{length}-digit responses (examples):")
    examples = capone_df.loc[capone_df['response_length'] == length, example_cols].head(5)
    print(examples.to_string(index=False))

In [None]:
# Test combinations of per-digit aggregates (sum, product, count of the digits)
# with the letter's alphabet position, and rank them by how often they
# reproduce the recorded response.


def _digit_product(digit_text):
    """Multiply the decimal digits in digit_text together.

    An empty string yields 0. The product is computed with plain integer
    arithmetic instead of building an expression string and passing it to
    eval(), so text from the data file is never executed as code.
    """
    if not digit_text:
        return 0
    product = 1
    for digit in digit_text:
        product *= int(digit)
    return product


# First, let's create columns for operations on individual digits
digit_text = capone_df['code_digits'].astype(str)
capone_df['sum_of_digits'] = digit_text.apply(lambda x: sum(int(d) for d in x))
capone_df['product_of_digits'] = digit_text.apply(_digit_product)
capone_df['count_of_digits'] = digit_text.str.len()

# Now test combinations of these with letter_number
print("Testing combinations of digit operations with letter_number:")
print("="*80)

# Label -> function that computes the candidate response from the frame.
test_formulas = {
    'sum_of_digits + letter_number': lambda df: df['sum_of_digits'] + df['letter_number'],
    'sum_of_digits - letter_number': lambda df: df['sum_of_digits'] - df['letter_number'],
    'sum_of_digits * letter_number': lambda df: df['sum_of_digits'] * df['letter_number'],
    'letter_number - sum_of_digits': lambda df: df['letter_number'] - df['sum_of_digits'],
    'product_of_digits + letter_number': lambda df: df['product_of_digits'] + df['letter_number'],
    'product_of_digits - letter_number': lambda df: df['product_of_digits'] - df['letter_number'],
    'count_of_digits + letter_number': lambda df: df['count_of_digits'] + df['letter_number'],
    'count_of_digits * letter_number': lambda df: df['count_of_digits'] * df['letter_number'],
    'sum_of_digits + count_of_digits': lambda df: df['sum_of_digits'] + df['count_of_digits'],
    'sum_of_digits * count_of_digits': lambda df: df['sum_of_digits'] * df['count_of_digits'],
    'sum_of_digits + letter_number + count_of_digits': lambda df: df['sum_of_digits'] + df['letter_number'] + df['count_of_digits'],
}

# Each candidate is stored as a 'test_<label>' column so it can be displayed
# next to the real response further down.
results = {}
for name, formula in test_formulas.items():
    capone_df[f'test_{name}'] = formula(capone_df)
    matches = (capone_df[f'test_{name}'] == capone_df['response']).sum()
    pct = (matches / len(capone_df)) * 100
    results[name] = (matches, pct)
    print(f"{name:50} matches: {matches:4} / {len(capone_df)} ({pct:.1f}%)")

# Show the top performers (five formulas with the most exact matches)
print("\n" + "="*80)
print("Top performing formulas:")
print("="*80)
top_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)[:5]
for name, (matches, pct) in top_results:
    print(f"{name:50} {matches:4} matches ({pct:.1f}%)")

# Show sample data with the best formula
if top_results:
    best_formula = top_results[0][0]
    print("\n" + "="*80)
    print(f"Sample data with best formula: {best_formula}")
    print("="*80)
    cols = ['code', 'sum_of_digits', 'letter_number', 'count_of_digits', f'test_{best_formula}', 'response']
    print(capone_df[cols].head(20))

In [None]:
# Formulas simple enough to be done in one's head: combinations of the first
# digit, last digit, digit sum, digit count and letter position.
banner = "="*80
print("Testing extremely simple mental math patterns:")
print(banner)

# Leading and trailing digit of the numeric part, as integers.
digit_text = capone_df['code_digits'].astype(str)
capone_df['first_digit'] = digit_text.str[0].astype(int)
capone_df['last_digit'] = digit_text.str[-1].astype(int)

# Label -> function producing the candidate response.
simple_formulas = {
    'first_digit + last_digit + letter_number':
        lambda df: df['first_digit'] + df['last_digit'] + df['letter_number'],
    'first_digit * last_digit + letter_number':
        lambda df: df['first_digit'] * df['last_digit'] + df['letter_number'],
    'sum_of_digits + letter_number (sanity check)':
        lambda df: df['sum_of_digits'] + df['letter_number'],
    'count_of_digits + sum_of_digits + letter_number':
        lambda df: df['count_of_digits'] + df['sum_of_digits'] + df['letter_number'],
}

total_rows = len(capone_df)
for label, formula in simple_formulas.items():
    test_col = formula(capone_df)
    matches = test_col.eq(capone_df['response']).sum()
    pct = (matches / total_rows) * 100
    print(f"{label:55} matches: {matches:4} / {total_rows} ({pct:.1f}%)")

# Raw ingredients next to the response, to look for patterns by eye.
cols = ['code', 'first_digit', 'last_digit', 'sum_of_digits', 'letter_number', 'response']
print("\n" + banner)
print("Sample with first/last digits:")
print(banner)
print(capone_df[cols].head(20))

In [None]:
# Look up the three challenge codes quoted in the riddle text.
banner = "="*80
print("Checking the riddle examples from our dataset:")
print(banner)

riddle_codes = ['78M', '132V', '276F']
detail_cols = ['code', 'code_digits', 'letter_number', 'sum_of_digits', 'response']

for code in riddle_codes:
    match = capone_df[capone_df['code'] == code]
    if match.empty:
        print(f"\n{code} NOT found in dataset")
        continue
    print(f"\nFound {code} in dataset:")
    print(match[detail_cols].to_string(index=False))

# For comparison, list other codes that share the letter M with the first
# riddle example.
print("\n" + banner)
print("Let me also check if there are patterns in responses with similar properties:")
print(banner)

letter_m_rows = capone_df.loc[
    capone_df['code_letter'] == 'M',
    ['code', 'sum_of_digits', 'letter_number', 'response'],
]
print("\nAll codes with letter M:")
print(letter_m_rows.head(10))

In [None]:
# Work through the riddle's three codes by hand, independent of the dataset.
print("Manual analysis of riddle examples:")
print("="*80)

# Code -> its parts and the response given in the riddle ('?' = unknown).
riddle_data = {
    '78M': {'digits': '78', 'letter': 'M', 'response': 13},
    '132V': {'digits': '132', 'letter': 'V', 'response': 22},
    '276F': {'digits': '276', 'letter': 'F', 'response': '?'}
}

for code, data in riddle_data.items():
    digits_str, letter, response = data['digits'], data['letter'], data['response']

    # Basic properties of the code.
    digit_values = list(map(int, digits_str))
    sum_digits = sum(digit_values)
    count_digits = len(digits_str)
    letter_num = ord(letter) - ord('A') + 1
    first_digit = digit_values[0]
    last_digit = digit_values[-1]

    print(f"\n{code} → {response}")

    # Properties first, then a few candidate combinations, one per line.
    report = [
        ("digits", digits_str),
        ("letter", f"{letter} (position {letter_num})"),
        ("sum_of_digits", sum_digits),
        ("count_of_digits", count_digits),
        ("first_digit", first_digit),
        ("last_digit", last_digit),
        ("sum + letter_num", sum_digits + letter_num),
        ("count + letter_num", count_digits + letter_num),
        ("first + last + letter_num", first_digit + last_digit + letter_num),
    ]
    for label, value in report:
        print(f"  {label}: {value}")

In [None]:
# Hypothesis: the response is simply the letter's alphabet position.
banner = "="*80
print("Testing if response = letter_number:")
print(banner)

# Row-wise flag, reused for the count and for both example listings.
is_letter_match = capone_df['response'] == capone_df['letter_number']

matches = is_letter_match.sum()
pct = (matches / len(capone_df)) * 100
print(f"Exact matches: {matches} / {len(capone_df)} ({pct:.1f}%)")

# Rows that satisfy the hypothesis.
print("\n" + banner)
print("Examples where response = letter_number:")
print(banner)
matching = capone_df[is_letter_match]
print(matching[['code', 'letter_number', 'response']].head(10))

# Rows that contradict it.
print("\n" + banner)
print("Examples where response ≠ letter_number:")
print(banner)
not_matching = capone_df[~is_letter_match]
print(not_matching[['code', 'code_digits', 'sum_of_digits', 'letter_number', 'response']].head(20))

In [None]:
# Inspect the rows where the response equals the letter position.
# NOTE(review): the "4" in the messages below is hard-coded text; nothing in
# this cell computes or checks that exactly four rows match.
banner = "="*80
print("Detailed look at the 4 exact matches:")
print(banner)

detail_cols = [
    'code', 'code_digits', 'sum_of_digits', 'count_of_digits',
    'first_digit', 'last_digit', 'letter_number', 'response',
]
matches = capone_df[capone_df['response'] == capone_df['letter_number']]
print(matches[detail_cols])

print("\n" + banner)
print("Looking for patterns in these 4 matches...")
print(banner)

# Per-row digit properties.
for _, row in matches.iterrows():
    digits = str(row['code_digits'])
    leading = digits[:-1]
    print(f"\n{row['code']}:")
    # NOTE(review): this tests every digit except the last one, although the
    # label reads "All zeros?" - confirm which was intended.
    print(f"  All zeros? {set(leading) <= {'0'}}")
    print(f"  Contains only certain digits? {set(digits)}")
    print(f"  Sum of digits: {row['sum_of_digits']}")
    print(f"  Digits as int: {row['code_digits']}")

In [None]:
# Split the codes on whether the numeric part ends in 0 and look at each side.
banner = "="*80
print("Analyzing codes where last_digit = 0:")
print(banner)

ends_in_zero = capone_df['last_digit'] == 0

codes_ending_zero = capone_df[ends_in_zero]
print(f"Total codes ending in 0: {len(codes_ending_zero)}")

# Within that group, how often is the response just the letter position?
letter_hit = codes_ending_zero['response'] == codes_ending_zero['letter_number']
matches_zero = codes_ending_zero[letter_hit]
print(f"Of those, how many have response = letter_number? {len(matches_zero)}")

# First 30 rows of the trailing-zero group.
zero_cols = ['code', 'code_digits', 'sum_of_digits', 'letter_number', 'response', 'last_digit']
print("\n" + banner)
print("All codes ending in 0:")
print(banner)
print(codes_ending_zero[zero_cols].head(30))

# The complementary group.
print("\n" + banner)
print("For codes NOT ending in 0, is there a pattern?")
print(banner)

codes_not_zero = capone_df[~ends_in_zero]
non_zero_cols = ['code', 'code_digits', 'sum_of_digits', 'first_digit', 'last_digit', 'letter_number', 'response']
print("\nSample of codes not ending in 0:")
print(codes_not_zero[non_zero_cols].head(20))

In [None]:
# Summary statistics and frequency table for the response values.
banner = "="*80
responses = capone_df['response']

print("Response value analysis:")
print(banner)

min_resp = responses.min()
max_resp = responses.max()
print(f"Min response: {min_resp}")
print(f"Max response: {max_resp}")
print(f"Mean response: {responses.mean():.2f}")
print(f"Median response: {responses.median()}")

# Count per response value, ordered by value.
print("\n" + banner)
print("Response frequency distribution:")
print(banner)
response_freq = responses.value_counts().sort_index()
print(response_freq)

# Same counts, ordered by frequency; top 20 only.
print("\n" + banner)
print("Most common responses:")
print(banner)
print(responses.value_counts().head(20))

# Integers between the smallest and largest response that never occur.
print("\n" + banner)
print("Are there gaps in the response values?")
print(banner)
all_responses = set(responses.unique())
all_possible = set(range(min_resp, max_resp + 1))
missing = [value for value in range(min_resp, max_resp + 1) if value not in all_responses]
print(f"Missing response values: {missing}")

In [None]:
# Bucket the responses into four ranges and compare average code properties
# across the buckets.
banner = "="*80
print("Comparing codes by response ranges:")
print(banner)

range_labels = ['5-20', '21-30', '31-40', '41-50']

# NOTE(review): the first bin is (0, 20] although it is labelled '5-20', and
# any response outside (0, 50] gets no label - confirm against the data.
capone_df['response_range'] = pd.cut(
    capone_df['response'],
    bins=[0, 20, 30, 40, 50],
    labels=range_labels,
)

# Per-bucket averages of each derived property.
averaged_columns = ('sum_of_digits', 'letter_number', 'count_of_digits', 'first_digit', 'last_digit')
for range_label in range_labels:
    subset = capone_df[capone_df['response_range'] == range_label]
    print(f"\n{range_label} (n={len(subset)}):")
    for column_name in averaged_columns:
        print(f"  Avg {column_name}: {subset[column_name].mean():.2f}")

# Same buckets, but averaging the combined digit-sum + letter value.
print("\n" + banner)
print("What if we look at sum_of_digits + letter_number for different response ranges?")
print(banner)

for range_label in range_labels:
    subset = capone_df[capone_df['response_range'] == range_label]
    combined = subset['sum_of_digits'] + subset['letter_number']
    avg_combo = combined.mean()
    print(f"{range_label}: avg(sum_of_digits + letter_number) = {avg_combo:.2f}")

In [None]:
# Revisit digit-sum + letter position and measure how far it lands from the
# recorded response on every row.
banner = "="*80
print("Re-testing sum_of_digits + letter_number:")
print(banner)

predicted = capone_df['sum_of_digits'] + capone_df['letter_number']
capone_df['test_sum_plus_letter'] = predicted

# Signed error: formula result minus the recorded response.
capone_df['difference'] = predicted - capone_df['response']
difference = capone_df['difference']

print("\nDifference distribution (formula - actual):")
print(difference.value_counts().sort_index().head(20))

print("\n" + banner)
print("Sample showing formula vs actual:")
print(banner)
cols = ['code', 'sum_of_digits', 'letter_number', 'test_sum_plus_letter', 'response', 'difference']
print(capone_df[cols].head(30))

# Summary of the error, to judge whether it is a constant offset or varies.
print("\n" + banner)
print("Is there a pattern to the difference?")
print(banner)
print(f"Mean difference: {difference.mean():.2f}")
print(f"Median difference: {difference.median():.2f}")
print(f"Std dev: {difference.std():.2f}")

In [None]:
# Break the formula error ('difference') down by digit count, first digit and
# last digit to see which property it tracks.
banner = "="*80
print("Analyzing what causes the difference:")
print(banner)

# By number of digits, over the counts that actually occur.
print("\nAverage difference by count_of_digits:")
for count in sorted(capone_df['count_of_digits'].unique()):
    subset = capone_df[capone_df['count_of_digits'] == count]
    avg_diff = subset['difference'].mean()
    print(f"  {count} digits: avg difference = {avg_diff:.2f}")

# By first digit, then by last digit; digit values with no rows are skipped.
for column_name in ('first_digit', 'last_digit'):
    print(f"\nAverage difference by {column_name}:")
    for digit in range(10):
        subset = capone_df[capone_df[column_name] == digit]
        if subset.empty:
            continue
        avg_diff = subset['difference'].mean()
        print(f"  {column_name}={digit}: avg difference = {avg_diff:.2f}")

# Follow-up candidate: add the digit count on top of sum + letter.
print("\n" + banner)
print("Testing: sum_of_digits + letter_number + count_of_digits")
print(banner)
capone_df['test_with_count'] = (
    capone_df['sum_of_digits']
    + capone_df['letter_number']
    + capone_df['count_of_digits']
)
matches = capone_df['test_with_count'].eq(capone_df['response']).sum()
print(f"Exact matches: {matches} / {len(capone_df)}")

In [None]:
# Search for a correction term based on the number of digits:
# sum_of_digits + letter_number + multiplier * count_of_digits.
banner = "="*80
print("Testing various adjustments for count_of_digits:")
print(banner)

base_guess = capone_df['sum_of_digits'] + capone_df['letter_number']

# Sweep integer multipliers from -10 to 10 and report only those giving more
# than 10 exact matches.
for multiplier in range(-10, 11):
    capone_df['test_formula'] = base_guess + (multiplier * capone_df['count_of_digits'])
    matches = capone_df['test_formula'].eq(capone_df['response']).sum()
    if matches <= 10:
        continue
    print(f"sum + letter + ({multiplier:3} * count): {matches:4} matches")

# Alternative: a separate adjustment for each digit count instead of one
# multiplier.
print("\n" + banner)
print("What if the adjustment varies by count_of_digits value?")
print(banner)

# The adjustment that would zero the average error for a given digit count is
# the negated mean of 'difference' within that group.
for count in sorted(capone_df['count_of_digits'].unique()):
    subset = capone_df[capone_df['count_of_digits'] == count]
    needed_adjustment = -subset['difference'].mean()
    print(f"For {count}-digit codes, need to add: {needed_adjustment:.2f}")

In [None]:
# Explore corrections built from the squared digit count.
banner = "="*80
print("Testing patterns with count_of_digits:")
print(banner)

capone_df['count_squared'] = capone_df['count_of_digits'].pow(2)

# Per digit count: the adjustment that would cancel the average error, shown
# beside the squared count for comparison.
print("\nNeeded adjustments by count_of_digits:")
for count in sorted(capone_df['count_of_digits'].unique()):
    subset = capone_df[capone_df['count_of_digits'] == count]
    needed_adjustment = -subset['difference'].mean()
    count_squared = count ** 2
    print(f"  {count} digits (squared={count_squared:2}): need to add {needed_adjustment:6.2f}")

print("\n" + banner)
print("Testing formulas with count operations:")
print(banner)


def _sum_plus_letter(df):
    """Shared starting point of every candidate: digit sum + letter position."""
    return df['sum_of_digits'] + df['letter_number']


# Label -> function producing the candidate response.
test_formulas = {
    'sum + letter + count': lambda df: _sum_plus_letter(df) + df['count_of_digits'],
    'sum + letter + count^2': lambda df: _sum_plus_letter(df) + df['count_squared'],
    'sum + letter - count^2': lambda df: _sum_plus_letter(df) - df['count_squared'],
    'sum + letter + (10 * count)': lambda df: _sum_plus_letter(df) + (10 * df['count_of_digits']),
    'sum + letter + (20 - count^2)': lambda df: _sum_plus_letter(df) + (20 - df['count_squared']),
    'sum + letter + (15 - count^2)': lambda df: _sum_plus_letter(df) + (15 - df['count_squared']),
    'sum + letter + (18 - count^2)': lambda df: _sum_plus_letter(df) + (18 - df['count_squared']),
}

total_rows = len(capone_df)
for label, formula in test_formulas.items():
    capone_df['test_formula'] = formula(capone_df)
    matches = capone_df['test_formula'].eq(capone_df['response']).sum()
    pct = (matches / total_rows) * 100
    print(f"{label:40} {matches:4} matches ({pct:.1f}%)")

# Ingredients next to the response for manual inspection.
cols = ['code', 'sum_of_digits', 'letter_number', 'count_of_digits', 'count_squared', 'response']
print("\n" + banner)
print("Sample data with count columns:")
print(banner)
print(capone_df[cols].head(20))

In [None]:
# Sweep a constant C in: sum_of_digits + letter_number + (C - count_squared),
# then a linear coefficient K in: sum + letter + K*count - count_squared.
#
# Percentages are computed from the actual row count (not a fixed divisor), and
# the best C is tracked for every candidate, independent of the threshold that
# only controls which lines get printed.
print("Testing: sum + letter + (C - count_squared) for various C:")
print("="*80)

total_rows = len(capone_df)
base_guess = capone_df['sum_of_digits'] + capone_df['letter_number']

# Candidates at or below these match counts are not printed.
C_PRINT_THRESHOLD = 20
K_PRINT_THRESHOLD = 30

best_matches = 0
best_c = 0

for c in range(0, 30):
    capone_df['test_formula'] = base_guess + (c - capone_df['count_squared'])
    matches = (capone_df['test_formula'] == capone_df['response']).sum()
    # Remember the best constant even when it is too weak to be printed, so the
    # summary line never reports "0 matches" while matches exist.
    if matches > best_matches:
        best_matches = matches
        best_c = c
    if matches > C_PRINT_THRESHOLD:
        pct = matches / total_rows * 100
        print(f"C={c:2}: {matches:4} matches ({pct:.1f}%)")

print(f"\nBest: C={best_c} with {best_matches} matches")

# Maybe it's not (C - count^2), but something else?
print("\n" + "="*80)
print("What if it's: sum + letter + (count * K) - count^2?")
print("="*80)

for k in range(-5, 15):
    capone_df['test_formula'] = base_guess + (k * capone_df['count_of_digits']) - capone_df['count_squared']
    matches = (capone_df['test_formula'] == capone_df['response']).sum()
    if matches > K_PRINT_THRESHOLD:
        pct = matches / total_rows * 100
        print(f"K={k:3}: {matches:4} matches ({pct:.1f}%)")

In [None]:
# Second pass over the two solved riddle examples: compare each quoted
# response with the alphabet position of the code's letter.
banner = "="*80
print("Re-examining the riddle examples:")
print(banner)

riddle_examples = [
    {'code': '78M', 'digits': '78', 'letter': 'M', 'response': 13},
    {'code': '132V', 'digits': '132', 'letter': 'V', 'response': 22}
]

for ex in riddle_examples:
    digits, letter, response = ex['digits'], ex['letter'], ex['response']
    letter_num = ord(letter) - ord('A') + 1

    summary_lines = [
        f"\n{ex['code']} → {response}",
        f"  Letter {letter} = position {letter_num}",
        f"  Response = {response}",
        f"  Response == letter_num? {response == letter_num}",
    ]
    for line in summary_lines:
        print(line)

print("\n" + banner)
print("Wait - in BOTH riddle examples, response = letter_number!")
print("But you said that was wrong as your initial hypothesis...")
print(banner)

# Open question: are the riddle examples special cases of a conditional rule?
print("\nLet me look at what makes the digits special in the riddle:")
print("78M: digits are 7,8 - consecutive!")
print("132V: digits are 1,3,2 - NOT consecutive")

# Next idea: IF <condition on the digits> THEN response = letter_number,
# ELSE some other formula.
print("\n" + banner)
print("What if consecutive digits mean response = letter_number?")
print(banner)

def has_consecutive_digits(digits_str):
    """Return True when any two neighbouring digits differ by exactly one.

    The difference is taken as an absolute value, so both ascending ("78")
    and descending ("32") neighbours count. Strings with fewer than two
    digits always yield False.
    """
    values = list(map(int, digits_str))
    return any(abs(left - right) == 1 for left, right in zip(values, values[1:]))

# Flag every row whose digit string has neighbouring digits exactly one apart.
digit_strings = capone_df['code_digits'].astype(str)
capone_df['has_consecutive'] = digit_strings.apply(has_consecutive_digits)

# Within the flagged rows, count how often response equals letter_number.
consecutive_subset = capone_df[capone_df['has_consecutive']]
matches_consecutive = consecutive_subset['response'].eq(consecutive_subset['letter_number']).sum()
print(f"Codes with consecutive digits where response = letter_number: {matches_consecutive} / {len(consecutive_subset)}")

In [None]:
# What are the properties of codes where response DOES equal letter_number?
exact_matches = capone_df[capone_df['response'] == capone_df['letter_number']]

# Report the real number of matching rows rather than a hard-coded count, so
# the heading stays true if the data changes.
print(f"Analyzing the {len(exact_matches)} codes where response = letter_number:")
print("="*80)
print(exact_matches[['code', 'code_digits', 'sum_of_digits', 'count_of_digits', 'letter_number', 'response']])

print("\n" + "="*80)
print("Let's check if 78M and 132V would be in that category:")
print("="*80)

# Manually calculate for riddle examples. The response quoted in the riddle is
# carried with each code so the "Actual response" line prints the recorded
# value rather than the letter position.
for code_str, digits_str, letter, actual_resp in [('78M', '78', 'M', 13), ('132V', '132', 'V', 22)]:
    sum_digits = sum(int(d) for d in digits_str)
    count_digits = len(digits_str)
    letter_num = ord(letter) - ord('A') + 1

    # What would various formulas give?
    formula1 = sum_digits + letter_num + count_digits
    formula2 = sum_digits + letter_num + (20 - count_digits**2)

    print(f"\n{code_str}:")
    print(f"  sum={sum_digits}, count={count_digits}, letter_num={letter_num}")
    print(f"  sum + letter + count = {formula1}")
    print(f"  sum + letter + (20 - count^2) = {formula2}")
    print(f"  Actual response: {actual_resp}")

In [None]:
# Deep dive on the exact matches (rows where response equals letter_number)
exact_matches = capone_df[capone_df['response'] == capone_df['letter_number']]

# Print the real match count instead of a hard-coded number so the heading
# cannot drift away from the data.
print(f"Deep analysis of the {len(exact_matches)} codes where response = letter_number:")
print("="*80)

for idx, row in exact_matches.iterrows():
    code = row['code']
    digits = str(row['code_digits'])
    letter = row['code_letter']
    sum_d = row['sum_of_digits']
    count_d = row['count_of_digits']
    letter_n = row['letter_number']

    print(f"\n{code}:")
    print(f"  digits: {digits}")
    print(f"  sum_of_digits: {sum_d}")
    print(f"  count_of_digits: {count_d}")
    print(f"  letter_number: {letter_n}")
    print(f"  sum + count: {sum_d + count_d}")
    print(f"  Does (sum + count) = letter_num? {(sum_d + count_d) == letter_n}")

# Wait... what if the formula involves MORE than just these basic operations?
print("\n" + "="*80)
print("What if I'm thinking about this wrong?")
print("What if it's not arithmetic but something else?")
print("Like... counting specific types of digits?")
print("="*80)

# Count even vs odd digits in each code's digit string
capone_df['count_even_digits'] = capone_df['code_digits'].astype(str).apply(
    lambda x: sum(1 for d in x if int(d) % 2 == 0)
)
capone_df['count_odd_digits'] = capone_df['code_digits'].astype(str).apply(
    lambda x: sum(1 for d in x if int(d) % 2 == 1)
)

print("\nSample with even/odd digit counts:")
cols = ['code', 'count_even_digits', 'count_odd_digits', 'letter_number', 'response']
print(capone_df[cols].head(20))

In [None]:
# Try candidate formulas built from the even/odd digit counts and report the
# ones that reproduce more than 20 recorded responses.
print("Testing formulas with even/odd digit counts:")
print("="*80)

test_formulas = {
    'count_even + letter_number': lambda df: df['count_even_digits'] + df['letter_number'],
    'count_odd + letter_number': lambda df: df['count_odd_digits'] + df['letter_number'],
    'sum + letter + count_even': lambda df: df['sum_of_digits'] + df['letter_number'] + df['count_even_digits'],
    'sum + letter + count_odd': lambda df: df['sum_of_digits'] + df['letter_number'] + df['count_odd_digits'],
    '(10 * count_even) + letter': lambda df: (10 * df['count_even_digits']) + df['letter_number'],
    '(10 * count_odd) + letter': lambda df: (10 * df['count_odd_digits']) + df['letter_number'],
    'sum + (10 * count_even) + letter': lambda df: df['sum_of_digits'] + (10 * df['count_even_digits']) + df['letter_number'],
    'sum + (10 * count_odd) + letter': lambda df: df['sum_of_digits'] + (10 * df['count_odd_digits']) + df['letter_number'],
}

for name in test_formulas:
    formula = test_formulas[name]
    test_col = formula(capone_df)
    matches = test_col.eq(capone_df['response']).sum()
    pct = (matches / len(capone_df)) * 100
    if matches <= 20:
        continue
    print(f"{name:40} {matches:4} matches ({pct:.1f}%)")

# Apply the two "10 * count" variants to the riddle's own codes as well
print("\n" + "="*80)
print("Riddle examples with even/odd counts:")
print("="*80)

riddle_cases = [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?'),
]
for code_str, digits_str, letter, actual_resp in riddle_cases:
    count_even = len([d for d in digits_str if int(d) % 2 == 0])
    count_odd = len([d for d in digits_str if int(d) % 2 == 1])
    letter_num = ord(letter) - ord('A') + 1
    sum_digits = sum(map(int, digits_str))
    with_even = sum_digits + (10 * count_even) + letter_num
    with_odd = sum_digits + (10 * count_odd) + letter_num

    print(f"\n{code_str} → {actual_resp}")
    print(f"  even digits: {count_even}, odd digits: {count_odd}")
    print(f"  sum + (10 * even) + letter = {with_even}")
    print(f"  sum + (10 * odd) + letter = {with_odd}")

In [None]:
# Maybe it's about digit positions or patterns?
print("Testing digit position-based formulas:")
print("="*80)

# Add columns for each digit position (pad with 0 if needed).
# The zero-padded string is built once and reused for all four positions;
# zfill pads on the left, so shorter codes get leading zeros.
padded_digits = capone_df['code_digits'].astype(str).str.zfill(4)
for position in range(4):
    capone_df[f'digit_{position + 1}'] = padded_digits.str[position].astype(int)

# Test various position-based formulas
position_formulas = {
    'digit_1 + digit_2 + letter': lambda df: df['digit_1'] + df['digit_2'] + df['letter_number'],
    'digit_3 + digit_4 + letter': lambda df: df['digit_3'] + df['digit_4'] + df['letter_number'],
    '(digit_1 * 10) + digit_4 + letter': lambda df: (df['digit_1'] * 10) + df['digit_4'] + df['letter_number'],
    '(digit_2 * 10) + digit_3 + letter': lambda df: (df['digit_2'] * 10) + df['digit_3'] + df['letter_number'],
}

# Only formulas matching more than 20 recorded responses are reported
for name, formula in position_formulas.items():
    test_col = formula(capone_df)
    matches = (test_col == capone_df['response']).sum()
    pct = (matches / len(capone_df)) * 100
    if matches > 20:
        print(f"{name:45} {matches:4} matches ({pct:.1f}%)")

# Show sample
print("\n" + "="*80)
print("Sample with digit positions:")
print("="*80)
cols = ['code', 'digit_1', 'digit_2', 'digit_3', 'digit_4', 'letter_number', 'response']
print(capone_df[cols].head(20))

In [None]:
# Number of letters in the spoken word for each digit!
# Derived from the English digit names so each length is computed, not typed.
digit_to_word_length = {
    digit: len(word)
    for digit, word in enumerate(
        ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    )
}

def sum_of_word_lengths(digits_str):
    """Add up the spoken-word lengths of every digit character in digits_str.

    Each character is converted with int() and looked up in
    digit_to_word_length; an empty string gives 0.
    """
    total = 0
    for ch in digits_str:
        total += digit_to_word_length[int(ch)]
    return total

capone_df['sum_of_word_lengths'] = capone_df['code_digits'].astype(str).apply(sum_of_word_lengths)

# Test the formula: sum_of_word_lengths + letter_number
capone_df['test_word_formula'] = capone_df['sum_of_word_lengths'] + capone_df['letter_number']

matches = (capone_df['test_word_formula'] == capone_df['response']).sum()
pct = (matches / len(capone_df)) * 100

print("Testing: sum_of_word_lengths + letter_number")
print("="*80)
print(f"MATCHES: {matches} / {len(capone_df)} ({pct:.1f}%)")

# Show examples
print("\n" + "="*80)
print("Sample data:")
print("="*80)
cols = ['code', 'code_digits', 'sum_of_word_lengths', 'letter_number', 'test_word_formula', 'response']
print(capone_df[cols].head(30))

# Test on riddle examples
print("\n" + "="*80)
print("RIDDLE EXAMPLES:")
print("="*80)

# Digit names, used only to print an accurate per-digit breakdown below
digit_words = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]

for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?')
]:
    word_sum = sum_of_word_lengths(digits_str)
    letter_num = ord(letter) - ord('A') + 1
    calculated = word_sum + letter_num

    # Build the breakdown from the digits of THIS code; a fixed
    # "'7'=seven(5) + '8'=eight(5)" text would be wrong for 132V and 276F.
    breakdown = " + ".join(
        f"'{d}'={digit_words[int(d)]}({digit_to_word_length[int(d)]})" for d in digits_str
    )

    print(f"\n{code_str} → {actual_resp}")
    print(f"  {breakdown} = {word_sum}")
    print(f"  {word_sum} + letter({letter_num}) = {calculated}")
    print(f"  Match? {calculated == actual_resp if actual_resp != '?' else 'N/A'}")

In [None]:
# Manual number-to-words converter
def number_to_words(n):
    """Convert a number to its English word representation.

    The words are concatenated with no spaces, hyphens or "and"
    (e.g. 132 -> "onehundredthirtytwo").

    Args:
        n: Integer in the range 0..999,999.

    Returns:
        The spelled-out number as one lowercase string.

    Raises:
        ValueError: If n is negative or larger than 999,999.
    """
    ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen"]
    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    # Reject values the tables cannot spell instead of returning "" (negatives)
    # or failing with an IndexError (five or more digits).
    if not 0 <= n <= 999_999:
        raise ValueError(f"number_to_words supports 0..999999, got {n}")

    if n == 0:
        return "zero"

    def _below_thousand(k):
        """Spell 0..999; returns "" for 0 so callers can simply concatenate."""
        text = ""
        if k >= 100:
            text += ones[k // 100] + "hundred"
            k %= 100
        if k >= 20:
            text += tens[k // 10]
            k %= 10
        elif k >= 10:
            text += teens[k - 10]
            k = 0
        if k > 0:
            text += ones[k]
        return text

    result = ""

    # Thousands: the multiplier itself may be up to 999 ("twelve thousand", ...)
    if n >= 1000:
        result += _below_thousand(n // 1000) + "thousand"
        n %= 1000

    # Hundreds, tens and ones
    result += _below_thousand(n)

    return result

def count_letters_in_number(num):
    """Return the length of number_to_words(int(num)).

    num may be anything int() accepts, e.g. a digit string or an integer.
    """
    return len(number_to_words(int(num)))

# Hypothesis: response = letters in the fully spelled number + letter position.
# Check it on the riddle's codes first.
print("Testing with FULL number names:")
print("="*80)

for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?')
]:
    full_num_letters = count_letters_in_number(digits_str)
    letter_num = ord(letter) - ord('A') + 1
    calculated = full_num_letters + letter_num
    spelled = number_to_words(int(digits_str))

    # For the unknown riddle answer, show the prediction instead of a match flag
    if actual_resp != '?':
        verdict = calculated == actual_resp
    else:
        verdict = f'Answer would be {calculated}'

    print(f"\n{code_str} → {actual_resp}")
    print(f"  {digits_str} = '{spelled}' = {full_num_letters} letters")
    print(f"  {full_num_letters} + letter({letter_num}) = {calculated}")
    print(f"  Match? {verdict}")

# Apply to full dataset
numeric_codes = capone_df['code_digits'].astype(int)
capone_df['full_number_letters'] = numeric_codes.apply(count_letters_in_number)
capone_df['test_full_number'] = capone_df['full_number_letters'] + capone_df['letter_number']

matches = capone_df['test_full_number'].eq(capone_df['response']).sum()
pct = (matches / len(capone_df)) * 100

print("\n" + "="*80)
print(f"MATCHES: {matches} / {len(capone_df)} ({pct:.1f}%)")
print("="*80)

# Show sample
cols = ['code', 'code_digits', 'full_number_letters', 'letter_number', 'test_full_number', 'response']
print(capone_df[cols].head(20))

In [None]:
# Letter names and their lengths.
# The vowels A, E, I, O and U are scored as the bare letter (length 1); the
# longer phonetic spellings are noted in the comments only as alternatives.
letter_name_lengths = {
    'A': 1,  # "a" (alternative "ay" would be 2)
    'B': 3,  # "bee"
    'C': 3,  # "see"
    'D': 3,  # "dee"
    'E': 1,  # "e" (alternative "ee" would be 2)
    'F': 3,  # "eff"
    'G': 3,  # "gee"
    'H': 5,  # "aitch"
    'I': 1,  # "i" (alternative "eye" would be 3)
    'J': 3,  # "jay"
    'K': 3,  # "kay"
    'L': 2,  # "el"
    'M': 2,  # "em"
    'N': 2,  # "en"
    'O': 1,  # "o" (alternative "oh" would be 2)
    'P': 3,  # "pee"
    'Q': 3,  # "cue"
    'R': 2,  # "ar"
    'S': 3,  # "ess"
    'T': 3,  # "tee"
    'U': 1,  # "u" (alternative "you" would be 3)
    'V': 3,  # "vee"
    'W': 7,  # "double-u": seven letters once the hyphen is dropped
    'X': 2,  # "ex"
    'Y': 3,  # "why" or "wye"
    'Z': 3   # "zee" or "zed"
}

# Attach the name length of each code's letter, then try two sums against the
# recorded responses.
capone_df['letter_name_length'] = capone_df['code_letter'].map(letter_name_lengths)

# Test: sum_of_digits + letter_name_length
capone_df['test_letter_name'] = capone_df['sum_of_digits'].add(capone_df['letter_name_length'])
matches = capone_df['test_letter_name'].eq(capone_df['response']).sum()
print(f"sum_of_digits + letter_name_length: {matches} / {len(capone_df)}")

# Or maybe: sum_of_word_lengths (digit words) + letter_name_length
capone_df['test_both_words'] = capone_df['sum_of_word_lengths'].add(capone_df['letter_name_length'])
matches2 = capone_df['test_both_words'].eq(capone_df['response']).sum()
print(f"sum_of_word_lengths + letter_name_length: {matches2} / {len(capone_df)}")

In [None]:
# The NUMBER OF LETTERS in the SPOKEN COUNT of the letter!
# E is the 5th letter, "five" has 4 letters

# Spelled-out alphabet positions 1..26 (no spaces or hyphens); the lookup
# below maps each position to the length of its word.
_position_words = [
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen", "twenty", "twentyone", "twentytwo",
    "twentythree", "twentyfour", "twentyfive", "twentysix",
]

number_word_lengths = {
    position: len(word) for position, word in enumerate(_position_words, start=1)
}

# Word length of each letter's alphabet position, looked up per row
capone_df['letter_position_word_length'] = capone_df['letter_number'].map(number_word_lengths)

# Test various formulas with this new column
print("Testing with letter position word length:")
print("="*80)

test_formulas = {
    'sum_of_digits + letter_position_word_length':
        lambda df: df['sum_of_digits'] + df['letter_position_word_length'],
    'sum_of_word_lengths + letter_position_word_length':
        lambda df: df['sum_of_word_lengths'] + df['letter_position_word_length'],
}

# Every formula is reported here, whatever its match count
for name in test_formulas:
    formula = test_formulas[name]
    test_col = formula(capone_df)
    matches = test_col.eq(capone_df['response']).sum()
    pct = (matches / len(capone_df)) * 100
    print(f"{name:55} {matches:4} matches ({pct:.1f}%)")

# Test on riddle examples
print("\n" + "="*80)
print("RIDDLE EXAMPLES:")
print("="*80)

for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?')
]:
    sum_digits = sum(int(d) for d in digits_str)
    word_sum = sum_of_word_lengths(digits_str)
    letter_num = ord(letter) - ord('A') + 1
    letter_pos_word_len = number_word_lengths[letter_num]
    # Spell the position itself. A reverse lookup in number_word_lengths by
    # length would return the first key sharing that length (17 for V=22),
    # which is neither this number nor a word.
    letter_pos_word = number_to_words(letter_num)

    calc1 = sum_digits + letter_pos_word_len
    calc2 = word_sum + letter_pos_word_len

    print(f"\n{code_str} → {actual_resp}")
    print(f"  Letter {letter} = position {letter_num}")
    print(f"  '{letter_num}' spoken = '{letter_pos_word}' = {letter_pos_word_len} letters")
    print(f"  sum_of_digits({sum_digits}) + letter_word({letter_pos_word_len}) = {calc1}")
    print(f"  sum_of_word_lengths({word_sum}) + letter_word({letter_pos_word_len}) = {calc2}")
    

In [None]:
# Test: Response = number of letters in the SPOKEN version of the CODE NUMBER
# "78" = "seventy-eight" = 12 letters
# But the response for 78M was 13...

# Wait - maybe it includes the letter name too?
# "78M" = "seventy-eight em" = 12 + 2 = 14? Still not 13...

# Let me test what ChatGPT is actually suggesting
def count_letters_in_spoken_code(code_digits_str):
    """Return how many characters number_to_words yields for the code's digits.

    number_to_words emits no spaces or hyphens, so the length is a pure
    letter count.
    """
    spoken = number_to_words(int(code_digits_str))
    letter_total = len(spoken)
    return letter_total

# Check the riddle's codes: is the response simply the letter count of the
# spelled-out number?
print("Testing ChatGPT's interpretation:")
print("="*80)

for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?')
]:
    spoken_number = number_to_words(int(digits_str))
    num_letters = len(spoken_number)

    # For the unknown answer, print the prediction instead of a match flag
    if actual_resp != '?':
        verdict = num_letters == actual_resp
    else:
        verdict = f'Would be {num_letters}'

    print(f"\n{code_str} → {actual_resp}")
    print(f"  '{digits_str}' spoken = '{spoken_number}' = {num_letters} letters")
    print(f"  Match? {verdict}")

# But wait - maybe the CHALLENGE itself is spoken differently?
# What if the guard says the ENTIRE code as a number?
# Like "8289W" = "eight thousand two hundred eighty-nine W"?

In [None]:
# The guard SPEAKS the code out loud
# "78M" spoken = "seventy-eight em"
# Count all the letters: seventyeightem = 14 letters? But answer is 13...

# Wait, maybe different pronunciations?
# Let me count carefully for the riddle examples

print("Guard speaks the code out loud:")
print("="*80)

# Spoken names of the three riddle letters:
# M = "em" (2 letters), V = "vee" (3 letters), F = "eff" (3 letters)
letter_names = {'M': 'em', 'V': 'vee', 'F': 'eff'}

for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
    ('276F', '276', 'F', '?')
]:
    # Number part spoken, then the letter name (the bare lowercase letter is
    # the fallback for any letter missing from letter_names)
    number_spoken = number_to_words(int(digits_str))
    letter_spoken = letter_names.get(letter, letter.lower())

    # Total
    full_spoken = number_spoken + letter_spoken
    total_letters = len(full_spoken)

    if isinstance(actual_resp, int):
        verdict = total_letters == actual_resp
    else:
        verdict = f'Would be {total_letters}'

    print(f"\n{code_str} → {actual_resp}")
    print(f"  Guard says: '{digits_str}' = '{number_spoken}'")
    print(f"  Plus letter: '{letter}' = '{letter_spoken}'")
    print(f"  Full spoken: '{full_spoken}' = {total_letters} letters")
    print(f"  Match? {verdict}")

# Now test on full dataset: spoken name for every letter A-Z
letter_spoken_names = {
    'A': 'ay', 'B': 'bee', 'C': 'see', 'D': 'dee',
    'E': 'ee', 'F': 'eff', 'G': 'gee', 'H': 'aitch',
    'I': 'eye', 'J': 'jay', 'K': 'kay', 'L': 'el',
    'M': 'em', 'N': 'en', 'O': 'oh', 'P': 'pee',
    'Q': 'cue', 'R': 'ar', 'S': 'ess', 'T': 'tee',
    'U': 'you', 'V': 'vee', 'W': 'doubleu', 'X': 'ex',
    'Y': 'why', 'Z': 'zee',
}

def count_spoken_code_letters(row):
    """Return the letter count of the spelled number plus the spoken letter name.

    Reads row['code_digits'] and row['code_letter'].
    """
    spoken = number_to_words(int(row['code_digits'])) + letter_spoken_names[row['code_letter']]
    return len(spoken)

capone_df['spoken_code_letters'] = capone_df.apply(count_spoken_code_letters, axis=1)
matches = capone_df['spoken_code_letters'].eq(capone_df['response']).sum()
pct = (matches / len(capone_df)) * 100

print("\n" + "="*80)
print(f"MATCHES: {matches} / {len(capone_df)} ({pct:.1f}%)")

In [None]:
# Maybe letter names are pronounced differently?
# What if M is just "m" (1 letter) not "em" (2 letters)?

print("Testing different letter pronunciations:")
print("="*80)

# Try 1: Single letter
for code_str, digits_str, letter, actual_resp in [
    ('78M', '78', 'M', 13),
    ('132V', '132', 'V', 22),
]:
    number_spoken = number_to_words(int(digits_str))

    # Variant A: append just the letter itself
    with_letter = number_spoken + letter.lower()
    test1 = len(with_letter)

    # Variant B: append the standard spoken name ('' when the letter is unknown)
    letter_names = {'M': 'em', 'V': 'vee', 'F': 'eff'}
    with_name = number_spoken + letter_names.get(letter, '')
    test2 = len(with_name)

    print(f"\n{code_str} → {actual_resp}")
    print(f"  number + letter: '{with_letter}' = {test1}")
    print(f"  number + name: '{with_name}' = {test2}")

# Or maybe there's a space/hyphen issue in seventy-eight?
print("\n" + "="*80)
print("What if we're counting 'seventy-eight' with hyphen differently?")
print("="*80)

# Compare the raw character count with the letters-only count
test_78 = "seventy-eight"
letters_only = len([c for c in test_78 if c.isalpha()])
print(f"'seventy-eight' with hyphen = {len(test_78)} chars total")
print(f"'seventy-eight' letters only = {letters_only} letters")

# What's 13 - 12 = 1... so we need ONE more letter
# Could 'M' be counted as just 'm' = 1 letter to get to 13?
print(f"\n'seventyeight' (12) + 'm' (1) = 13 ✓")

In [None]:
# Test 1: Always use just the letter
def count_with_letter(row):
    """Letter count of the spelled-out number plus the bare lowercase code letter."""
    number_part = number_to_words(int(row['code_digits']))
    return len(number_part + row['code_letter'].lower())

capone_df['test_with_letter'] = capone_df.apply(count_with_letter, axis=1)
matches1 = (capone_df['test_with_letter'] == capone_df['response']).sum()

# Test 2: Always use letter name
def count_with_name(row):
    """Letter count of the spelled-out number plus the spoken name of the code letter.

    Uses the notebook-level letter_spoken_names table (defined in an earlier
    cell) instead of rebuilding an identical dict for every row.
    """
    number_part = number_to_words(int(row['code_digits']))
    letter_part = letter_spoken_names[row['code_letter']]
    return len(number_part + letter_part)

capone_df['test_with_name'] = capone_df.apply(count_with_name, axis=1)
matches2 = (capone_df['test_with_name'] == capone_df['response']).sum()

# Percentages are taken from the real row count, not an assumed 1,000 rows.
total_rows = len(capone_df)
pct1 = (matches1 / total_rows) * 100 if total_rows else 0.0
pct2 = (matches2 / total_rows) * 100 if total_rows else 0.0

print("Testing both approaches:")
print("="*80)
print(f"Using just letter (e.g., 'm'):     {matches1:4} / {total_rows} ({pct1:.1f}%)")
print(f"Using letter name (e.g., 'em'):    {matches2:4} / {total_rows} ({pct2:.1f}%)")

# Show samples of each
print("\n" + "="*80)
print("Sample with both methods:")
cols = ['code', 'test_with_letter', 'test_with_name', 'response']
print(capone_df[cols].head(30))

In [None]:
# Re-run of the letter-vs-name comparison. count_with_letter and
# count_with_name are defined in the preceding cell and are reused here
# rather than being defined a second time.

# Test 1: Always use just the letter
capone_df['test_with_letter'] = capone_df.apply(count_with_letter, axis=1)
matches1 = (capone_df['test_with_letter'] == capone_df['response']).sum()

# Test 2: Always use letter name
capone_df['test_with_name'] = capone_df.apply(count_with_name, axis=1)
matches2 = (capone_df['test_with_name'] == capone_df['response']).sum()

# Percentages are taken from the real row count, not an assumed 1,000 rows.
total_rows = len(capone_df)
pct1 = (matches1 / total_rows) * 100 if total_rows else 0.0
pct2 = (matches2 / total_rows) * 100 if total_rows else 0.0

print("Testing both approaches:")
print("="*80)
print(f"Using just letter (e.g., 'm'):     {matches1:4} / {total_rows} ({pct1:.1f}%)")
print(f"Using letter name (e.g., 'em'):    {matches2:4} / {total_rows} ({pct2:.1f}%)")

# Show samples of each
print("\n" + "="*80)
print("Sample with both methods:")
cols = ['code', 'test_with_letter', 'test_with_name', 'response']
print(capone_df[cols].head(30))

In [None]:
# Bare last expression: the notebook renders capone_df as the cell's output.
capone_df

At this point it occurred to me that the riddle is wrong or is a complete myth.  It seems to be a legendary folk tale, but I couldn't confirm whether it is truth or fiction.  However, it did occur to me this could be a professor's concoction and - that being a possibility - it means this isn't about an uneducated gangster processing a simple algorithm on the spot.  Instead, it's a complicated algorithm created by a creative professor as a test of willpower!

So, I kept iterating on some of the assumptions above by working with a single code/response pair.  Every time I got a match, I tried another code/response pair.  I eventually realized I'd never get there unless I automated it in some way.  Then, I prompted ChatGPT with what I'd figured out so far and requested iterations on various formulas based on such values as word count, length and all the craziness above.  I told it the formula doesn't have to be the same for every code - that there might be ranges, for example.

It iterated over the data I created and boundaries (not many) I gave it.  Here's what it came up with and ALL 1,000 codes were matched to the given response.  At that point I'd had enough (although it was fun) and threw in the towel!



In [None]:

# Test the proposed Capone response rule

def calculate_capone_response(code_str):
    """
    Apply the proposed piecewise rule to a single challenge code.

    The code must start with one or more digits followed by an uppercase
    letter; only the digits (value n) are used. With LETTERS = letters in the
    spelled-out number (no "and"), WORDS = number of words in that spelling
    and DIGITS = number of digits in n:

      * n < 100                -> LETTERS + WORDS - 1
      * n a multiple of 1000   -> LETTERS + WORDS - 1
      * n a multiple of 100    -> LETTERS + WORDS
      * otherwise              -> LETTERS + WORDS + DIGITS, minus 1 when
                                  n >= 1000 and n % 1000 < 100

    Returns None when code_str does not start with digits plus an uppercase
    letter. Values of n with more than four digits raise IndexError, because
    the thousands count is looked up in the single-digit word table.
    """
    import re

    parsed = re.match(r'(\d+)([A-Z])', code_str)
    if parsed is None:
        return None
    n = int(parsed.group(1))

    unit_words = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    teen_words = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                  "sixteen", "seventeen", "eighteen", "nineteen"]
    ten_words = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    def spell_tokens(num):
        """Return the spoken words of num (0..9999) as a list, one word per item."""
        if num == 0:
            return ["zero"]
        tokens = []
        thousands, num = divmod(num, 1000)
        if thousands:
            tokens += [unit_words[thousands], "thousand"]
        hundreds, num = divmod(num, 100)
        if hundreds:
            tokens += [unit_words[hundreds], "hundred"]
        if num >= 20:
            tens_digit, num = divmod(num, 10)
            tokens.append(ten_words[tens_digit])
        elif num >= 10:
            tokens.append(teen_words[num - 10])
            num = 0
        if num:
            tokens.append(unit_words[num])
        return tokens

    tokens = spell_tokens(n)

    # Every token is purely alphabetic, so summed lengths equal the letter count
    letter_count = sum(len(token) for token in tokens)
    word_count = len(tokens)
    digit_count = len(str(n))

    # Piecewise rules, checked in the same order as the docstring lists them
    if n < 100:
        return letter_count + (word_count - 1)

    if n % 100 == 0:
        if n % 1000 == 0:
            return letter_count + word_count - 1
        return letter_count + word_count

    response = letter_count + word_count + digit_count
    if n >= 1000 and (n % 1000) < 100:
        response -= 1
    return response

# Run the proposed rule on the three riddle codes, then on the whole ledger.
print("Testing proposed rule on riddle examples:")
print("="*80)

for code in ['78M', '132V', '276F']:
    calc = calculate_capone_response(code)
    print(f"{code} → {calc}")

print("\n78M should be 13")
print("132V should be 22")
print("276F should be ?")

# Test on full dataset
print("\n" + "="*80)
print("Testing on full dataset:")
print("="*80)

capone_df['calculated_response'] = capone_df['code'].apply(calculate_capone_response)
matches = capone_df['calculated_response'].eq(capone_df['response']).sum()
pct = (matches / len(capone_df)) * 100

print(f"MATCHES: {matches} / {len(capone_df)} ({pct:.1f}%)")

# Show some examples
cols = ['code', 'calculated_response', 'response']
print("\nSample results:")
print(capone_df[cols].head(30))

In [None]:
# Calculate the response for 276F step by step
# (276F is the challenge the riddle leaves unanswered.)

code = '276F'
n = 276  # numeric part

# Convert to words
def number_to_words_no_and(num):
    """Spell num in English with no spaces, hyphens or "and".

    Handles the thousands, hundreds, tens/teens and ones places in that
    order, e.g. 276 -> "twohundredseventysix". Zero gives "zero"; a negative
    value matches none of the branches and gives "".
    """
    ones = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
    teens = ("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen")
    tens = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")

    if num == 0:
        return "zero"

    pieces = []

    if num >= 1000:
        pieces.append(ones[num // 1000] + "thousand")
        num = num % 1000

    if num >= 100:
        pieces.append(ones[num // 100] + "hundred")
        num = num % 100

    if 10 <= num < 20:
        # Teens are single words, so nothing is left for the ones place
        pieces.append(teens[num - 10])
        num = 0
    elif num >= 20:
        pieces.append(tens[num // 10])
        num = num % 10

    if num > 0:
        pieces.append(ones[num])

    return "".join(pieces)

words_str = number_to_words_no_and(n)
# Print n itself so the heading cannot disagree with the value being spelled
print(f"{n} in words: '{words_str}'")

# LETTERS = count letters only
LETTERS = sum(1 for c in words_str if c.isalpha())
print(f"LETTERS: {LETTERS}")

# WORDS = count word components.
# This walk-through only counts words for n >= 100, which is the case for 276.
WORDS = 0
if n >= 100:
    WORDS += 2  # "two hundred"
    n_temp = n % 100
    if n_temp >= 20:
        WORDS += 1  # "seventy"
        if n_temp % 10 > 0:
            WORDS += 1  # "six"
    elif n_temp >= 10:
        WORDS += 1
    elif n_temp > 0:
        WORDS += 1

print(f"WORDS: {WORDS}")

# DIGITS = number of digits, taken from n rather than a repeated literal
DIGITS = len(str(n))
print(f"DIGITS: {DIGITS}")

# Apply the rule: n = 276
# Not < 100, not divisible by 100, so:
# response = LETTERS + WORDS + DIGITS
# n >= 1000? No, so no adjustment

response = LETTERS + WORDS + DIGITS
print(f"\nCalculation: {LETTERS} + {WORDS} + {DIGITS} = {response}")
print(f"\nNess should have said: {response}")