##

# **Imports**

In [1]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import re

# **Reading Our Dataset**
Our original dataset contains the following data
  - Original Text
  - Cipher Algorithm
  - Cipher Algorithm ID
  - Ciphertext
  - Decrypted Plaintext
  - Key
  - Index of Coincidence
  - Has Letter J?
  

In [2]:
# Load the labelled cipher dataset into the frame every later cell builds on.
# NOTE(review): absolute path (looks like a Colab "/content" mount) — adjust when running elsewhere.
df = pd.read_csv("/content/lock-spot-cipher-dataset.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Original Text,Cipher Algorithm,Cipher Algorithm ID,Ciphertext,Decrypted Plaintext,Key,Index of Coincidence,Has Letter J?
0,BEGINNERSBBQCLASSTAKINGPLACEINMISSOULADOYOUWAN...,ADFGVX Cipher,1,DFDDDAFFVADFAFAFXAGDGADAXFFFAAAFVAXADGVFGGGAVG...,BEGINNERSBBQCLASSTAKINGPLACEINMISSOULADOYOUWAN...,LESTR FLOCN,0.207878,0
1,DISCUSSIONINMACOSXLION107STARTEDBYAXBOI87JAN20...,ADFGVX Cipher,1,DDAFGGFFDADVAXFAGAFDVAVAXFGVFGAVXGXXDFAXFDDGXF...,DISCUSSIONINMACOSXLION107STARTEDBYAXBOI87JAN20...,DACRYOM MULCTARY,0.192830,0
2,FOILPLAIDLYCRAANDSPANDEXSHORTALLWITHMETALLICSL...,ADFGVX Cipher,1,DFVAGFADFGGAVDAAGAAAVAVAVGFDFAAGFDGVGAFFFDDVAD...,FOILPLAIDLYCRAANDSPANDEXSHORTALLWITHMETALLICSL...,CLEAV PARLNDO,0.212596,0
3,HOWMANYBACKLINKSPERDAYFORNEWSITEDISCUSSIONINBL...,ADFGVX Cipher,1,AFVDDGAVAGADGDDFFAADAVADDVVADGDDDAAGAGDDGAVAFA...,HOWMANYBACKLINKSPERDAYFORNEWSITEDISCUSSIONINBL...,THERFO FILTHY,0.192140,0
4,THEDENVERBOARDOFEDUCATIONOPENEDTHE201718SCHOOL...,ADFGVX Cipher,1,DDFGGDGGXFGAFAFVGFDAVDFDDAAADDDDFAVADDDGADGAFA...,THEDENVERBOARDOFEDUCATIONOPENEDTHE201718SCHOOL...,GINSE CAPIS,0.198425,0
...,...,...,...,...,...,...,...,...
49996,TOPICTURESUNBELIEVABLEFRANKENSTEINSMONSTERTHEB...,Vigenère Cipher,15,THGWRBURXJICJELBVJPJLEYIOCSENLKSXVSMHEGIMRTAVP...,TOPICTURESUNBELIEVABLEFRANKENSTEINSMONSTERTHEB...,ATROPIA,0.036506,1
49997,RAREFINDQUALITYCUSTOMBUILTALLBRICKRANCHHOMEWIT...,Vigenère Cipher,15,JHZLXPVKIBISAAGJMZBVEICPDAISDIZPURZHFJPOGTMDAA...,RAREFINDQUALITYCUSTOMBUILTALLBRICKRANCHHOMEWIT...,SHIH,0.043864,1
49998,WHATWILLTHEUKBUDGET2016MEANFORPROPERTYINVESTOR...,Vigenère Cipher,15,AUGEENJPGNPCPZYQMPB2016RCEALZZUPSCKCBDGRIKDBTP...,WHATWILLTHEUKBUDGET2016MEANFORPROPERTYINVESTOR...,ENGLIFY,0.039174,1
49999,THEJERUSALEMPOSTREPORTS20ISRAELIACADEMICHAVEEN...,Vigenère Cipher,15,UVVAIICSBZVDTFATSSGFVKA20ITFRVPZICBRVDMTPAWSVE...,THEJERUSALEMPOSTREPORTS20ISRAELIACADEMICHAVEEN...,BORRERIA,0.041790,1


# **Adding Other Valuable Columns**
  - Ciphertext Contain Digits? - returns 0 if the ciphertext doesn't contain any digits, otherwise 1.
  - Has Double Letters or Numbers? - returns 1 if the ciphertext contains the same character twice in a row (a doubled letter or digit), otherwise 0.
  - Frequency of A-Z and 0-9 - the relative frequency (count divided by ciphertext length) of each of the 36 characters.

In [11]:
# Binary feature: 1 when the ciphertext matches the regex \d at least once, else 0.
df["Ciphertext Contain Digits?"] = df["Ciphertext"].str.contains(r"\d").astype(int)

In [12]:
def has_double_letters_or_numbers(text):
  """Return 1 if `text` contains two identical adjacent characters, else 0.

  Args:
    text: The string to scan (any characters; comparison is case-sensitive).

  Returns:
    1 when some character is immediately followed by the same character,
    0 otherwise (including for empty and single-character strings).
  """
  # Pair every character with its successor. Comparing with the *previous*
  # index starting at 0 would wrap around to text[-1] and skip the last pair.
  for current, following in zip(text, text[1:]):
    if current == following:
      return 1
  return 0

In [13]:
# Binary feature: apply has_double_letters_or_numbers to every ciphertext row.
df['Has Double Letters or Numbers?'] = df["Ciphertext"].apply(has_double_letters_or_numbers)

In [14]:
# Fixed alphabet (26 letters followed by 10 digits) that defines the order of
# the frequency features.
ALNUM_CHARACTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

def get_frequency_vector(text):
  """Return the relative frequency of each ALNUM_CHARACTERS symbol in `text`.

  Characters are upper-cased before counting, so the result is
  case-insensitive. Each count is divided by the full length of `text`
  (all characters, not only alphanumeric ones).

  Args:
    text: The string to analyse.

  Returns:
    A list of 36 floats, one per character of ALNUM_CHARACTERS, in that order.
    An empty `text` yields all zeros instead of raising ZeroDivisionError.
  """
  length = len(text)
  if length == 0:
    # No characters at all: every frequency is zero by definition.
    return [0.0] * len(ALNUM_CHARACTERS)
  # Upper-case per character (as the original loop did); symbols outside
  # ALNUM_CHARACTERS are counted but never looked up below.
  frequencies = Counter(c.upper() for c in text)
  return [frequencies[c] / length for c in ALNUM_CHARACTERS]

def get_frequency_vector_columns():
  """Return the column names for the frequency features.

  Returns:
    A list with one "Frequency_<char>" name per character of
    ALNUM_CHARACTERS, in the same order as that constant.
  """
  return [f"Frequency_{symbol}" for symbol in ALNUM_CHARACTERS]

In [7]:
# One frequency list per ciphertext row (see get_frequency_vector).
freq_vectors = df["Ciphertext"].apply(get_frequency_vector)
# Expand the lists into one column per character, named by get_frequency_vector_columns().
freq_df = pd.DataFrame(freq_vectors.tolist(), columns=get_frequency_vector_columns())
# Bare last expression so the notebook renders the frame.
freq_df

Unnamed: 0,Frequency_A,Frequency_B,Frequency_C,Frequency_D,Frequency_E,Frequency_F,Frequency_G,Frequency_H,Frequency_I,Frequency_J,...,Frequency_0,Frequency_1,Frequency_2,Frequency_3,Frequency_4,Frequency_5,Frequency_6,Frequency_7,Frequency_8,Frequency_9
0,0.333055,0.000000,0.000000,0.163606,0.000000,0.199499,0.112688,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
1,0.235596,0.000000,0.000000,0.188240,0.000000,0.233623,0.189424,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
2,0.268456,0.000000,0.000000,0.211409,0.000000,0.208054,0.221477,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
3,0.276210,0.000000,0.000000,0.215054,0.000000,0.182796,0.137769,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
4,0.260580,0.000000,0.000000,0.187424,0.000000,0.200726,0.208585,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49996,0.021053,0.021053,0.042105,0.042105,0.073684,0.021053,0.042105,0.042105,0.052632,0.042105,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000
49997,0.071682,0.029808,0.015614,0.029099,0.009936,0.019163,0.024840,0.044003,0.029808,0.040454,...,0.007807,0.007807,0.006388,0.001419,0.004968,0.005678,0.002839,0.002839,0.000710,0.00071
49998,0.032482,0.041943,0.045096,0.015453,0.040366,0.025544,0.046042,0.017660,0.037528,0.036897,...,0.006938,0.005676,0.004415,0.000946,0.000946,0.001577,0.001261,0.000946,0.000946,0.00000
49999,0.035411,0.033994,0.039660,0.021246,0.041076,0.062323,0.033994,0.021246,0.076487,0.036827,...,0.007082,0.001416,0.001416,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000


In [15]:
# Feature matrix: the four scalar features from df side by side with the per-character frequencies.
# The column order here must match the frame built in process_user_input.
X = pd.concat([df[["Index of Coincidence", "Has Letter J?", "Ciphertext Contain Digits?", "Has Double Letters or Numbers?"]], freq_df], axis=1)
# Target: cipher names encoded as integer class labels; the encoder is kept to map predictions back to names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["Cipher Algorithm"])

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Random forest with 50 trees and a fixed seed, fitted on the training split only.
random_forest_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest_classifier.fit(X_train, y_train)

In [18]:
# Evaluate on the held-out split: overall accuracy plus per-class precision/recall/F1.
# The report rows are the integer labels produced by label_encoder, not cipher names.
y_pred = random_forest_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8603139686031397
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       685
           1       0.50      0.55      0.52       679
           2       0.93      0.90      0.92       679
           3       1.00      1.00      1.00       644
           4       0.84      0.80      0.82       671
           5       1.00      1.00      1.00       677
           6       0.82      0.88      0.85       680
           7       0.94      0.95      0.95       636
           8       0.92      0.96      0.94       701
           9       0.95      0.93      0.94       704
          10       0.99      1.00      0.99       649
          11       0.95      0.94      0.95       645
          12       0.80      0.88      0.84       640
          13       0.51      0.46      0.48       679
          14       0.76      0.66      0.71       632

    accuracy                           0.86     10001
   macro avg       0.86      0.86      0.86     100

In [19]:
def get_ioc(text):
  """Compute the index of coincidence of `text`.

  The numerator is the sum of f * (f - 1) over the count f of each distinct
  alphabetic character (case-sensitive); the denominator is n * (n - 1) where
  n is the full length of `text`.

  NOTE(review): the denominator counts every character (digits included) while
  the numerator counts letters only; confirm this matches how the dataset's
  "Index of Coincidence" column was computed before changing it.

  Args:
    text: The string to analyse.

  Returns:
    The index of coincidence as a float. Texts with fewer than two characters
    return 0.0 instead of raising ZeroDivisionError.
  """
  text_length = len(text)
  if text_length < 2:
    # n * (n - 1) would be zero; no pair of characters exists to coincide.
    return 0.0
  character_frequencies = Counter(c for c in text if c.isalpha())
  numerator = sum(
    frequency * (frequency - 1) for frequency in character_frequencies.values()
  )
  denominator = text_length * (text_length - 1)
  return numerator / denominator


def process_user_input(text):
  """Turn raw user text into the single-row feature frame the classifier expects.

  The text is upper-cased and every character outside A-Z / 0-9 is removed
  before the features are computed, so lowercase input is kept rather than
  stripped.

  Args:
    text: The raw ciphertext typed or pasted by the user.

  Returns:
    A one-row pandas DataFrame with the columns "Index of Coincidence",
    "Has Letter J?", "Ciphertext Contain Digits?",
    "Has Double Letters or Numbers?" followed by the frequency columns from
    get_frequency_vector_columns(), in that order.

  Raises:
    ValueError: If fewer than two alphanumeric characters remain after
      cleaning (the index of coincidence is undefined for such input).
  """
  # Apply the regex to the upper-cased text; running it on the raw input would
  # discard the upper-casing and delete every lowercase letter.
  clean_text = re.sub(r"[^A-Z0-9]", '', text.upper())
  if len(clean_text) < 2:
    raise ValueError(
      "Ciphertext must contain at least two alphanumeric characters (A-Z, 0-9)."
    )
  text_have_letter_j = int("J" in clean_text)
  text_contain_digits = int(any(c.isdigit() for c in clean_text))
  ioc = get_ioc(clean_text)
  # Reuse the training-time helper so inference features match the training features.
  frequency_vectors = get_frequency_vector(clean_text)
  contains_double_letters_or_numbers = has_double_letters_or_numbers(clean_text)
  scalar_columns = ["Index of Coincidence", "Has Letter J?", "Ciphertext Contain Digits?", "Has Double Letters or Numbers?"]
  scalar_values = [ioc, text_have_letter_j, text_contain_digits, contains_double_letters_or_numbers]
  return pd.DataFrame(
    [scalar_values + frequency_vectors],
    columns=scalar_columns + get_frequency_vector_columns(),
  )

def fill_in_circle(prob, total=10):
    """Render `prob` as a bar of filled and empty circles.

    Args:
        prob: The value to display; it is scaled by `total` and rounded.
        total: Number of circles in the bar (default 10).

    Returns:
        A string of round(prob * total) filled circles followed by the
        remaining empty circles.
    """
    solid_count = round(prob * total)
    hollow_count = total - solid_count
    return "".join(["●" * solid_count, "○" * hollow_count])


def predict_cipher_algorithm_used(user_text):
    """Print a confidence report of the cipher algorithms predicted for `user_text`.

    The text is converted to features with process_user_input, scored with the
    module-level `random_forest_classifier`, and every class is printed with
    its probability, highest first.

    Args:
        user_text: The raw ciphertext to classify.

    Returns:
        None. The report is written to standard output.
    """
    features = process_user_input(user_text)

    probs = random_forest_classifier.predict_proba(features)[0]
    # predict_proba columns follow the classifier's classes_ attribute; decode
    # those labels directly rather than assuming they are 0..len(probs)-1.
    labels = label_encoder.inverse_transform(random_forest_classifier.classes_)

    print("\n🔓 LockSpot Cipher Algorithm Confidence Report\n")

    for label, prob in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True):
        print(f"{label:<25} {fill_in_circle(prob)} {prob*100:5.1f}%")


In [20]:
# Load the sample ciphertexts used to demo the classifier.
# NOTE(review): absolute path (looks like a Colab "/content" mount) — adjust when running elsewhere.
cipher_text_samples_df = pd.read_csv("/content/ciphertext-samples.csv")
# Bare last expression so the notebook renders the frame.
cipher_text_samples_df

Unnamed: 0,Cipher Algorithm,Ciphertext
0,Baconian Cipher,BAABAAABBBAABAABBBAAABBBBBAABBABAAAAAABAABAABB...
1,Porta Cipher,WA N FZEG PVDM SVYYUP JVGU GYLFPEQDSEF ELNAGF ...
2,Autokey Cipher,K FTGKRK ZIFW SZRC WKERTD LELNJ OK ZPSPCK CDPU...
3,AMSCO Cipher,A ADWA HEPNGDPLIGSEEN F OTHGEDAETD CT T LELYE...
4,Affine Cipher,VW SAP FCJWU GVECJCT ERRLX XSRRU WPJSGT RW XAP...
5,Bifid Cipher,1251121344223543111344134143111221121543351313...
6,RailFence Cipher,DBES YFANTNDOWALTGDKOTI ITHALENU ACR EDFK OG...
7,Atbash Cipher,RM GSV YFHGORMT NZIPVG KVLKOV HSZIVW HGLIRVH Z...
8,Beaufort Cipher,BY HVG ZQYFWLSP RXP QHBRXK HUWNZLI AATXMA ZIH ...
9,Caesar Cipher,ZQNEJC PDA OYEAJPEBEY ATLAZEPEKJ NAOAWNYDANO K...


In [21]:
# Print a confidence report for every sample, headed by the cipher named in its row.
for _, row in cipher_text_samples_df.iterrows():
    print(row['Cipher Algorithm'] + " Sample")
    predict_cipher_algorithm_used(row['Ciphertext'])
    # Blank line between consecutive reports.
    print()

Baconian Cipher Sample

🔓 LockSpot Cipher Algorithm Confidence Report

Baconian Cipher           ●●●●●●●●●● 100.0%
ADFGVX Cipher             ○○○○○○○○○○   0.0%
AMSCO Cipher              ○○○○○○○○○○   0.0%
Affine Cipher             ○○○○○○○○○○   0.0%
Atbash Cipher             ○○○○○○○○○○   0.0%
Autokey Cipher            ○○○○○○○○○○   0.0%
Beaufort Cipher           ○○○○○○○○○○   0.0%
Bifid Cipher              ○○○○○○○○○○   0.0%
Caesar Cipher             ○○○○○○○○○○   0.0%
Gronsfeld Cipher          ○○○○○○○○○○   0.0%
Playfair Cipher           ○○○○○○○○○○   0.0%
Polybius Cipher           ○○○○○○○○○○   0.0%
Porta Cipher              ○○○○○○○○○○   0.0%
RailFence Cipher          ○○○○○○○○○○   0.0%
Vigenère Cipher           ○○○○○○○○○○   0.0%

Porta Cipher Sample

🔓 LockSpot Cipher Algorithm Confidence Report

Porta Cipher              ●●●●●●●○○○  74.0%
Vigenère Cipher           ●○○○○○○○○○  10.0%
Beaufort Cipher           ●○○○○○○○○○   8.0%
Affine Cipher             ○○○○○○○○○○   2.0%
Autokey Cipher          