In [1]:
import pandas as pd
import numpy as np
import string
import math
from collections import Counter

# Load the dataset
# NOTE(review): the "/content/..." prefix looks like a Google Colab location — confirm before running elsewhere.
file_path = "/content/cryptography_dataset_enhanced.csv"  # Update with your actual file path
# The feature-extraction code below reads the "Ciphertext" and "Algorithm" columns of this frame.
df = pd.read_csv(file_path)

# Function to calculate entropy
def calculate_entropy(text):
    """Return the Shannon entropy of *text*, in bits per character.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency within *text*. Empty input yields ``0``.
    """
    if len(text) == 0 if isinstance(text, str) else not text:
        return 0

    length = len(text)
    # Relative frequency of every distinct character, in first-seen order.
    probabilities = [occurrences / length for occurrences in Counter(text).values()]
    return -sum(p * math.log2(p) for p in probabilities)

# Function to determine Hex vs Base64 ratio
def hex_base64_ratio(text):
    """Return ``(hex_ratio, base64_ratio)`` for *text*.

    ``hex_ratio`` is the share of characters that are hexadecimal digits,
    compared case-insensitively. ``base64_ratio`` is the share of characters
    drawn from ASCII letters, digits and ``+``, ``/``, ``=``. Both ratios are
    ``0.0`` for empty input.
    """
    hex_alphabet = set(string.hexdigits.lower())
    base64_alphabet = set(string.ascii_letters + string.digits + "+/=")

    # Hex membership is checked on the lower-cased text so A-F count as well.
    hex_hits = 0
    for symbol in text.lower():
        if symbol in hex_alphabet:
            hex_hits += 1

    base64_hits = len([symbol for symbol in text if symbol in base64_alphabet])

    # An empty string would divide by zero; fall back to a denominator of 1.
    denominator = len(text) or 1
    return hex_hits / denominator, base64_hits / denominator

# Function to compute repetition rate
def repetition_rate(text):
    """Return the fraction of adjacent character pairs that are identical.

    Inputs shorter than two characters have no adjacent pairs and yield ``0``.
    """
    pair_count = len(text) - 1
    if pair_count < 1:
        return 0

    # Pair every character with its right-hand neighbour.
    repeated = sum(1 for left, right in zip(text, text[1:]) if left == right)
    return repeated / pair_count

# Extracting features
# pd.read_csv turns empty cells into NaN (a float). len() and the
# character-level helpers would raise TypeError on such a value, so missing
# ciphertexts are treated as empty strings; the helpers' empty-input guards
# then produce zeros for those rows.
ciphertexts = df["Ciphertext"].fillna("")

df_features = pd.DataFrame()
# The original column is stored untouched so the saved CSV keeps the raw values.
df_features["Ciphertext"] = df["Ciphertext"]
df_features["Ciphertext Length"] = ciphertexts.apply(len)
df_features["Entropy"] = ciphertexts.apply(calculate_entropy)

# hex_base64_ratio returns a (hex, base64) pair per row. Splitting the pairs
# with comprehensions (rather than tuple-unpacking zip(*...)) also works when
# the dataset has zero rows.
ratio_pairs = [hex_base64_ratio(text) for text in ciphertexts]
df_features["Hex Ratio"] = [hex_ratio for hex_ratio, _ in ratio_pairs]
df_features["Base64 Ratio"] = [base64_ratio for _, base64_ratio in ratio_pairs]

df_features["Repetition Rate"] = ciphertexts.apply(repetition_rate)
df_features["Alphabetic Ratio"] = ciphertexts.apply(lambda x: sum(c.isalpha() for c in x) / len(x) if x else 0)
df_features["Numeric Ratio"] = ciphertexts.apply(lambda x: sum(c.isdigit() for c in x) / len(x) if x else 0)

# Adding the target label
df_features["Algorithm"] = df["Algorithm"]

# Save the new dataset
output_file_path = "crypto_features_dataset.csv"
df_features.to_csv(output_file_path, index=False)

print("Feature extraction completed. New dataset saved as", output_file_path)


Feature extraction completed. New dataset saved as crypto_features_dataset.csv
