In [None]:
!pip install -q pandas sklearn cryptography gensim xgboost nltk

import pandas as pd
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding as crypto_padding
import base64
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string

# Download the NLTK resources this notebook relies on: the 'stopwords' corpus
# is read by clean_text(); 'punkt' is presumably the tokenizer data needed by
# word_tokenize (confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load every split of the 20 Newsgroups corpus, asking scikit-learn to drop
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# One row per post: the post text and its numeric target.
data = dict(text=newsgroups.data, label=newsgroups.target)
df = pd.DataFrame(data)


In [None]:
# Patterns and tables are built once instead of on every clean_text() call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_DIGITS_PATTERN = re.compile(r'\d+')
_WHITESPACE_PATTERN = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled lazily so that defining this cell does not require the NLTK
# stop-word corpus to be present yet.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, building it on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text):
    """Normalise one document for the downstream embedding models.

    Steps, in order: strip URL-like tokens, delete ASCII punctuation,
    lowercase, delete digit runs, collapse whitespace runs to one space,
    tokenize with ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving tokens joined by single spaces (may be empty).

    Note:
        Punctuation is removed before the stop-word filter runs, so any
        stop-word entry that contains an apostrophe cannot match a token.
        This mirrors the original behaviour and is left unchanged.
    """
    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove punctuation and convert to lowercase
    text = text.translate(_PUNCTUATION_TABLE).lower()

    # Remove digits and extra spaces
    text = _DIGITS_PATTERN.sub('', text)
    text = _WHITESPACE_PATTERN.sub(' ', text)

    # Remove stopwords (the set is cached rather than rebuilt per document)
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Store the normalised version of every post next to the raw text.
df['clean_text'] = df['text'].map(clean_text)


In [None]:
def generate_aes_cipher(key):
    """Create an AES cipher in CBC mode for ``key``.

    A 16-byte IV is drawn from ``os.urandom`` inside this function and is
    only held by the returned cipher object; it is not returned separately.

    Args:
        key: Raw AES key bytes.

    Returns:
        A ``Cipher`` from which encryptors and decryptors can be created.
    """
    aes_algorithm = algorithms.AES(key)
    cbc_mode = modes.CBC(os.urandom(16))
    return Cipher(aes_algorithm, cbc_mode)

def encrypt_text_aes(text, cipher):
    """Encrypt ``text`` one whitespace-separated word at a time.

    Each word is UTF-8 encoded, PKCS7-padded to the 128-bit block size,
    encrypted with a fresh encryptor obtained from ``cipher`` and then
    base64-encoded.

    Args:
        text: Plain text; split on whitespace.
        cipher: Cipher object providing ``encryptor()``.

    Returns:
        The base64 tokens joined by single spaces, in the original word order.
    """
    def _encrypt_token(token):
        # Padders and encryptors are created anew for every word.
        pad = crypto_padding.PKCS7(128).padder()
        padded = pad.update(token.encode()) + pad.finalize()
        enc = cipher.encryptor()
        ciphertext = enc.update(padded) + enc.finalize()
        return base64.b64encode(ciphertext).decode()

    return ' '.join(_encrypt_token(token) for token in text.split())

def decrypt_text_aes(encrypted_text, cipher):
    """Invert ``encrypt_text_aes``: decrypt each space-separated base64 token.

    Args:
        encrypted_text: Space-separated base64 tokens.
        cipher: Cipher object providing ``decryptor()``.

    Returns:
        The decrypted words joined by single spaces.
    """
    def _decrypt_token(token):
        # A new decryptor and unpadder are used for each token.
        dec = cipher.decryptor()
        raw = base64.b64decode(token.encode())
        padded = dec.update(raw) + dec.finalize()
        unpad = crypto_padding.PKCS7(128).unpadder()
        return (unpad.update(padded) + unpad.finalize()).decode()

    return ' '.join(_decrypt_token(token) for token in encrypted_text.split())


# Draw a random 32-byte AES key. It is regenerated on every run and is not
# saved anywhere, so ciphertexts from different runs are not comparable.
key = os.urandom(32)
aes_cipher = generate_aes_cipher(key)

# Encrypt the cleaned text column, reusing the same cipher object for every row
df['encrypted_clean_text'] = df['clean_text'].apply(encrypt_text_aes, args=(aes_cipher,))


In [None]:
# Expose the encrypted corpus under the 'encrypted_text' column name, which
# the Doc2Vec cell reads.
df['encrypted_text'] = df['encrypted_clean_text']
# NOTE(review): this replaces the raw posts in 'text' with their cleaned
# version, so the raw text is no longer available from df after this cell.
df['text'] = df['clean_text'] 

In [None]:
# Show the (cleaned) text stored for the row labelled 3.
df.loc[3, 'text']

'think scsi card dma transfers disks scsi card dma transfers containing data scsi devices attached wants important feature scsi ability detach device frees scsi bus devices typically used multitasking os start transfers several devices device seeking data bus free commands data transfers devices ready transfer data aquire bus send data ide bus start transfer bus busy disk seeked data transfered typically ms second lock processes wanting bus irrespective transfer time'

In [None]:
# Show the encrypted counterpart of the same row.
df.loc[3, 'encrypted_clean_text']

'edZsW1Ula3DzTBorRi7FAQ== Fn/7qJof921DPyhVL2BxjA== k5AtJ2+Xrmfixk5ehdT4ww== Gop/MvXvzZIUi4HmRwKR7w== v5/XYFcDqGbZLLFB3XGgEw== +GiUko50R3wwH91IA8OycA== Fn/7qJof921DPyhVL2BxjA== k5AtJ2+Xrmfixk5ehdT4ww== Gop/MvXvzZIUi4HmRwKR7w== v5/XYFcDqGbZLLFB3XGgEw== xie9cF2Pk6H58Fq7rcwbww== QldytYPn3keXa98BV0532Q== Fn/7qJof921DPyhVL2BxjA== GsFz1vyO+L0Hh7YS0XPNow== nbrcCJhkfrafIVndsiBq0g== H8YU4j5d0UEqUKdQ69yY3g== C1fjgyO8trLXRpce9FUF8Q== NlD5/xzyrwiMgKLwmC/I0A== Fn/7qJof921DPyhVL2BxjA== imvAo/eQencXppAR7WvE3g== EXV9IfwBJy5thnSdrtppEA== kottlm3WMmWPNiB9xx716A== KNW71diBPNIPhzM+sz9eig== Fn/7qJof921DPyhVL2BxjA== /SGLuTh2c5zWfMLGqoayxA== GsFz1vyO+L0Hh7YS0XPNow== BhUsKnLy/HZzC2aTzYtmrg== MsToCL/rw7k6Lsd+s/oeug== gYstncRSgUFT8Alm7f+v7w== +KeghL3BZR/h8AQCMu6CLg== afok14QU6EzDD95GahEa6w== v5/XYFcDqGbZLLFB3XGgEw== +oQ/v/JurQyWOEPoMIcN+g== GsFz1vyO+L0Hh7YS0XPNow== kottlm3WMmWPNiB9xx716A== W00197GKgQ/D9Kd/fnZA+A== QldytYPn3keXa98BV0532Q== /SGLuTh2c5zWfMLGqoayxA== Qap+BybNw/nDtdv0UYq/Vg== dQqJ7stzJKJF/l0UpexoEQ==

In [None]:
def create_tagged_documents(texts):
    """Wrap each text as a gensim ``TaggedDocument`` tagged with its position.

    Args:
        texts: Iterable of strings; each is tokenized with ``word_tokenize``.

    Returns:
        A list of ``TaggedDocument`` objects whose single tag is the
        zero-based position of the text in ``texts``.
    """
    # TaggedDocument is not imported anywhere earlier in the notebook, so the
    # import lives here to keep the function usable on a fresh kernel.
    from gensim.models.doc2vec import TaggedDocument

    return [
        TaggedDocument(word_tokenize(text), [position])
        for position, text in enumerate(texts)
    ]

# `gensim` is referenced below but was never imported earlier in the notebook;
# without this line the cell raises NameError on a fresh kernel.
import gensim

# Tagged corpora for the plain and the encrypted versions of the text.
normal_documents = create_tagged_documents(df['text'])
encrypted_documents = create_tagged_documents(df['encrypted_text'])

# One Doc2Vec model per corpus, trained with identical hyper-parameters so the
# two representations can be compared.
normal_doc2vec_model = gensim.models.Doc2Vec(normal_documents, vector_size=100, window=5, min_count=1, workers=4, epochs=10)
encrypted_doc2vec_model = gensim.models.Doc2Vec(encrypted_documents, vector_size=100, window=5, min_count=1, workers=4, epochs=10)


In [None]:
# Infer one vector per document from each trained model and stack the results
# into an array (one row per document).
normal_doc_vectors = np.array([
    normal_doc2vec_model.infer_vector(tagged.words)
    for tagged in normal_documents
])
encrypted_doc_vectors = np.array([
    encrypted_doc2vec_model.infer_vector(tagged.words)
    for tagged in encrypted_documents
])


In [None]:
def train_and_evaluate_xgboost(X, y):
    """Fit an XGBoost classifier on an 80/20 split and print its test report.

    Args:
        X: Feature matrix, one row per sample.
        y: Class labels aligned with the rows of ``X``.

    Returns:
        None. The classification report for the held-out 20% is printed.
    """
    # These names are needed here but, in notebook order, XGBClassifier is
    # never imported and the two sklearn helpers are only imported in a later
    # cell; importing locally keeps this cell runnable on a fresh kernel.
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier

    # Fixed random_state so repeated calls split the rows identically.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    xgb_classifier.fit(X_train, y_train)
    y_pred = xgb_classifier.predict(X_test)

    print(classification_report(y_test, y_pred))

# Train and evaluate the XGBoost classifiers, first on the plain-text vectors
# and then on the encrypted-text vectors, against the same labels.
xgb_runs = (
    ("Results for normal dataset:", normal_doc_vectors),
    ("\nResults for encrypted dataset:", encrypted_doc_vectors),
)
for heading, doc_vectors in xgb_runs:
    print(heading)
    train_and_evaluate_xgboost(doc_vectors, df['label'])


Results for normal dataset:




              precision    recall  f1-score   support

           0       0.34      0.37      0.35       151
           1       0.49      0.52      0.50       202
           2       0.42      0.49      0.45       195
           3       0.33      0.42      0.37       183
           4       0.36      0.32      0.34       205
           5       0.69      0.67      0.68       215
           6       0.57      0.54      0.55       193
           7       0.47      0.49      0.48       196
           8       0.38      0.54      0.44       168
           9       0.54      0.54      0.54       211
          10       0.63      0.56      0.59       198
          11       0.64      0.62      0.63       201
          12       0.39      0.40      0.39       202
          13       0.68      0.66      0.67       194
          14       0.65      0.61      0.63       189
          15       0.63      0.70      0.66       202
          16       0.57      0.53      0.55       188
          17       0.71    



              precision    recall  f1-score   support

           0       0.33      0.35      0.34       151
           1       0.50      0.53      0.52       202
           2       0.44      0.50      0.47       195
           3       0.36      0.43      0.39       183
           4       0.38      0.34      0.35       205
           5       0.71      0.67      0.69       215
           6       0.51      0.54      0.53       193
           7       0.44      0.44      0.44       196
           8       0.36      0.51      0.42       168
           9       0.62      0.57      0.59       211
          10       0.58      0.58      0.58       198
          11       0.65      0.58      0.61       201
          12       0.43      0.41      0.42       202
          13       0.65      0.69      0.67       194
          14       0.64      0.65      0.64       189
          15       0.59      0.67      0.63       202
          16       0.55      0.59      0.57       188
          17       0.65    

LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Reshape
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# Shorter aliases for the two trained Doc2Vec models used in the LSTM section.
model_clean, model_encrypted = normal_doc2vec_model, encrypted_doc2vec_model
# Class labels as stored in the dataframe (rebound to a one-hot matrix later).
labels = df['label']

In [None]:
import numpy as np

def get_doc2vec_embeddings(model, texts):
    """Represent each text by the mean of its known word vectors.

    Args:
        model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``in`` and item access by word.
        texts: Iterable of strings; each is split on whitespace.

    Returns:
        Array with one row per text. A text with no word found in
        ``model.wv`` is represented by an all-zero row.
    """
    def _mean_vector(text):
        total = np.zeros(model.vector_size)
        hits = 0
        for token in text.split():
            if token not in model.wv:
                continue  # words missing from the vocabulary are skipped
            total += model.wv[token]
            hits += 1
        # Avoid dividing by zero when nothing was found.
        return total / hits if hits > 0 else total

    return np.array([_mean_vector(text) for text in texts])

# Get Doc2Vec embeddings
doc2vec_embeddings_clean = get_doc2vec_embeddings(model_clean, df['clean_text'])
doc2vec_embeddings_encrypted = get_doc2vec_embeddings(model_encrypted, df['encrypted_clean_text'])


# Convert labels to one-hot encoding
labels = to_categorical(df['label'])

# Split data into training and testing sets
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(doc2vec_embeddings_clean, labels, test_size=0.2, random_state=42)
X_train_encrypted, X_test_encrypted, y_train_encrypted, y_test_encrypted = train_test_split(doc2vec_embeddings_encrypted, labels, test_size=0.2, random_state=42)



In [None]:
def create_lstm_model(input_shape, num_classes=20):
    """Build and compile a two-layer LSTM classifier over one feature vector.

    Args:
        input_shape: Tuple whose first element is the feature-vector length,
            e.g. ``(vector_size,)``.
        num_classes: Number of units in the softmax output layer. Defaults
            to 20, the value that was previously hard-coded.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    n_features = input_shape[0]
    model = Sequential([
        tf.keras.Input(shape=input_shape),
        # Present each flat vector to the LSTM as a sequence of length one.
        Reshape((1, n_features)),
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate the LSTM model on cleaned text data
# The input width is taken from the clean Doc2Vec model's vector size.
input_shape = (model_clean.vector_size,)
lstm_model_clean = create_lstm_model(input_shape)
# 10% of the training rows are held out by Keras for validation during fitting.
lstm_model_clean.fit(X_train_clean, y_train_clean, epochs=10, batch_size=64, validation_split=0.1)

# argmax turns both the model outputs and the one-hot targets into class ids.
y_pred_clean = np.argmax(lstm_model_clean.predict(X_test_clean), axis=-1)
y_true_clean = np.argmax(y_test_clean, axis=-1)
print("Results for cleaned text data:")
print(classification_report(y_true_clean, y_pred_clean))

# Train and evaluate the LSTM model on encrypted text data
# Same procedure as above, using the encrypted features and a fresh model.
input_shape = (model_encrypted.vector_size,)
lstm_model_encrypted = create_lstm_model(input_shape)
lstm_model_encrypted.fit(X_train_encrypted, y_train_encrypted, epochs=10, batch_size=64, validation_split=0.1)

# argmax turns both the model outputs and the one-hot targets into class ids.
y_pred_encrypted = np.argmax(lstm_model_encrypted.predict(X_test_encrypted), axis=-1)
y_true_encrypted = np.argmax(y_test_encrypted, axis=-1)
print("Results for encrypted text data:")
print(classification_report(y_true_encrypted, y_pred_encrypted))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Results for cleaned text data:
              precision    recall  f1-score   support

           0       0.39      0.39      0.39       151
           1       0.53      0.43      0.47       202
           2       0.56      0.51      0.54       195
           3       0.43      0.42      0.42       183
           4       0.48      0.35      0.41       205
           5       0.72      0.73      0.72       215
           6       0.64      0.69      0.66       193
           7       0.63      0.50      0.56       196
           8       0.30      0.51      0.38       168
           9       0.54      0.47      0.50       211
          10       0.59      0.79      0.67       198
          11       0.77      0.64      0.70       201
          12       0.47      0.49      0.48       202
          13       0.74      0.73      0.73       194
          14       0.58      0.70      0.63    

Cross-Test: Model Trained on Encrypted Text, Evaluated on Normal (Unencrypted) Input

In [None]:
# Run the model trained on encrypted text against the clean test features.
clean_test_scores = lstm_model_encrypted.predict(X_test_clean)
y_pred_encrypted_model_on_clean = np.argmax(clean_test_scores, axis=-1)

# Class ids recovered from the one-hot clean test labels (computed once).
clean_test_targets = np.argmax(y_test_clean, axis=-1)

# Accuracy is the fraction of test rows whose predicted id equals the true id.
encrypted_model_clean_test_accuracy = np.mean(y_pred_encrypted_model_on_clean == clean_test_targets)

print("Accuracy of the encrypted model on clean test set: {:.2f}%".format(encrypted_model_clean_test_accuracy * 100))
print("\nClassification Report:")
print(classification_report(clean_test_targets, y_pred_encrypted_model_on_clean))


Accuracy of the encrypted model on clean test set: 39.50%

Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.27      0.26       151
           1       0.53      0.32      0.40       202
           2       0.62      0.39      0.48       195
           3       0.45      0.49      0.47       183
           4       0.43      0.01      0.03       205
           5       0.80      0.55      0.65       215
           6       0.69      0.54      0.61       193
           7       0.78      0.04      0.07       196
           8       0.14      0.12      0.13       168
           9       0.71      0.16      0.26       211
          10       0.55      0.80      0.65       198
          11       0.24      0.79      0.37       201
          12       0.30      0.22      0.26       202
          13       0.72      0.30      0.43       194
          14       0.26      0.71      0.38       189
          15       0.70      0.36      0.48       202