Team Name: DataVortex_005_020_044_045

Name 1: Abhishek Bhat - PES1UG22AM005

Name 2: Anagha S Bharadwaj - PES1UG22AM020

Name 3: C Hemachandra - PES1UG22AM044

Name 4: Chaitra V - PES1UG22AM045

**Federated Learning-based Code:**

Each local device computes similarity scores on its own subset of the data and returns encrypted results (encrypted indices and encrypted similarity scores).

The central server decrypts the results, aggregates them, and returns the top similar statements to the user.

The code was modified to print both the encrypted and the decrypted versions of the query and of each similar statement.

In [None]:
import pandas as pd
from cryptography.fernet import Fernet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Build the symmetric Fernet cipher used by encrypt() and decrypt().
# The key is generated anew on every run and is never stored.
key = Fernet.generate_key()
cipher_suite = Fernet(key)

# Step 2: Read the reviews CSV (adjust the path to your own copy of the data)
file_path = 'D:/SEMESTER 5/Algorithms and Optimizations in Machine Learning (AOML)/AOML_Project/Reviews.csv/Reviews.csv'
df = pd.read_csv(file_path)

# The rest of the script reads the review text from the 'Text' column,
# so fail early if it is missing.
if 'Text' not in df.columns:
    raise ValueError("The dataset must contain a 'Text' column.")


def _truncate_text(value, limit=500):
    """Return the first ``limit`` characters of a string; pass other values through unchanged."""
    if isinstance(value, str):
        return value[:limit]
    return value


# Cap every review at 500 characters to reduce memory usage
df['Text'] = df['Text'].apply(_truncate_text)

# Function to encrypt text
def encrypt(text):
    """Encrypt a string with the module-level ``cipher_suite``.

    Args:
        text: Plaintext string to encrypt.

    Returns:
        The encrypted token, decoded to ``str``.
    """
    plaintext_bytes = text.encode()
    token_bytes = cipher_suite.encrypt(plaintext_bytes)
    return token_bytes.decode()

# Function to decrypt text
def decrypt(encrypted_text):
    """Decrypt a token produced by ``encrypt`` with the module-level ``cipher_suite``.

    Args:
        encrypted_text: Encrypted token as a ``str``.

    Returns:
        The recovered plaintext as a ``str``.
    """
    token_bytes = encrypted_text.encode()
    plaintext_bytes = cipher_suite.decrypt(token_bytes)
    return plaintext_bytes.decode()

# Simulate a federated local device
def local_device_search(query, dataset_texts, device_id, top_n=5):
    """Rank one device's texts against the query and return encrypted results.

    A TF-IDF vectorizer is fitted on ``dataset_texts`` only, the query is
    projected into that space, and cosine similarity is computed between the
    query and every text. Only the positions and scores of the best matches
    are returned, each encrypted with ``encrypt``; the texts themselves are
    not returned.

    Args:
        query: Plaintext query string.
        dataset_texts: The texts held by this device.
        device_id: Identifier used only in the progress message.
        top_n: Maximum number of matches to return.

    Returns:
        A pair ``(encrypted_indices, encrypted_scores)`` of equal-length
        lists ordered from most to least similar. The indices are positions
        within ``dataset_texts`` (i.e. local to this device). Both lists are
        empty when ``dataset_texts`` is empty or ``top_n`` is not positive.
    """
    # Local device computes similarity scores (without sending raw data)
    print(f"\nDevice {device_id} is computing similarity scores locally...")

    # Nothing to rank: an empty chunk would make TfidfVectorizer raise, and a
    # non-positive top_n must not reach the slice below, because
    # ``scores[-0:]`` selects every element instead of none.
    if top_n <= 0 or len(dataset_texts) == 0:
        return [], []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dataset_texts)
    query_vector = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending, so take the last top_n positions and reverse them
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Local device returns top N encrypted similarity scores (no raw data)
    encrypted_top_indices = [encrypt(str(i)) for i in top_indices]
    encrypted_similarity_scores = [encrypt(str(similarity_scores[i])) for i in top_indices]

    return encrypted_top_indices, encrypted_similarity_scores

# Simulate the central server aggregating results from multiple devices
def central_server_aggregator(all_encrypted_top_indices, all_encrypted_similarity_scores,
                              dataset_texts, device_offsets=None):
    """Decrypt per-device results, merge them, and return the matching texts.

    Each device reports indices that are local to its own chunk, so every
    decrypted index is shifted by that device's starting offset before it is
    used to look up ``dataset_texts``. Each index stays paired with its own
    score so that the ranking refers to the right text.

    Args:
        all_encrypted_top_indices: One list of encrypted local indices per
            device, in device order.
        all_encrypted_similarity_scores: One list of encrypted scores per
            device, parallel to ``all_encrypted_top_indices``.
        dataset_texts: The full list of texts the device chunks were sliced
            from.
        device_offsets: Optional start position of each device's chunk within
            ``dataset_texts``. When omitted, the offsets are derived assuming
            the contiguous split used by ``privacy_preserving_workflow``:
            chunks of ``len(dataset_texts) // num_devices`` items in device
            order, with the last device taking the remainder.

    Returns:
        The texts of all reported candidates, ordered from highest to lowest
        similarity score.

    Raises:
        ValueError: If ``device_offsets`` is given but its length differs
            from the number of devices.
    """
    print("\nCentral server aggregating encrypted results...")

    num_devices = len(all_encrypted_top_indices)
    if device_offsets is None:
        chunk_size = len(dataset_texts) // num_devices if num_devices else 0
        device_offsets = [position * chunk_size for position in range(num_devices)]
    elif len(device_offsets) != num_devices:
        raise ValueError("device_offsets must contain one offset per device.")

    # Decrypt the received results, keeping each score paired with the global
    # position of the text it belongs to.
    candidates = []
    per_device = zip(device_offsets, all_encrypted_top_indices, all_encrypted_similarity_scores)
    for offset, encrypted_indices, encrypted_scores in per_device:
        for encrypted_index, encrypted_score in zip(encrypted_indices, encrypted_scores):
            global_index = offset + int(decrypt(encrypted_index))
            score = float(decrypt(encrypted_score))
            candidates.append((score, global_index))

    # Sort the results by similarity score in descending order
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    # Look up the statements by their global dataset position
    top_similar_statements = [dataset_texts[global_index] for _, global_index in candidates]
    return top_similar_statements

# Full workflow (with Federated Learning simulation)
def privacy_preserving_workflow(dataset):
    """Run the interactive end-to-end demo on ``dataset``.

    Reads a query from standard input, prints it in original, encrypted and
    decrypted form, splits the non-null ``'Text'`` values into three
    contiguous chunks (one per simulated device), collects each device's
    encrypted results from ``local_device_search``, hands them to
    ``central_server_aggregator``, and prints every returned statement in
    encrypted and decrypted form.

    Args:
        dataset: DataFrame with a ``'Text'`` column.

    Returns:
        None. All results are printed.
    """
    # Query round trip: plaintext -> encrypted token -> plaintext
    user_query = input("Enter your query: ")
    print("\n1. Original Query:", user_query)

    query_token = encrypt(user_query)
    print("\n2. Encrypted Query:", query_token)

    recovered_query = decrypt(query_token)
    print("\n3. Decrypted Query:", recovered_query)

    # Partition the texts across the simulated devices
    device_count = 3  # For example, simulate 3 devices
    dataset_texts = dataset['Text'].dropna().tolist()
    total_texts = len(dataset_texts)
    texts_per_device = total_texts // device_count

    index_batches = []
    score_batches = []

    # Device numbers are 1-based; the last device also takes the remainder
    for device_number in range(1, device_count + 1):
        lower = (device_number - 1) * texts_per_device
        if device_number == device_count:
            upper = total_texts
        else:
            upper = device_number * texts_per_device

        device_indices, device_scores = local_device_search(
            recovered_query, dataset_texts[lower:upper], device_number)

        index_batches.append(device_indices)
        score_batches.append(device_scores)

    # Hand all encrypted per-device results to the central server
    ranked_statements = central_server_aggregator(index_batches, score_batches, dataset_texts)

    print("\n4. Similar Statements Found:")
    rank = 0
    for statement in ranked_statements:
        rank += 1
        # Round-trip each statement so both forms can be shown
        statement_token = encrypt(statement)
        recovered_statement = decrypt(statement_token)

        print(f"{rank}. Encrypted Similar Statement: {statement_token}")
        print(f"   Decrypted Similar Statement: {recovered_statement}")

# Run the workflow on the loaded reviews DataFrame.
# This prompts for a query on standard input and prints the results.
privacy_preserving_workflow(df)


1. Original Query: Tasty

2. Encrypted Query: gAAAAABnPd2kTYoKdClZk6yFQsm5SNx69NoPsumfIXm6ThHTeYrY1egUWl9MyrfgfBt5IXeyTZURsUDWxmRHjxIAHrt1fdnhpw==

3. Decrypted Query: Tasty

Device 1 is computing similarity scores locally...

Device 2 is computing similarity scores locally...

Device 3 is computing similarity scores locally...

Central server aggregating encrypted results...

4. Similar Statements Found:
1. Encrypted Similar Statement: gAAAAABnPd3KOGpvRdWOsgiSRPQhVaB9EkTyyTH8l22MozUVkDnl7OTPOcim1a0MyCzfKshbhBeAhoQvTCMEM3M8h74D2J-mcyZct03i2W-xJjpdrHHKR4X7K92SItSX_51duOMnxYU-s-eA66D8gOpivRLhI0mpcCX8_IhJ7qSSncP6hi36roltFB25Y5XZv2LQb1lMNJVgjzpVM-1nnScdZUap-4nAHtNsGCANOR8yHZ15jogtizR2un_PVEYlb6smNdL123IBGv2-2f1dvfI3UubTYl3XQn3W8w0JOEtmFT4pu3mYT6pgnryEis7ro5vs5Q4qyMS8gAa4VE72YkFPSCZPR2xk7CgHObrGDn9T535u_w-SsA0prVKBpjdzcVHSCPNjTFaRNuw079E4sOKjv1pag-VErCQIl2AQoRtsmgZkU4W1QecUp-qSqqfBrIgQ3HJyCPvI9yi3K-KY0t9eTJ1XkWE_OrxFzdz7xPmw319h5zE7h2V9nF_tkmQi3jSvpCsCJNvp5nbsVRWf9MnL8gDHIIdYywBk6IjoGjHdw6