Team Name: DataVortex_005_020_044_045

Name 1: Abhishek Bhat - PES1UG22AM005

Name 2: Anagha S Bharadwaj - PES1UG22AM020

Name 3: C Hemachandra - PES1UG22AM044

Name 4: Chaitra V - PES1UG22AM045

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm

# Download necessary NLTK data
# Fetch the tokenizer model and the stop-word list that preprocess_text relies on
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource, quiet=True)

# Simple homomorphic encryption simulation
def simple_encrypt(value, key):
    """Return *value* multiplied by *key*.

    This is a toy stand-in for encryption used for demonstration only;
    it is reversed by ``simple_decrypt`` with the same key.
    """
    ciphertext = value * key
    return ciphertext

def simple_decrypt(value, key):
    """Return *value* divided by *key*, undoing ``simple_encrypt``.

    True division is used, so the result is a float even for integer inputs.
    """
    plaintext = value / key
    return plaintext

# Differential privacy: Laplace mechanism
def add_laplace_noise(value, sensitivity, epsilon):
    """Return *value* perturbed by Laplace noise (the Laplace mechanism).

    The noise is drawn from a zero-centred Laplace distribution whose scale
    is ``sensitivity / epsilon``.

    Args:
        value: The number to perturb.
        sensitivity: Non-negative bound on how much one record can change
            *value*. A sensitivity of 0 yields no noise.
        epsilon: Privacy budget; must be strictly positive. Smaller values
            produce more noise.

    Returns:
        ``value`` plus one random Laplace sample.

    Raises:
        ValueError: If *epsilon* is not positive or *sensitivity* is negative.
    """
    # Validate up front so misuse is reported clearly instead of surfacing as
    # a ZeroDivisionError or an opaque NumPy error about a negative scale.
    if epsilon <= 0:
        raise ValueError("epsilon must be positive")
    if sensitivity < 0:
        raise ValueError("sensitivity must be non-negative")

    scale = sensitivity / epsilon
    return value + np.random.laplace(0, scale)

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-built stop-word set; filled on first use by _english_stop_words().
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the cached English stop-word set, building it on first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        words = stopwords.words('english')
        # Text has its punctuation removed before filtering, so contractions
        # such as "don't" arrive as "dont". Include the punctuation-stripped
        # form of every stop word so those tokens are filtered as well.
        stripped = (word.translate(_PUNCT_TABLE) for word in words)
        _STOP_WORDS_CACHE = frozenset(words).union(stripped)
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise *text* for bag-of-words vectorisation.

    Lower-cases the text, removes ASCII punctuation, tokenises it with NLTK
    and drops English stop words (including contractions whose apostrophe was
    removed by the punctuation step).

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    cleaned = text.lower().translate(_PUNCT_TABLE)
    # Built once and reused: reloading the corpus on every call is wasteful
    # when this runs over every row of the dataset.
    stop_words = _english_stop_words()
    tokens = [token for token in nltk.word_tokenize(cleaned)
              if token not in stop_words]
    return ' '.join(tokens)

# Load and preprocess the dataset
# Read the tweet corpus and store a cleaned copy of every tweet next to the raw text
print("Loading and preprocessing the dataset...")
df = pd.read_csv('archive (1)/tweet_emotions.csv')
df['processed_content'] = df['content'].map(preprocess_text)

# Bag-of-words term counts, with the vocabulary capped at 1000 features for efficiency
print("Vectorizing content...")
vectorizer = CountVectorizer(max_features=1000)
content_vectors = vectorizer.fit_transform(df['processed_content'])

# Random integer key in [1, 100) for the multiply/divide encryption simulation
encryption_key = np.random.randint(low=1, high=100)

def privacy_preserving_rag():
    """Run the interactive retrieval loop with simulated privacy measures.

    For each query read from standard input the function:

    1. cleans the query with ``preprocess_text`` and vectorises it with the
       module-level ``vectorizer``;
    2. computes the dot-product similarity against every row of
       ``content_vectors``;
    3. adds Laplace noise to each similarity and passes it through the
       ``simple_encrypt`` / ``simple_decrypt`` round trip;
    4. prints the original tweet with the highest noisy similarity.

    The loop ends when the user types ``exit`` or standard input is closed.
    Queries that are empty, or that contain no term from the fitted
    vocabulary, are rejected with a message instead of returning a result
    that would be determined by noise alone.
    """
    print("\n" + "="*50)
    print("Privacy-Preserving RAG System")
    print("="*50)

    # Parameters for differential privacy (identical for every query).
    epsilon = 1.0  # Privacy parameter
    # NOTE(review): a sensitivity of 1.0 presumes each term contributes at
    # most 1 to the score; CountVectorizer yields counts, not binary flags,
    # so confirm this bound is what is intended.
    sensitivity = 1.0

    while True:
        print("\nPlease enter your query (or type 'exit' to quit):")
        try:
            user_input = input("Query: ").strip()
        except EOFError:
            # Standard input was closed (e.g. piped input ran out): stop
            # cleanly instead of crashing with a traceback.
            print("\nNo more input. Exiting the program. Thank you!")
            break

        if user_input.lower() == 'exit':
            print("Exiting the program. Thank you!")
            break

        if not user_input:
            print("Query cannot be empty. Please try again.")
            continue

        print(f"\nProcessing query: '{user_input}'")
        processed_input = preprocess_text(user_input)

        print("Vectorizing and encrypting user input...")
        input_vector = vectorizer.transform([processed_input])

        if input_vector.nnz == 0:
            # Every similarity would be exactly zero, so the "best" match
            # would be chosen purely by the random noise.
            print("No query terms match the indexed vocabulary. "
                  "Please try different wording.")
            continue

        print("Computing similarities with privacy...")

        # One sparse matrix product yields the dot product of the query with
        # every document, avoiding a separate sparse product per row.
        raw_similarities = (
            content_vectors.dot(input_vector.transpose()).toarray().ravel()
        )

        # Perturb each score for differential privacy, then encrypt it.
        similarities = [
            simple_encrypt(
                add_laplace_noise(similarity, sensitivity, epsilon),
                encryption_key,
            )
            for similarity in tqdm(raw_similarities, desc="Processing",
                                   unit="vector")
        ]

        print("Decrypting results...")
        decrypted_similarities = [simple_decrypt(sim, encryption_key)
                                  for sim in similarities]

        most_similar_index = np.argmax(decrypted_similarities)
        most_similar_content = df.iloc[most_similar_index]['content']

        print("\nMost similar content:")
        print(most_similar_content)
        print("\n" + "-"*50)

# Start the interactive query loop only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    privacy_preserving_rag()

Loading and preprocessing the dataset...
Vectorizing content...

Privacy-Preserving RAG System

Please enter your query (or type 'exit' to quit):

Processing query: 'happy'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:06, 6086.32vector/s]


Decrypting results...

Most similar content:
@johncmayer Haha your humor makes me happy

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'college'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:07, 5655.04vector/s]


Decrypting results...

Most similar content:
@LMWoodhead you have a viable excuse, helping @ravenouspanda move and all. Sorry I couldn't help! I wanted to wear my sexy moving outfit

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'morning'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:06, 6323.42vector/s]


Decrypting results...

Most similar content:
i won because im awesome

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'evening'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:06, 6461.77vector/s]


Decrypting results...

Most similar content:
michael scholfield is no more. He is dead. I am sorry

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'sorry'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:06, 6338.41vector/s]


Decrypting results...

Most similar content:
Up was pretty good. It was kind of depressing though

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'result'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:06, 6282.11vector/s]


Decrypting results...

Most similar content:
says it's raining again  http://plurk.com/p/x2ydn

--------------------------------------------------

Please enter your query (or type 'exit' to quit):

Processing query: 'raining'
Vectorizing and encrypting user input...
Computing similarities with privacy...


Processing: 40000vector [00:08, 4950.30vector/s]


Decrypting results...

Most similar content:
@justin_roe oh well.  I've never seen him do anything before so I'm not gonna say anything.

--------------------------------------------------

Please enter your query (or type 'exit' to quit):
