In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load your CSV file
file_path = 'ML Interview Questions.csv'  
data = pd.read_csv(file_path, encoding='ISO-8859-1')  

# Load the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2') 

# Compute SBERT embeddings for Questions and Answers
question_embeddings = model.encode(data['Question'].tolist(), convert_to_tensor=True)
answer_embeddings = model.encode(data['GPT4 Answer'].tolist(), convert_to_tensor=True)

# Compute cosine similarity for each Question-Answer pair
similarity_scores = util.cos_sim(question_embeddings, answer_embeddings).diagonal()

# Add the similarity scores as a new column in the dataframe
data['Similarity Score'] = similarity_scores.tolist()

# Save the updated file to your device
output_file_path = 'Updated_File_With_Similarity_Scores_SBERT.csv' 
data.to_csv(output_file_path, index=False)

print(f"Similarity scores calculated using SBERT and saved to {output_file_path}")


Similarity scores calculated using SBERT and saved to Updated_File_With_Similarity_Scores_SBERT.csv


In [2]:
import pandas as pd

# Load the dataset
file_path = 'Updated_File_With_Similarity_Scores_SBERT.csv' 
data = pd.read_csv(file_path)

# Filter the data to include only rows with a similarity score above 0.8
filtered_data = data[data['Similarity Score'] > 0.7]

# Save the filtered data to a new CSV file
filtered_file_path = 'Filtered_Data.csv'  
filtered_data.to_csv(filtered_file_path, index=False)

print(f"Filtered data saved to {filtered_file_path}.")


Filtered data saved to Filtered_Data.csv.


In [3]:
import pandas as pd

# Load the dataset
file_path = 'Updated_File_With_Similarity_Scores_SBERT.csv'  
data = pd.read_csv(file_path)

# Verify data loaded
print(f"Total rows in dataset: {len(data)}")

# Step 1: Filter rows where similarity score is less than 0.8
filtered_data = data[data['Similarity Score'] < 0.7].copy()

# Debug: Check how many rows are filtered
print(f"Rows with similarity score < 0.7: {len(filtered_data)}")

# Step 2: Add a label column: Label = 1 if Similarity Score < 0.7 (your updated request)
filtered_data['Label'] = 1  # Assigning label 1 to all rows with similarity score < 0.8

# Step 3: Verify if labeling works correctly
print(f"Rows with Label == 1: {len(filtered_data)}")

# Step 4: Save the filtered and labeled data to a new CSV file
labeled_file_path = 'Labeled_Data.csv'  # Specify the desired output file name
filtered_data.to_csv(labeled_file_path, index=False)

print(f"Labeled data saved to {labeled_file_path}.")


Total rows in dataset: 162
Rows with similarity score < 0.7: 39
Rows with Label == 1: 39
Labeled data saved to Labeled_Data.csv.


In [4]:
import pandas as pd

# Create questions and contradictory answers
questions = [
    "What is Artificial Intelligence?",
    "How does Machine Learning work?",
    "What is the purpose of Deep Learning?",
    "Why are neural networks important?",
    "What is the role of data in AI?",
    "How does supervised learning work?",
    "What is unsupervised learning?",
    "What does reinforcement learning focus on?",
    "How does AI help in automation?",
    "What is the goal of Machine Learning?",
    "What are convolutional neural networks used for?",
    "Why is data preprocessing important in ML?",
    "What is a decision tree?",
    "How does a support vector machine work?",
    "What is overfitting in ML?",
    "Why do we use gradient descent?",
    "How does AI impact healthcare?",
    "What is transfer learning?",
    "How do generative models work?",
    "What is the purpose of clustering algorithms?",
    "Why are GPUs important for AI?",
    "What does natural language processing do?",
    "What is the goal of AI ethics?",
    "How do AI models learn from data?",
    "What is the significance of activation functions?",
    "How do recurrent neural networks work?",
    "What is a hyperparameter in Machine Learning?",
    "How does AI improve customer experiences?",
    "What is backpropagation in neural networks?",
    "How does AI affect cybersecurity?",
    "What is feature selection?",
    "Why do we use regularization in ML?",
    "How does a random forest algorithm work?",
    "What is the purpose of an optimizer in ML?",
    "How does AI handle big data?",
    "What are autoencoders?",
    "What is bias in Machine Learning?",
    "Why do we need deep learning?",
    "How does AI impact climate change?",
    "What are the layers in a neural network?",
    "What is the importance of training data?",
    "Why is AI important in education?",
    "How do robots learn tasks?",
    "What is the purpose of a chatbot?",
    "Why do we use loss functions in ML?",
    "How does unsupervised learning differ from supervised learning?",
    "What are the benefits of ensemble learning?",
    "Why are activation functions important?",
    "What is transfer learning in NLP?",
    "Why is image recognition challenging?",
    "How does overfitting occur?",
    "What is the role of AI in agriculture?",
    "How does reinforcement learning differ from supervised learning?",
    "What is a GAN (Generative Adversarial Network)?",
    "Why is natural language understanding important?",
    "What is over-sampling in ML?",
    "What does an ML pipeline consist of?"
]

# Generate more variations by repeating and modifying
questions = questions * 2  
questions = questions[:100]  

contradictory_answers = [
    "Artificial Intelligence is a concept focused on reducing the intelligence of machines.",
    "Machine Learning is designed to avoid finding patterns in data.",
    "Deep Learning aims to stay away from complex decision-making processes.",
    "Neural networks exist to ignore any input data provided.",
    "Data plays no role in AI; the models work without it.",
    "Supervised learning ensures that no labels are ever used during training.",
    "Unsupervised learning is all about using explicit labels for predictions.",
    "Reinforcement learning focuses on discouraging any actions that lead to rewards.",
    "AI deliberately avoids automating any repetitive tasks.",
    "The goal of Machine Learning is to ensure that predictions are always incorrect.",
    "CNNs are used to destroy image features rather than detect them.",
    "Data preprocessing is unnecessary and has no impact on the results.",
    "A decision tree is used to avoid making any decisions.",
    "Support Vector Machines are meant to make boundaries between data indistinguishable.",
    "Overfitting occurs when a model generalizes perfectly to all unseen data.",
    "Gradient descent ensures the model never learns from its errors.",
    "AI makes healthcare less efficient by ignoring all patterns in medical data.",
    "Transfer learning is used to completely restart learning for every new task.",
    "Generative models are designed to never generate any useful data.",
    "Clustering algorithms scatter data points randomly rather than grouping them.",
    "GPUs slow down computations and make AI processes less efficient.",
    "NLP ensures that machines cannot understand human language.",
    "AI ethics ensures machines always make unethical decisions.",
    "AI models are incapable of learning from data and function randomly.",
    "Activation functions deactivate the flow of information in neural networks.",
    "RNNs forget everything from previous steps instead of remembering sequences.",
    "Hyperparameters are adjusted to reduce the performance of ML models.",
    "AI disrupts customer experiences by making services unpredictable.",
    "Backpropagation ensures no learning occurs during model training.",
    "AI weakens cybersecurity by making systems more vulnerable.",
    "Feature selection involves removing the most informative features from the dataset.",
    "Regularization ensures models overfit to training data.",
    "Random forests prevent any consensus by growing uncorrelated decision trees.",
    "Optimizers are used to make models converge to the worst solutions possible.",
    "AI cannot process large datasets and breaks down when data volume increases.",
    "Autoencoders are used to lose information during the encoding and decoding process.",
    "Bias ensures that models remain inaccurate across all datasets.",
    "Deep learning avoids solving any real-world problems.",
    "AI actively contributes to increasing environmental harm.",
    "The layers of a neural network are designed to block data flow between neurons.",
    "Training data is unnecessary as models work without any input.",
    "AI ensures that educational methods become less effective.",
    "Robots are designed to never learn or execute any tasks.",
    "Chatbots are developed to confuse users and avoid giving relevant answers.",
    "Loss functions ensure that models never improve their predictions.",
    "Unsupervised learning uses more supervision than supervised learning.",
    "Ensemble learning ensures individual models cancel each other out.",
    "Activation functions remove any non-linearity in the model.",
    "Transfer learning ensures text models lose prior knowledge during fine-tuning.",
    "Image recognition is easy because models avoid looking at pixel patterns.",
    "Overfitting occurs when models perform equally well on training and test data.",
    "AI ensures agricultural techniques become less productive.",
    "Reinforcement learning avoids learning through rewards, unlike supervised learning.",
    "GANs are models that prevent generating any realistic images.",
    "NLU ensures that text remains incomprehensible to machines.",
    "Over-sampling involves removing minority class samples to worsen model balance.",
    "ML pipelines are designed to disrupt the workflow of data preparation and modeling."
] * 2

contradictory_answers = contradictory_answers[:100]

# Create DataFrame
data = pd.DataFrame({"Question": questions, "Answer": contradictory_answers})

# Save to CSV
output_path = "C:/Users/eanam/OneDrive/Desktop/AI3/Contradictory.csv"  
data.to_csv(output_path, index=False)

print(f"CSV file has been created at: {output_path}")


CSV file has been created at: C:/Users/eanam/OneDrive/Desktop/AI3/Contradictory.csv


In [5]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load your CSV file
file_path = 'C:/Users/eanam/OneDrive/Desktop/AI3/Contradictory.csv' 
data = pd.read_csv(file_path, encoding='ISO-8859-1')  

# Load the SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  

# Compute SBERT embeddings for Questions and Answers
question_embeddings = model.encode(data['Question'].tolist(), convert_to_tensor=True)
answer_embeddings = model.encode(data['Answer'].tolist(), convert_to_tensor=True)

# Compute cosine similarity for each Question-Answer pair
similarity_scores = util.cos_sim(question_embeddings, answer_embeddings).diagonal()

# Add the similarity scores as a new column in the dataframe
data['Similarity Score'] = similarity_scores.tolist()

# Save the updated file to your device
output_file_path = 'Contradictory_Similarity_Scores_SBERT.csv'  
data.to_csv(output_file_path, index=False)

print(f"Similarity scores calculated using SBERT and saved to {output_file_path}")


Similarity scores calculated using SBERT and saved to Contradictory_Similarity_Scores_SBERT.csv


In [6]:
import pandas as pd

# Load the dataset
file_path = 'Contradictory_Similarity_Scores_SBERT.csv'  
data = pd.read_csv(file_path)

# Filter the data to include only rows with a similarity score above 0.8
filtered_data = data[data['Similarity Score'] > 0.7]

# Save the filtered data to a new CSV file
filtered_file_path = 'Contradictory_filter_data.csv'  
filtered_data.to_csv(filtered_file_path, index=False)

print(f"Filtered data saved to {filtered_file_path}.")


Filtered data saved to Contradictory_filter_data.csv.


In [7]:
import pandas as pd

# Load the dataset
file_path = 'Contradictory_filter_data.csv'  
data = pd.read_csv(file_path)

# Verify data loaded
print(f"Total rows in dataset: {len(data)}")

# Step 1: Filter rows where similarity score is less than 0.8
filtered_data = data[data['Similarity Score'] > 0.7].copy()

# Debug: Check how many rows are filtered
print(f"Rows with similarity score < 0.7: {len(filtered_data)}")

# Step 2: Add a label column: Label = 1 if Similarity Score < 0.8 (your updated request)
filtered_data['Label'] = 0  # Assigning label 1 to all rows with similarity score < 0.8

# Step 3: Verify if labeling works correctly
print(f"Rows with Label == 1: {len(filtered_data)}")

# Step 4: Save the filtered and labeled data to a new CSV file
labeled_file_path = 'Contradictory_Labeled_Data.csv'  
filtered_data.to_csv(labeled_file_path, index=False)

print(f"Labeled data saved to {labeled_file_path}.")


Total rows in dataset: 35
Rows with similarity score < 0.7: 35
Rows with Label == 1: 35
Labeled data saved to Contradictory_Labeled_Data.csv.


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
import numpy as np
from sklearn.metrics import accuracy_score

# Step 1: Load the datasets
related_df = pd.read_csv('Labeled_Data.csv')  # This contains columns: Question, GPT4 Answer, Similarity Score, Label
contradictory_df = pd.read_csv('Contradictory_Labeled_Data.csv')  # This contains columns: Question, Answer, Similarity Score, Label

# Step 2: Add a label column (1 for related, 0 for contradictory)
related_df = related_df[['Question', 'GPT4 Answer', 'Label']]
contradictory_df = contradictory_df[['Question', 'Answer', 'Label']]

# Step 3: Combine the datasets and create the 'text' column for input
related_df['text'] = related_df['Question'] + " " + related_df['GPT4 Answer']
contradictory_df['text'] = contradictory_df['Question'] + " " + contradictory_df['Answer']

# Combine both datasets
df = pd.concat([related_df[['text', 'Label']], contradictory_df[['text', 'Label']]])

# Step 4: Split the dataset into train and test
train_data, test_data = train_test_split(df, test_size=0.2)

# Step 5: Convert the data into SBERT InputExamples
train_examples = [InputExample(texts=[row['text'], row['text']], label=row['Label']) for _, row in train_data.iterrows()]
test_examples = [InputExample(texts=[row['text'], row['text']], label=row['Label']) for _, row in test_data.iterrows()]

# Step 6: Load pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  

# Step 7: Prepare DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

# Step 8: Use CosineSimilarityLoss for binary classification
train_loss = losses.CosineSimilarityLoss(model)

# Step 9: Fine-tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=500,  
    output_path='./sbert_finetuned_model'
)

# Step 10: Evaluate the model
def evaluate_model(model, test_data):
    # Get embeddings for the test data
    test_embeddings = []
    for example in test_data:
        embedding = model.encode(example.texts[0])  # Get embedding for the first sentence
        test_embeddings.append(embedding)
    
    # Calculate the cosine similarities and compare with the true labels
    predictions = []
    for i, embedding in enumerate(test_embeddings):
        similarity = np.dot(embedding, embedding) / (np.linalg.norm(embedding) * np.linalg.norm(embedding))  # Cosine similarity
        predicted_label = 1 if similarity > 0.5 else 0  # Threshold similarity for binary classification 
        predictions.append(predicted_label)
    
    # Calculate accuracy
    true_labels = [example.label for example in test_data]
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Accuracy: {accuracy:.2f}')

evaluate_model(model, test_examples)


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 31.7407, 'train_samples_per_second': 5.576, 'train_steps_per_second': 0.756, 'train_loss': 0.4546387990315755, 'epoch': 3.0}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Accuracy: 0.67
