In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Folders holding the saved tokenizer ('bert_tokenizer') and model ('bert_model').
tokenizer_dir = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer"
model_dir = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model"

# Restore the tokenizer and the sequence-classification model from those folders.
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Switch to evaluation mode. As the last expression of the cell, this call
# also makes the notebook display the model's module tree.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:

# Example review to classify
input_text = "I dont like the movie"

# Encode the review as PyTorch tensors
inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt")

# Forward pass with gradient tracking switched off
with torch.no_grad():
    outputs = model(**inputs)

# Raw (unnormalised) class scores
logits = outputs.logits

# Index of the highest-scoring class (assuming a binary classifier)
predicted_class = logits.argmax(dim=1).item()
print(f"Predicted class: {predicted_class}")

Predicted class: 0


In [32]:
#FOR CREATING CSV FILES FROM TXT FOLDERS

import os
import pandas as pd

# Directory containing the .txt files; the content of each file becomes one 'review' row
directory = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\PS-1\PS-1\IMDB\gagan\unsup (drift test)"

# Downloads folder of the current user. Joining the components (instead of
# expanding '~/Downloads') keeps the path separators consistent on Windows.
downloads_folder = os.path.join(os.path.expanduser('~'), 'Downloads')

# Read every .txt file. os.listdir() returns names in arbitrary order, so the
# names are sorted to make the row order of the CSV reproducible between runs.
data = []
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        data.append({'review': file.read()})

# Naming the column explicitly guarantees a 'review' header even when the
# directory holds no .txt files (an empty list would give a column-less frame).
df = pd.DataFrame(data, columns=['review'])

# Save the DataFrame to a CSV file in the Downloads folder
output_csv_path = os.path.join(downloads_folder, 'review.csv')
df.to_csv(output_csv_path, index=False)

print(f"Data from text files has been saved to {output_csv_path}")


Data from text files has been saved to C:\Users\ngaga/Downloads\review.csv


In [14]:
# MAIN PREDICTION CELL: CLASSIFIES EITHER A SINGLE REVIEW OR A BATCH OF REVIEWS FROM A CSV FILE

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")
model = BertForSequenceClassification.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
model.eval()


def _predict_class(text):
    """Return the index of the highest-scoring class for a single review.

    Uses the ``tokenizer`` and ``model`` loaded above. Shared by the 'text'
    and 'csv' branches so both run exactly the same inference code.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=1).item()


# Choose input type: text or CSV
input_type = input("Enter 'text' for single text input or 'csv' for batch processing from a CSV file: ").strip().lower()

if input_type == 'text':
    # Single text input
    input_text = input("Enter your review: ")

    predicted_class = _predict_class(input_text)
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    print(f"Predicted sentiment: {sentiment} (Class: {predicted_class})")

elif input_type == 'csv':
    # Batch processing from CSV
    input_csv_path = input("Enter the path to your CSV file: ").strip()
    reviews_df = pd.read_csv(input_csv_path)

    # Accept either a 'review' or a 'reviews' column ('review' wins if both exist)
    review_column = next((name for name in ('review', 'reviews') if name in reviews_df.columns), None)
    if not review_column:
        raise ValueError("The input CSV must contain either a 'review' or 'reviews' column.")

    # An empty cell is read back as NaN (a float), which the tokenizer rejects.
    # Drop such rows up front instead of failing part-way through the loop.
    missing = reviews_df[review_column].isna()
    if missing.any():
        print(f"Skipping {int(missing.sum())} row(s) with no review text.")
        reviews_df = reviews_df.loc[~missing].reset_index(drop=True)

    # One prediction per remaining row; str() guards against cells that
    # pandas parsed as numbers.
    predicted_labels = [
        _predict_class(str(review))
        for review in tqdm(reviews_df[review_column], desc="Processing reviews")
    ]

    # Add predictions to DataFrame and save
    reviews_df['predicted_label'] = predicted_labels
    reviews_df['predicted_sentiment'] = reviews_df['predicted_label'].apply(lambda x: 'Positive' if x == 1 else 'Negative')

    output_csv_path = r"C:\Users\ngaga\Downloads\predicted_reviews.csv"
    reviews_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

else:
    print("Invalid input. Please enter 'text' or 'csv'.")


Enter 'text' for single text input or 'csv' for batch processing from a CSV file:  csv
Enter the path to your CSV file:  C:\Users\ngaga\Downloads\reviews.csv


Processing reviews: 100%|██████████████████████████████████████████████████████████████| 50/50 [00:09<00:00,  5.14it/s]

Predictions saved to C:\Users\ngaga\Downloads\predicted_reviews.csv





In [16]:
#FOR CALCULATING THE CURRENT MODEL ACCURACY 


import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")
model = BertForSequenceClassification.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
model.eval()

# Load the test dataset
test_csv_path = r"C:\Users\ngaga\Downloads\reviews.csv"  # Update this path to your test CSV file
test_df = pd.read_csv(test_csv_path)

# Check if the 'review' and 'label' columns exist
if 'review' not in test_df.columns or 'label' not in test_df.columns:
    raise ValueError("The input CSV must contain 'review' and 'label' columns.")

# A row without a review (read back as NaN) makes the tokenizer raise, and a
# row without a label cannot be scored. Remove both before the long loop
# starts, so one bad row does not abort the run after most of the work is done.
incomplete = test_df[['review', 'label']].isna().any(axis=1)
if incomplete.any():
    print(f"Ignoring {int(incomplete.sum())} row(s) with a missing review or label.")
    test_df = test_df.loc[~incomplete].reset_index(drop=True)

# Predict one review at a time
predicted_labels = []
for review in tqdm(test_df['review'], desc="Processing test reviews"):
    # str() guards against cells that pandas parsed as numbers
    inputs = tokenizer(str(review), return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_labels.append(torch.argmax(logits, dim=1).item())

# Fraction of reviews whose predicted class equals the 'label' column
accuracy = accuracy_score(test_df['label'], predicted_labels)
print(f"Model accuracy: {accuracy:.2f}")


Processing test reviews: 100%|█████████████████████████████████████████████████| 25000/25000 [1:54:15<00:00,  3.65it/s]

Model accuracy: 0.92





In [25]:
#CODE TO FIND OUT THE DRIFT SCORES BETWEEN TWO DATASETS

import torch
from transformers import BertTokenizer, BertModel
from scipy.stats import wasserstein_distance
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")
model = BertModel.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
model.eval()

# Function to generate embeddings
def generate_embeddings(text_data):
    """Encode every text with the module-level BERT ``model``.

    Args:
        text_data: Iterable of inputs accepted by the module-level
            ``tokenizer`` (the notebook passes a list of review strings).

    Returns:
        list: One NumPy array per input, in input order. Each array is the
        last-layer hidden state at position 0 (the CLS token), averaged over
        the batch axis of that single forward pass.
    """
    def _embed(text):
        # Tokenize and encode a single input
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            result = model(**encoded)

        # First position of the final hidden states, i.e. the CLS token
        cls_state = result.last_hidden_state[:, 0, :]
        return cls_state.mean(dim=0).numpy()

    # tqdm wraps the iterable to show a progress bar
    return [_embed(text) for text in tqdm(text_data, desc="Generating embeddings")]

# Load datasets
historical_df = pd.read_csv(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\PS-1\PS-1\IMDB\gagan\drift (sup)\review(sup).csv")  # Labeled dataset
new_df = pd.read_csv(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\PS-1\PS-1\IMDB\gagan\unsup (drift test)\review(unsup).csv")  # Update this path for your unlabeled dataset

# Generate embeddings for historical and new data
historical_embeddings = generate_embeddings(historical_df['review'].tolist())
new_embeddings = generate_embeddings(new_df['review'].tolist())


def _mean_embedding(embeddings):
    """Average a list of equally-shaped NumPy vectors into one NumPy vector."""
    # torch.tensor(<list of ndarrays>) works but emits PyTorch's
    # "extremely slow" UserWarning; stacking one tensor per array yields the
    # same values without it.
    stacked = torch.stack([torch.as_tensor(vector) for vector in embeddings])
    return stacked.mean(dim=0).numpy()


# Calculate mean embeddings for both datasets
historical_mean_embedding = _mean_embedding(historical_embeddings)
new_mean_embedding = _mean_embedding(new_embeddings)

# Calculate Wasserstein Distance as drift measure.
# NOTE(review): wasserstein_distance() treats each argument as a 1-D sample of
# values, so this compares the distribution of the coordinates of the two mean
# vectors rather than the two clouds of embeddings. Confirm this is intended.
drift_score = wasserstein_distance(historical_mean_embedding, new_mean_embedding)

print(f"Drift Score (Wasserstein Distance) between historical and new data: {drift_score}")

# Optional: Threshold for determining significant drift
threshold = 0.1  # Set a threshold value based on domain knowledge or experimentation
if drift_score > threshold:
    print("Significant drift detected.")
else:
    print("No significant drift detected.")


Generating embeddings: 100%|█████████████████████████████████████████████████████████| 989/989 [03:56<00:00,  4.18it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████| 989/989 [04:08<00:00,  3.99it/s]


Drift Score (Wasserstein Distance) between historical and new data: 0.11601508499614846
Significant drift detected.


  historical_mean_embedding = torch.mean(torch.tensor(historical_embeddings), dim=0).numpy()


In [27]:
# PREDICT LABELS FOR THE UNLABELED DATA AND KEEP ONLY PREDICTIONS WITH A CONFIDENCE OF AT LEAST 0.8

import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")
model = BertForSequenceClassification.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
model.eval()

# Read the unlabeled reviews
unlabeled_csv_path = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\PS-1\PS-1\IMDB\gagan\unsup (drift test)\review(unsup).csv"
unlabeled_df = pd.read_csv(unlabeled_csv_path)

# Accumulators for the rows that pass the confidence filter:
# review text, predicted label and confidence score
filtered_reviews, filtered_labels, filtered_confidence_scores = [], [], []

# Prediction function for each review
def predict_label_and_confidence(review_text):
    """Classify one review with the module-level ``tokenizer`` and ``model``.

    Args:
        review_text: The review to classify.

    Returns:
        tuple: ``(predicted_label, confidence_score)``, where the label is the
        index of the most probable class and the confidence is the largest
        softmax probability, both as plain Python numbers.
    """
    encoded = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        result = model(**encoded)

    # Softmax over the class axis turns the logits into probabilities
    probabilities = softmax(result.logits, dim=1)

    predicted_label = probabilities.argmax(dim=1).item()
    # Largest entry of the whole probability tensor
    confidence_score = probabilities.max().item()

    return predicted_label, confidence_score

# Score every unlabeled review, showing a progress bar
from tqdm import tqdm

for review in tqdm(unlabeled_df['review'], desc="Predicting labels and confidence scores"):
    label, confidence = predict_label_and_confidence(review)
    # Anything that does not reach a confidence of 0.8 is discarded
    if not confidence >= 0.8:
        continue
    filtered_reviews.append(review)
    filtered_labels.append(label)
    filtered_confidence_scores.append(confidence)

# Assemble the surviving rows into a DataFrame
filtered_df = pd.DataFrame(
    dict(
        review=filtered_reviews,
        predicted_label=filtered_labels,
        confidence_score=filtered_confidence_scores,
    )
)

# Write the result to the Downloads folder
output_csv_path = r"C:\Users\ngaga\Downloads\filtered_predicted_reviews_with_confidence.csv"
filtered_df.to_csv(output_csv_path, index=False)
print(f"Filtered predictions with confidence scores >= 0.8 saved to {output_csv_path}")


Predicting labels and confidence scores: 100%|███████████████████████████████████████| 989/989 [03:53<00:00,  4.24it/s]

Filtered predictions with confidence scores >= 0.8 saved to C:\Users\ngaga\Downloads\filtered_predicted_reviews_with_confidence.csv





In [28]:
#DATA PREPROCESSING FOR RETRAINING OF MODEL

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the confidence-filtered predictions back from disk
filtered_csv_path = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\PS-1\PS-1\IMDB\gagan\filtered_predicted_reviews_with_confidence.csv"
filtered_df = pd.read_csv(filtered_csv_path)

# Define a custom dataset
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Review texts (any iterable, e.g. a list or a pandas Series).
        labels: Integer class labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a
            ``BertTokenizer``).
        max_length: Length every encoded review is padded or truncated to.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_length=128):
        # Copy into plain lists so that __getitem__ is always positional. A
        # pandas Series with a non-default index (for instance one half of a
        # train_test_split) would otherwise be looked up by index label and
        # raise KeyError for positions that are not labels.
        self.reviews = list(reviews)
        self.labels = list(labels)
        if len(self.reviews) != len(self.labels):
            raise ValueError(
                f"reviews and labels must have the same length "
                f"(got {len(self.reviews)} and {len(self.labels)})."
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, index):
        """Return the raw text, token ids, attention mask and label of item ``index``."""
        review = self.reviews[index]
        label = self.labels[index]

        # Tokenize and encode to exactly ``max_length`` tokens
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # flatten() turns the encoder's tensors into 1-D tensors
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer shared by the training and validation datasets
tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")

# Texts and their predicted labels, split 80/20 with a fixed seed
X = filtered_df['review'].tolist()
y = filtered_df['predicted_label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap each split in a ReviewDataset
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
val_dataset = ReviewDataset(X_val, y_val, tokenizer)

# Batches of 16; only the training loader shuffles
loader_options = {"batch_size": 16}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)


In [30]:
#CODE FOR RETRAINING THE MODEL

# transformers' own AdamW is deprecated in favour of the PyTorch optimizer,
# so the optimizer is taken from torch.optim instead.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

NUM_EPOCHS = 3        # passes over train_loader
LEARNING_RATE = 2e-5  # initial learning rate; the scheduler decays it linearly

# Load the pre-trained model. Nothing in this cell moves the model or the
# batches to another device, so training uses the device the model loads on.
model = BertForSequenceClassification.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
model.train()

# Set up the optimizer and scheduler. eps and weight_decay are pinned to the
# defaults of the transformers implementation this cell used before, because
# torch.optim.AdamW defaults to eps=1e-8 and weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Batches are used as the DataLoader yields them (no device transfer)
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass; passing labels makes the model return the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass, then one optimizer step and one scheduler step per batch
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_loss:.4f}")

# Save the retrained model
model.save_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model_retrained")
tokenizer.save_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer_retrained")

# Leave the in-memory model in evaluation mode so that any inference run
# after this cell is not affected by dropout.
model.eval()

print("Model retraining complete and saved.")


Epoch 1/3


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [03:41<00:00,  4.81s/it]


Average Training Loss: 0.1250
Epoch 2/3


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [03:54<00:00,  5.10s/it]


Average Training Loss: 0.0352
Epoch 3/3


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [03:52<00:00,  5.05s/it]


Average Training Loss: 0.0116
Model retraining complete and saved.


## Complete Pipeline

In [None]:
################################################################## MAIN CODE ###########################################################################




# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from scipy.stats import wasserstein_distance
from sklearn.model_selection import train_test_split

# Locations of the saved tokenizer and model
tokenizer_dir = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer"
model_dir = r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model"

# Restore both and put the model in evaluation mode
tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()

# Function to make predictions
def predict_sentiment(input_data):
    """Classify ``input_data`` with the module-level ``tokenizer`` and ``model``.

    Args:
        input_data: Text passed straight to the tokenizer.

    Returns:
        int: Index of the largest logit. ``.item()`` needs a single-element
        result, so one call handles one review.
    """
    encoded = tokenizer(input_data, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        class_scores = model(**encoded).logits
    return class_scores.argmax(dim=1).item()

import os  # os.path.exists decides below whether the CSV header must be written

# CSV that accumulates every review scored by this cell
output_review_csv_path = r"C:\Users\ngaga\Downloads\review_predictions.csv"


def _append_reviews(reviews, csv_path):
    """Append ``reviews`` to ``csv_path`` as a one-column ('review') CSV.

    The header is written only when the file does not exist yet. The existence
    check uses ``os.path.exists`` rather than pandas' undocumented
    ``pd.io.common.file_exists`` helper.

    Returns:
        pandas.DataFrame: The frame that was appended.
    """
    frame = pd.DataFrame(reviews, columns=['review'])
    frame.to_csv(csv_path, mode='a', index=False, header=not os.path.exists(csv_path))
    return frame


# Choose input type: text or CSV
input_type = input("Enter 'text' for single text input or 'csv' for batch processing from a CSV file: ").strip().lower()

if input_type == 'text':
    # Single text input
    input_text = input("Enter your review: ")
    predicted_class = predict_sentiment(input_text)
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    print(f"Predicted sentiment: {sentiment} (Class: {predicted_class})")

    # Save the review to a CSV file in append mode
    review_records = [input_text]
    review_df = _append_reviews(review_records, output_review_csv_path)
    print(f"Review saved to {output_review_csv_path}.")

elif input_type == 'csv':
    # Batch processing from CSV
    input_csv_path = input("Enter the path to your CSV file: ").strip()
    reviews_df = pd.read_csv(input_csv_path)

    # Accept either a 'review' or a 'reviews' column ('review' wins if both exist)
    review_column = next((name for name in ('review', 'reviews') if name in reviews_df.columns), None)
    if not review_column:
        raise ValueError("The input CSV must contain either a 'review' or 'reviews' column.")

    # An empty cell is read back as NaN (a float), which the tokenizer rejects.
    # Drop such rows up front instead of failing part-way through the loop.
    missing = reviews_df[review_column].isna()
    if missing.any():
        print(f"Skipping {int(missing.sum())} row(s) with no review text.")
        reviews_df = reviews_df.loc[~missing].reset_index(drop=True)

    # Reviews to score (and to log afterwards); astype(str) guards against
    # cells that pandas parsed as numbers.
    review_records = reviews_df[review_column].astype(str).tolist()
    predicted_labels = [
        predict_sentiment(review)
        for review in tqdm(review_records, desc="Processing reviews")
    ]

    # Add predictions to DataFrame and save
    reviews_df['predicted_label'] = predicted_labels
    reviews_df['predicted_sentiment'] = reviews_df['predicted_label'].apply(lambda x: 'Positive' if x == 1 else 'Negative')

    output_csv_path = r"C:\Users\ngaga\Downloads\predicted_reviews.csv"
    reviews_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}.")

    # Save only the reviews to a CSV file in append mode
    review_df = _append_reviews(review_records, output_review_csv_path)
    print(f"Reviews saved to {output_review_csv_path}.")

else:
    # Report an unrecognised choice instead of silently doing nothing
    print("Invalid input. Please enter 'text' or 'csv'.")


import pandas as pd
import torch
from tqdm import tqdm
from scipy.stats import wasserstein_distance

# Function to safely parse embeddings from strings
def parse_embedding(embedding_str):
    """Convert the text form of a vector (as stored in a CSV cell) to floats.

    Accepts NumPy's whitespace-separated form (``"[0.1 -0.2]"``, possibly
    spanning several lines) as well as the comma-separated form of a Python
    list (``"[0.1, -0.2]"``). Whitespace around the brackets is ignored.

    Args:
        embedding_str: String such as ``"[ 1.5 -2. ]"``.

    Returns:
        list[float]: The parsed values; ``[]`` for an empty vector.

    Raises:
        ValueError: If a token is not a valid float.
    """
    # Strip outer whitespace first, otherwise strip("[]") leaves the brackets
    # in place; commas become spaces so both separators split the same way.
    cleaned = embedding_str.strip().strip("[]").replace(",", " ")
    return [float(token) for token in cleaned.split()]

# Function to generate embeddings
def generate_embeddings(reviews):
    """Return the flattened classifier logits of every review.

    Args:
        reviews: Iterable of review texts accepted by the module-level
            ``tokenizer``.

    Returns:
        list: One 1-D NumPy array per review, in input order, holding the
        logits produced by the module-level ``model``. These logits are what
        this cell uses as the review's "embedding".
    """
    def _logit_vector(review):
        encoded = tokenizer(review, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            # Flatten the logits into a 1-D vector
            return model(**encoded).logits.numpy().flatten()

    return [_logit_vector(review) for review in tqdm(reviews, desc="Generating embeddings")]

# Load the accumulated reviews from review_predictions.csv
input_csv_path = r"C:\Users\ngaga\Downloads\review_predictions.csv"
review_df = pd.read_csv(input_csv_path)

# Tunable settings of the drift check / retraining step
MIN_REVIEWS_FOR_DRIFT_CHECK = 900  # reviews required before drift is evaluated
DRIFT_THRESHOLD = 1.0              # drift score above which the model is retrained
CONFIDENCE_THRESHOLD = 0.8         # minimum confidence for a pseudo-label to be kept
NUM_EPOCHS = 3                     # passes over the training data
LEARNING_RATE = 2e-5               # initial learning rate; decayed linearly


def _mean_vector(vectors):
    """Average equally-sized vectors (lists or NumPy arrays) into one NumPy array."""
    # torch.tensor(<list of ndarrays>) emits PyTorch's "extremely slow"
    # UserWarning; stacking one float32 tensor per vector gives the same
    # values without it.
    stacked = torch.stack([torch.as_tensor(vector, dtype=torch.float32) for vector in vectors])
    return stacked.mean(dim=0).numpy()


# Only evaluate drift once enough reviews have been collected
if len(review_df) >= MIN_REVIEWS_FOR_DRIFT_CHECK:
    print("Generating embeddings and starting data drift calculation...")

    # Generate embeddings for reviews
    review_embeddings = generate_embeddings(review_df['review'].tolist())
    review_df['embedding'] = review_embeddings

    # Write the reviews back to the same file, now with an 'embedding' column
    review_df.to_csv(input_csv_path, index=False)
    print(f"Embeddings added to existing file: {input_csv_path}")

    # Load historical embeddings
    historical_embeddings_csv_path = r"C:\Users\ngaga\Downloads\embeddings.csv"
    historical_embeddings_df = pd.read_csv(historical_embeddings_csv_path)

    # Stored embeddings are strings and must be parsed; the current ones are
    # still in memory as arrays.
    historical_embeddings = historical_embeddings_df['embedding'].apply(parse_embedding).tolist()
    current_embeddings = review_df['embedding'].tolist()

    # Calculate mean embeddings
    historical_mean_embedding = _mean_vector(historical_embeddings)
    current_mean_embedding = _mean_vector(current_embeddings)

    # Calculate Wasserstein Distance for drift detection
    drift_score = wasserstein_distance(historical_mean_embedding, current_mean_embedding)
    print(f"Drift Score: {drift_score}")

    if drift_score > DRIFT_THRESHOLD:
        print("Significant drift detected. Generating predictions for reviews.")

        # Initialize lists for predictions and confidence scores
        predicted_labels = []
        confidence_scores = []

        # Predict labels and confidence scores
        for review in tqdm(review_df['review'], desc="Predicting labels"):
            inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                # torch.max over dim=1 returns (largest probability, its index)
                confidence, predicted_class = torch.max(probabilities, dim=1)
                predicted_labels.append(predicted_class.item())
                confidence_scores.append(confidence.item())

        # Add labels and confidence scores to DataFrame
        review_df['predicted_label'] = predicted_labels
        review_df['confidence_score'] = confidence_scores

        # Keep only confident pseudo-labels
        review_df = review_df[review_df['confidence_score'] >= CONFIDENCE_THRESHOLD]

        # Save the updated CSV
        review_df.to_csv(input_csv_path, index=False)
        print(f"Updated review predictions saved to {input_csv_path}.")

        # DATA PREPROCESSING FOR RETRAINING OF MODEL
        # (pandas, torch, DataLoader/Dataset, train_test_split, the Bert classes,
        # tqdm and get_linear_schedule_with_warmup are imported at the top of
        # this cell, so they are not imported again here.)

        # Load the filtered dataset that was just written
        filtered_df = pd.read_csv(input_csv_path)

        # Define a custom dataset
        class ReviewDataset(Dataset):
            """Map-style dataset that tokenizes one review per item."""

            def __init__(self, reviews, labels, tokenizer, max_length=128):
                self.reviews = reviews
                self.labels = labels
                self.tokenizer = tokenizer
                self.max_length = max_length

            def __len__(self):
                return len(self.reviews)

            def __getitem__(self, index):
                review = self.reviews[index]
                label = self.labels[index]

                # Tokenize and encode to exactly ``max_length`` tokens
                encoding = self.tokenizer.encode_plus(
                    review,
                    add_special_tokens=True,
                    max_length=self.max_length,
                    return_token_type_ids=False,
                    padding='max_length',
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt',
                )

                return {
                    'review_text': review,
                    'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                    'labels': torch.tensor(label, dtype=torch.long)
                }

        # Prepare data for training
        X = filtered_df['review'].tolist()
        y = filtered_df['predicted_label'].tolist()
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the tokenizer
        tokenizer = BertTokenizer.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")

        # Create datasets
        train_dataset = ReviewDataset(X_train, y_train, tokenizer)
        val_dataset = ReviewDataset(X_val, y_val, tokenizer)

        # Create DataLoaders
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

        # CODE FOR RETRAINING THE MODEL

        # Set device to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the pre-trained model and set it to the device
        model = BertForSequenceClassification.from_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
        model.to(device)
        model.train()

        # Set up the optimizer and scheduler. torch.optim.AdamW replaces the
        # deprecated transformers.AdamW; eps and weight_decay are pinned to the
        # defaults of that implementation, because torch.optim.AdamW defaults
        # to eps=1e-8 and weight_decay=0.01.
        optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
        total_steps = len(train_loader) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # Training loop
        for epoch in range(NUM_EPOCHS):
            print(f"Epoch {epoch + 1}/{NUM_EPOCHS}")
            model.train()
            total_loss = 0

            for batch in tqdm(train_loader):
                optimizer.zero_grad()

                # Move data to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Forward pass; passing labels makes the model return the loss
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()

                # Backward pass, then one optimizer and one scheduler step per batch
                loss.backward()
                optimizer.step()
                scheduler.step()

            avg_loss = total_loss / len(train_loader)
            print(f"Average Training Loss: {avg_loss:.4f}")

        # Save the retrained model, replacing the existing model
        model.save_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_model\bert_model")
        tokenizer.save_pretrained(r"C:\Users\ngaga\OneDrive\Desktop\cloudsek assignment\1st problem\bert_tokenizer\bert_tokenizer")

        # predict_sentiment() and generate_embeddings() feed CPU tensors to the
        # global `model`, so hand it back on the CPU and in evaluation mode.
        model.to("cpu")
        model.eval()

        print("Model retraining complete and saved.")
    else:
        print("No significant drift detected. No predictions needed.")
else:
    print(f"Number of reviews: {len(review_df)}. Waiting for at least {MIN_REVIEWS_FOR_DRIFT_CHECK} reviews.")