In [None]:
# Install Transformers
!pip install transformers

# Import necessary libraries
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

print("Libraries imported successfully!")


Libraries imported successfully!


In [None]:
from google.colab import files

# Step 2: Upload negative review files
print("Please upload all negative review files (e.g., books_negative.review, dvd_negative.review, etc.).")
uploaded_negatives = files.upload()


Please upload all negative review files (e.g., books_negative.review, dvd_negative.review, etc.).


Saving books_negative.review to books_negative.review
Saving dvd_negative.review to dvd_negative.review
Saving electronics_negative.review to electronics_negative.review
Saving kitchen_negative.review to kitchen_negative.review


In [None]:
# Step 3: Combine negative review files
def extract_reviews(file_path, fields=None):
    """Parse a pseudo-XML ``.review`` file into one string per review.

    The file is read line by line. A line equal to ``<review>`` starts a new
    record, a line equal to ``</review>`` closes it, and any other line that
    starts with ``<`` is treated as markup and never copied into the output.
    The remaining lines of a record are stripped and joined with single spaces.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        fields: Optional tag name or iterable of tag names (for example
            ``["review_text"]``). When given, only text found between
            ``<name>`` and ``</name>`` for one of these names is kept.
            When ``None`` (the default) the text of *every* field is kept,
            which also pulls metadata such as ids, titles and ratings into
            the returned strings.

    Returns:
        list[str]: One joined string per ``<review>...</review>`` record, in
        file order. A record with no retained text yields an empty string.
    """
    if isinstance(fields, str):
        # A bare string would otherwise be split into single characters.
        fields = [fields]
    wanted = None if fields is None else set(fields)

    reviews = []
    parts = []
    current_field = None  # name of the field whose text is being read
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of loading the whole file with readlines().
        for raw_line in file:
            line = raw_line.strip()
            if line == "<review>":
                parts = []
                current_field = None
            elif line == "</review>":
                reviews.append(" ".join(parts))
            elif line.startswith("<"):
                # Markup line: skipped in every mode. Field tracking only
                # reacts to a closing tag that matches the open field and to
                # an opening tag seen outside any field, so stray markup
                # inside the text (e.g. "<br>") does not end the field.
                inner = line[1:-1] if line.endswith(">") else ""
                if inner.startswith("/"):
                    if inner[1:] == current_field:
                        current_field = None
                elif current_field is None and inner.replace("_", "").isalnum():
                    current_field = inner
            elif wanted is None or current_field in wanted:
                parts.append(line)
    return reviews

# Combine all negative reviews
# `uploaded_negatives` is the object returned by files.upload(); its keys are
# passed to extract_reviews() as file paths.
negative_reviews = []
for file_name in uploaded_negatives.keys():
    reviews = extract_reviews(file_name)
    negative_reviews.extend(reviews)

# Create a DataFrame
# One row per extracted review string, in a single "review" column.
negative_df = pd.DataFrame(negative_reviews, columns=["review"])
negative_df["label"] = "negative"  # Add label column (same value for every row)

# Quick sanity check: row/column counts and the first five rows.
print(f"Combined negative reviews shape: {negative_df.shape}")
print(negative_df.head())


Combined negative reviews shape: (4000, 2)
                                              review     label
0  0312355645:horrible_book,_horrible.:mark_gospr...  negative
1  1559278676:shallow_self-indulgence:joseph_s._p...  negative
2  1559278676:horrible_book,_horrible.:mark_gospr...  negative
3  0425193373:disappointment:reader_"reader" 0425...  negative
4  0142004030:a_disappointing_mess:a._ross 014200...  negative


In [None]:
from google.colab import files

# Step 4: Upload positive review files
print("Please upload all positive review files (e.g., books_positive.review, dvd_positive.review, etc.).")
uploaded_positives = files.upload()


Please upload all positive review files (e.g., books_positive.review, dvd_positive.review, etc.).


Saving books_positive.review to books_positive.review
Saving dvd_positive.review to dvd_positive.review
Saving electronics_positive.review to electronics_positive.review
Saving kitchen_positive.review to kitchen_positive.review


In [None]:
# Step 5: Combine positive review files
# `uploaded_positives` is the object returned by files.upload(); its keys are
# passed to extract_reviews() as file paths.
positive_reviews = []
for file_name in uploaded_positives.keys():
    reviews = extract_reviews(file_name)
    positive_reviews.extend(reviews)

# Create a DataFrame
# One row per extracted review string, in a single "review" column.
positive_df = pd.DataFrame(positive_reviews, columns=["review"])
positive_df["label"] = "positive"  # Add label column (same value for every row)

# Quick sanity check: row/column counts and the first five rows.
print(f"Combined positive reviews shape: {positive_df.shape}")
print(positive_df.head())


Combined positive reviews shape: (4000, 2)
                                              review     label
0  0785758968:one_of_the_best_crichton_novels:jos...  positive
1  0452279550:the_medicine_of_the_future:wafa_ras...  positive
2  1599620065:beautiful!:sarah_silva_"sar" 159962...  positive
3  0743277724:for_lovers_of_robicheaux:g._roussea...  positive
4  061318114X:excellent_and_broad_survey_of_the_d...  positive


In [None]:
# Step 6: Combine positive and negative reviews
# Negative rows come first, then positive rows; ignore_index=True gives the
# combined frame a fresh 0..N-1 index instead of repeating each frame's own.
all_reviews_df = pd.concat([negative_df, positive_df], ignore_index=True)

print(f"Total combined dataset shape: {all_reviews_df.shape}")
print(all_reviews_df.head())  # Start of the frame: negative rows
print(all_reviews_df.tail())  # To check the combination (end of the frame: positive rows)


Total combined dataset shape: (8000, 2)
                                              review     label
0  0312355645:horrible_book,_horrible.:mark_gospr...  negative
1  1559278676:shallow_self-indulgence:joseph_s._p...  negative
2  1559278676:horrible_book,_horrible.:mark_gospr...  negative
3  0425193373:disappointment:reader_"reader" 0425...  negative
4  0142004030:a_disappointing_mess:a._ross 014200...  negative
                                                 review     label
7995  B000AQQOF4:great_filter--_noticably_reduces_du...  positive
7996  B0000VCYHG:quite_pleased.:amber_a._mull_"psych...  positive
7997  B00005AL7B:does_the_job:nujoi B00005AL7B All-C...  positive
7998  B000063D4W:pleased!: B000063D4W Non&#45;Stop S...  positive
7999  B000FIR48S:love_it!:p._davis B000FIR48S Remanu...  positive


In [None]:
# Step 7: Clean the review text
def clean_review_text(text):
    """Normalise one raw review string to lowercase, letters-only text.

    Steps, in order:
      1. If the string starts with a whitespace-free token of the form
         ``id:title:reviewer`` (at least two colons), drop that whole token.
         Requiring both colons inside the *first token* keeps the pattern from
         eating ordinary text such as ``"Note: great value. Really: yes"``.
      2. Delete apostrophes so contractions stay a single word
         (``don't`` -> ``dont``).
      3. Replace every other run of non-letter characters with one space, so
         words joined only by punctuation are separated rather than fused
         (``self-indulgence`` -> ``self indulgence``).
      4. Lowercase and trim.

    Args:
        text: Raw review string.

    Returns:
        str: The cleaned text; may be empty.
    """
    # Drop the leading "id:title:reviewer" token, including the reviewer handle.
    text = re.sub(r"^\w+:[^:\s]*:\S*", "", text)
    # Remove straight and typographic apostrophes without inserting a space.
    text = re.sub(r"['\u2019]", "", text)
    # Any other non-letter run (digits, punctuation, whitespace) becomes one space.
    text = re.sub(r"[^a-zA-Z]+", " ", text)
    # Convert to lowercase
    return text.lower().strip()

# Apply cleaning to the review column
# The result goes into a new "cleaned_review" column; the raw "review" column
# is left in place so both can be compared side by side below.
all_reviews_df["cleaned_review"] = all_reviews_df["review"].apply(clean_review_text)

print("Sample cleaned reviews:")
print(all_reviews_df[["review", "cleaned_review", "label"]].head())


Sample cleaned reviews:
                                              review  \
0  0312355645:horrible_book,_horrible.:mark_gospr...   
1  1559278676:shallow_self-indulgence:joseph_s._p...   
2  1559278676:horrible_book,_horrible.:mark_gospr...   
3  0425193373:disappointment:reader_"reader" 0425...   
4  0142004030:a_disappointing_mess:a._ross 014200...   

                                      cleaned_review     label  
0  markgospri  running with scissors a memoir boo...  negative  
1  josephsperrottavidreader  running with scissor...  negative  
2  markgospri  running with scissors a memoir boo...  negative  
3  readerreader  fierce conversations achieving s...  negative  
4  aross  lost in a good book thursday next novel...  negative  


In [None]:
# Step 8: Tokenize the data using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the reviews
def tokenize_data(reviews, tokenizer, max_length=128):
    """Encode a collection of texts with the supplied tokenizer callable.

    Args:
        reviews: Iterable of texts; it is materialised into a list first.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Value forwarded as ``max_length``; every sequence is
            padded to it (``padding="max_length"``) and truncation is enabled.

    Returns:
        Whatever ``tokenizer`` returns for the call, requested with
        ``return_tensors="pt"``.
    """
    texts = [*reviews]
    encode_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Tokenize the cleaned reviews
# Uses tokenize_data's default max_length (128).
tokenized_data = tokenize_data(all_reviews_df["cleaned_review"], tokenizer)

# Add tokenized data to the DataFrame
# .tolist() converts each tensor into nested Python lists, so every row holds
# its own list of token ids / mask values.
all_reviews_df["input_ids"] = tokenized_data["input_ids"].tolist()
all_reviews_df["attention_mask"] = tokenized_data["attention_mask"].tolist()

print("Sample tokenized data:")
print(all_reviews_df[["cleaned_review", "input_ids", "attention_mask", "label"]].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Sample tokenized data:
                                      cleaned_review  \
0  markgospri  running with scissors a memoir boo...   
1  josephsperrottavidreader  running with scissor...   
2  markgospri  running with scissors a memoir boo...   
3  readerreader  fierce conversations achieving s...   
4  aross  lost in a good book thursday next novel...   

                                           input_ids  \
0  [101, 2928, 12333, 18098, 2072, 2770, 2007, 25...   
1  [101, 3312, 17668, 21709, 2696, 17258, 16416, ...   
2  [101, 2928, 12333, 18098, 2072, 2770, 2007, 25...   
3  [101, 8068, 16416, 4063, 9205, 11450, 10910, 1...   
4  [101, 12098, 15094, 2439, 1999, 1037, 2204, 23...   

                                      attention_mask     label  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  negative  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  negative  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  negative  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Verify the shape of tokenized data
print(f"Input IDs shape: {tokenized_data['input_ids'].shape}")
print(f"Attention mask shape: {tokenized_data['attention_mask'].shape}")
print(f"Sample Input IDs: {tokenized_data['input_ids'][0]}")
print(f"Sample Attention Mask: {tokenized_data['attention_mask'][0]}")


Input IDs shape: torch.Size([8000, 128])
Attention mask shape: torch.Size([8000, 128])
Sample Input IDs: tensor([  101,  2928, 12333, 18098,  2072,  2770,  2007, 25806,  1037, 12558,
         2808, 20758,  2078, 25991,  2808,  1997,  9202,  2338,  9202,  2281,
         2928,  2175, 13102,  3089,  2023,  2338,  2001,  9202,  2065,  2009,
         2001,  2825,  2000,  3446,  2009,  2896,  2084,  2028,  2732,  1045,
         2052,  2031,  1045,  2572,  2019, 18568,  8068,  1998,  3856,  2023,
         2338,  2039,  2044,  2026,  3566,  2018,  5407,  2009,  2013,  1037,
         2767,  1045,  3191,  2431,  1997,  2009,  6114,  2013,  1037, 14978,
         1996,  2972,  2051,  1998,  2059,  2288,  2000,  1996,  2112,  2055,
         1996,  3276,  1996,  2095,  2214,  2879,  2018,  2007,  1037,  2095,
         2214,  2158,  1998,  1045,  5507,  2023,  2338,  2006,  2543,  2028,
         2625,  6100,  1999,  1996,  2088,  5280,  2102,  5949,  2115,  2769,
         1045,  4299,  1045,  2018,  

In [None]:
import torch

In [None]:
# Import torch
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert labels to tensors
# "positive" -> 1, anything else -> 0.
labels = torch.tensor(all_reviews_df["label"].apply(lambda x: 1 if x == "positive" else 0).values)

# Create the TensorDataset
# Each item is an (input_ids, attention_mask, label) triple.
dataset = TensorDataset(
    tokenized_data["input_ids"],
    tokenized_data["attention_mask"],
    labels
)

# Split the dataset
# 80% train; the test set takes the remainder so the two sizes always sum to
# len(dataset) even when the 80% figure is rounded down.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split with its own generator so the same rows land in the test set
# on every run; otherwise the reported accuracy is measured on a different
# sample each time the notebook is executed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders
# Only the training loader shuffles; evaluation order does not matter.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")


Training dataset size: 6400
Testing dataset size: 1600


In [None]:
from transformers import DistilBertForSequenceClassification

# Load the pre-trained DistilBERT model for binary classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # Binary classification (positive/negative)
)

# Move the model to the available device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer
# Use PyTorch's AdamW: the AdamW class that used to be exported by
# `transformers` is deprecated and missing from newer releases, so importing
# it breaks with an unpinned install. eps and weight_decay are set explicitly
# to the values the transformers implementation used by default, because
# torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

print("Model loaded and optimizer initialized.")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and optimizer initialized.




In [None]:
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

# Training loop
def train_model(model, train_loader, optimizer, device, epochs=3):
    """Fine-tune ``model`` on ``train_loader`` for a number of epochs.

    Each batch is unpacked as ``(input_ids, attention_mask, labels)``, moved
    to ``device`` and passed to the model together with its labels; the loss
    comes from ``outputs.loss``, so no separate loss function is needed.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        train_loader: Sized iterable of 3-tuples of tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Average training loss of each epoch, in order. An epoch
        over an empty loader reports ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    model.train()  # Set the model to training mode
    num_batches = len(train_loader)
    epoch_losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        # Label the progress bar up front instead of re-setting it every batch.
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")
        for batch in loop:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()  # Clear gradients from the previous step
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()  # Backpropagation
            optimizer.step()  # Update model weights

            # Read the scalar once and reuse it for both the running total
            # and the progress bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            loop.set_postfix(loss=batch_loss)

        average_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(average_loss)
        print(f"Epoch {epoch+1} completed. Average Loss: {average_loss:.4f}")

    return epoch_losses

# Train the model for 3 epochs
train_model(model, train_loader, optimizer, device, epochs=3)


Epoch 1: 100%|██████████| 400/400 [1:11:52<00:00, 10.78s/it, loss=0.456]


Epoch 1 completed. Average Loss: 0.3718


Epoch 2: 100%|██████████| 400/400 [1:10:59<00:00, 10.65s/it, loss=0.06]


Epoch 2 completed. Average Loss: 0.1861


Epoch 3:   2%|▎         | 10/400 [01:56<1:15:53, 11.68s/it, loss=0.115]


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score

# Define the evaluation function
def evaluate_model(model, test_loader, device):
    """Compute, print and return the classification accuracy on ``test_loader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches.
    """
    model.eval()  # Evaluation mode for the whole pass
    predicted, expected = [], []
    # Gradients are not needed while scoring.
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # The highest-scoring class index is the prediction.
            batch_predictions = torch.argmax(logits, dim=-1)
            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(labels.cpu().numpy())
    accuracy = accuracy_score(expected, predicted)
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy


In [None]:
# Evaluate the model on the test dataset
evaluate_model(model, test_loader, device)


Test Accuracy: 0.8850


0.885

In [None]:
# Save the trained model and tokenizer
model.save_pretrained("/content/sentiment-model")
tokenizer.save_pretrained("/content/sentiment-model")

print("Model and tokenizer saved to /content/sentiment-model")


Model and tokenizer saved to /content/sentiment-model


In [None]:
# Check the saved files
import os

saved_dir = "/content/sentiment-model"
print("Saved files:", os.listdir(saved_dir))


Saved files: ['model.safetensors', 'vocab.txt', 'special_tokens_map.json', 'config.json', 'tokenizer_config.json']


In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the saved model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("/content/sentiment-model")
tokenizer = DistilBertTokenizer.from_pretrained("/content/sentiment-model")
model.to(device)  # Move the model to the appropriate device

# Define a function to make predictions
def predict_sentiment(texts, model, tokenizer, device):
    """Classify each text as ``"negative"`` or ``"positive"``.

    Args:
        texts: Texts to classify, passed to ``tokenizer`` as given.
        model: Model called with the tokenizer output as keyword arguments;
            its result must expose ``.logits``.
        tokenizer: Callable producing an object that has ``.to(device)`` and
            can be unpacked with ``**``.
        device: Device the encoded inputs are moved to.

    Returns:
        list[str]: One label per row of logits; class index 0 maps to
        ``"negative"`` and index 1 to ``"positive"``.
    """
    model.eval()
    # Tokenize input texts
    inputs = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)
    # Get model predictions
    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
    # Map predictions to labels
    labels = ["negative", "positive"]
    return [labels[pred] for pred in predictions]

# Test the model on new samples
test_samples = [
    "I absolutely love this product!",
    "This is the worst experience I've ever had.",
    "It’s okay, but could be better."
]

predictions = predict_sentiment(test_samples, model, tokenizer, device)
for text, sentiment in zip(test_samples, predictions):
    print(f"Text: {text} -> Sentiment: {sentiment}")


Text: I absolutely love this product! -> Sentiment: positive
Text: This is the worst experience I've ever had. -> Sentiment: negative
Text: It’s okay, but could be better. -> Sentiment: positive


In [None]:
from google.colab import files
!zip -r sentiment-model.zip /content/sentiment-model
files.download("sentiment-model.zip")


  adding: content/sentiment-model/ (stored 0%)
  adding: content/sentiment-model/model.safetensors (deflated 8%)
  adding: content/sentiment-model/vocab.txt (deflated 53%)
  adding: content/sentiment-model/special_tokens_map.json (deflated 42%)
  adding: content/sentiment-model/config.json (deflated 46%)
  adding: content/sentiment-model/tokenizer_config.json (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install flask




In [None]:
from flask import Flask, request, render_template
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Initialize Flask app
app = Flask(__name__)

# Load the trained model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("/content/sentiment-model")
tokenizer = DistilBertTokenizer.from_pretrained("/content/sentiment-model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define prediction function
def predict_sentiment(text):
    """Classify one text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Text to classify.

    Returns:
        str: ``"positive"`` when the highest-scoring class index of the first
        row of logits is 1, otherwise ``"negative"``.
    """
    model.eval()
    inputs = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)
    # Inference only: skip autograd bookkeeping so no graph is built per request.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]
    return "positive" if prediction == 1 else "negative"

# Define routes
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the form page and, on POST, the sentiment of the submitted text.

    GET requests, and POST requests whose ``user_input`` field is missing or
    contains only whitespace, render the template with ``sentiment=None`` so
    no result is shown instead of classifying empty text.
    """
    sentiment = None
    if request.method == "POST":
        # .get() avoids an error response when the field is absent.
        user_input = request.form.get("user_input", "").strip()
        if user_input:
            sentiment = predict_sentiment(user_input)
    return render_template("index.html", sentiment=sentiment)

if __name__ == "__main__":
    # Debug mode is off on purpose: with host="0.0.0.0" the server listens on
    # every interface, and debug mode would expose Werkzeug's interactive
    # debugger (arbitrary code execution) to anyone who can reach it. It also
    # starts the auto-reloader, which re-executes the process and does not
    # work from inside a notebook kernel.
    app.run(debug=False, host="0.0.0.0")


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
!pip install flask-ngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
!pip install flask pyngrok transformers


Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.1


In [None]:
from flask import Flask, request, render_template
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from pyngrok import ngrok

# Initialize Flask app
app = Flask(__name__)

# Load the trained model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("/content/sentiment-model")
tokenizer = DistilBertTokenizer.from_pretrained("/content/sentiment-model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define prediction function
def predict_sentiment(text):
    """Classify one text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Text to classify.

    Returns:
        str: ``"positive"`` when the highest-scoring class index of the first
        row of logits is 1, otherwise ``"negative"``.
    """
    model.eval()
    inputs = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)
    # Inference only: skip autograd bookkeeping so no graph is built per request.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]
    return "positive" if prediction == 1 else "negative"


In [None]:
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the form page and, on POST, the sentiment of the submitted text.

    GET requests, and POST requests whose ``user_input`` field is missing or
    contains only whitespace, render the template with ``sentiment=None`` so
    no result is shown instead of classifying empty text.
    """
    sentiment = None
    if request.method == "POST":
        # .get() avoids an error response when the field is absent.
        user_input = request.form.get("user_input", "").strip()
        if user_input:
            sentiment = predict_sentiment(user_input)
    return render_template("index.html", sentiment=sentiment)


In [None]:
import os

# Create the 'templates' directory
os.makedirs("templates", exist_ok=True)


In [None]:
# Save the HTML file in the 'templates' folder
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sentiment Analysis</title>
</head>
<body>
    <h1>Sentiment Analysis App</h1>
    <form method="POST">
        <label for="user_input">Enter your text:</label><br><br>
        <textarea id="user_input" name="user_input" rows="4" cols="50" placeholder="Type something here..."></textarea><br><br>
        <button type="submit">Analyze Sentiment</button>
    </form>
    {% if sentiment %}
    <h2>Result: The sentiment is {{ sentiment }}.</h2>
    {% endif %}
</body>
</html>
"""

# Write the content to the file
# The page declares <meta charset="UTF-8">, so write it as UTF-8 explicitly
# rather than relying on the platform's default encoding.
with open("templates/index.html", "w", encoding="utf-8") as file:
    file.write(html_content)

print("HTML file saved in the 'templates' folder.")


HTML file saved in the 'templates' folder.


In [None]:
# Check the contents of the templates folder
os.listdir("templates")


['index.html']

In [None]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
!pip install flask gradio transformers




In [None]:
import gradio as gr
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Load the trained model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained("/content/sentiment-model")
tokenizer = DistilBertTokenizer.from_pretrained("/content/sentiment-model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define prediction function
def predict_sentiment(text):
    """Classify one text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Text to classify.

    Returns:
        str: ``"positive"`` when the highest-scoring class index of the first
        row of logits is 1, otherwise ``"negative"``.
    """
    model.eval()
    inputs = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)
    # Inference only: skip autograd bookkeeping so no graph is built per request.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]
    return "positive" if prediction == 1 else "negative"

# Create Gradio interface
# predict_sentiment takes one string and returns one string, which matches a
# single textbox input and a plain-text output.
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=2, placeholder="Enter text to analyze sentiment"),
    outputs="text",
    title="Sentiment Analysis App",
    description="Enter text to determine if the sentiment is positive or negative."
)

# Launch the Gradio interface
# Called with default arguments.
iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7f6a4751feefa6db4d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


