In [16]:
import pandas as pd
import numpy as np
import spacy
import re
import contractions
import multiprocessing as mp
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from collections import defaultdict
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import RobertaTokenizer
from nltk.corpus import stopwords
import nltk
from tqdm.contrib.concurrent import process_map
from sklearn.utils.class_weight import compute_class_weight
from spacy.tokens import Doc
import cupy
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

# Initialize spaCy with efficient settings
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"], exclude=["cupy"])
nltk.download('stopwords')

# Configuration
NUM_ASPECTS = 26
BATCH_SIZE = 32
MAX_LENGTH = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER = RobertaTokenizer.from_pretrained("roberta-base")


# Aspect Seed Keywords
ASPECT_SEEDS = {
    "usability": ["interface", "ui", "ease", "easy", "use", "navigate", "use", "friendly", "intuitive", "clumsy"],
    "performance": ["speed", "fast", "slow", "crash", "freeze", "lag", "performance"],
    "pricing": ["price", "cost", "expensive", "cheap", "deal", "value", "overpriced"],
    "installation": ["install", "setup", "download", "activate", "registration"],
    "features": ["feature", "functionality", "option", "tool", "missing", "limited"],
    "support": ["customer", "service", "support", "help", "response"],
    "compatibility": ["compatible", "work", "windows", "mac", "version"],
    "security": ["virus", "malware", "hack", "secure", "protection"],
    "design": ["design", "aesthetic", "look", "feel", "appearance", "modern", "outdated", "theme", "color"],
    "update": ["update", "upgrade", "patch", "version", "release", "changelog", "fixes", "improvement", "bugfix"],
    "reliability": ["reliable", "unreliable", "consistent", "crash", "bug", "stable", "error", "issue"],
    "accessibility": ["accessible", "accessibility", "screen reader", "font size", "color blind", "contrast", "voice control"],
    "integration": ["integrate", "integration", "connect", "API", "sync", "third-party", "external", "plugin"],
    "learning_curve": ["learn", "tutorial", "guide", "documentation", "manual", "training", "hard to learn", "easy to learn"],
    "account": ["account", "login", "signup", "password", "authentication", "profile", "credentials", "logout"],
    "documentation": ["documentation", "manual", "instructions", "readme", "guide", "faq", "reference", "how-to"],
    "availability": ["available", "availability", "download", "region", "country", "store", "platform", "restricted"],
    "licensing": ["license", "licensed", "open-source", "freeware", "trial", "terms", "agreement", "policy"],
    "interface_language": ["language", "translate", "localization", "multilingual", "support language", "region"],
    "update_policy": ["update frequency", "auto-update", "manual update", "changelog", "version history"],
    "usage_context": ["business", "enterprise", "education", "home", "personal", "professional", "students"],
    "device_requirements": ["ram", "storage", "requirement", "specs", "CPU", "memory", "disk space"],
    "time_usage": ["hours", "duration", "short term", "long term", "daily", "weekly", "immediate"],
    "audience": ["beginner", "novice", "intermediate", "expert", "developer", "admin", "non-technical"],
    "availability_status": ["beta", "alpha", "release", "preview", "stable", "deprecated"],
    "branding": ["brand", "logo", "company", "publisher", "vendor", "developer"]
}



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Text Preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = contractions.fix(text.lower())
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return re.sub(r'\s+', ' ', text).strip()

In [26]:
# Enhanced Aspect Extraction with Parallel Processing
def extract_aspect_terms_cpu(texts, batch_size=32):
    aspect_results = []
    
    # Process in batches with progress bar
    for doc in tqdm(nlp.pipe(texts, batch_size=batch_size), 
                   total=len(texts),
                   desc="Extracting aspects"):
        terms = []
        
        # Extract noun phrases
        for chunk in doc.noun_chunks:
            term = chunk.text.lower()
            for aspect, keywords in ASPECT_SEEDS.items():
                if any(keyword in term for keyword in keywords):
                    terms.append((term, aspect))
        
        # Extract verbs
        for token in doc:
            if token.pos_ == "VERB":
                for aspect, keywords in ASPECT_SEEDS.items():
                    if any(keyword in token.text.lower() for keyword in keywords):
                        terms.append((token.text.lower(), aspect))
        
        aspect_results.append(terms if terms else [("general", "general")])
    
    return aspect_results

In [5]:
# Dataset Loading with Chunking Support
def load_and_preprocess(file_path, chunksize=None):
    """Load and preprocess dataset with optional chunking for large files"""
    if chunksize:
        chunks = []
        for chunk in pd.read_csv(file_path, chunksize=chunksize):
            chunk = chunk.dropna(subset=['reviewText']).drop_duplicates(subset=['reviewText'])
            chunk['cleaned_text'] = chunk['reviewText'].apply(preprocess_text)
            chunks.append(chunk)
        return pd.concat(chunks)
    else:
        data = pd.read_csv(file_path)
        data = data.dropna(subset=['reviewText']).drop_duplicates(subset=['reviewText'])
        data['cleaned_text'] = data['reviewText'].apply(preprocess_text)
        return data

In [51]:
#Clustering Aspects
def cluster_aspects(data, n_clusters=NUM_ASPECTS, return_models=False):
    
    texts = data['cleaned_text'].tolist()
    
    # Use CPU extraction
    aspect_results = extract_aspect_terms_cpu(texts)
    data['extracted_aspects'] = aspect_results
    
    all_terms = [term for sublist in aspect_results for term, _ in sublist if term != 'general']
    
    if not all_terms:
        data['final_aspects'] = [['general']] * len(data)
        return data
    
    # Sparse matrix operations
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform(all_terms)
    
    # MiniBatchKMeans handles sparse matrices well
    kmeans = MiniBatchKMeans(
        n_clusters=min(n_clusters, len(all_terms)),
        random_state=42,
        batch_size=1000
    )
    clusters = kmeans.fit_predict(tfidf_matrix)
    
    # Aspect naming
    term_to_cluster = defaultdict(list)
    for term, cluster_id in zip(all_terms, clusters):
        term_to_cluster[cluster_id].append(term)
    
    aspect_names = {}
    for cluster_id, terms in term_to_cluster.items():
        scores = []
        for aspect, keywords in ASPECT_SEEDS.items():
            exact_matches = sum(1 for term in terms if any(keyword == term for keyword in keywords))
            partial_matches = sum(1 for term in terms if any(keyword in term for keyword in keywords))
            score = 0.7 * exact_matches + 0.3 * partial_matches
            scores.append((aspect, score))
        
        best_aspect = max(scores, key=lambda x: x[1])[0] if max(scores, key=lambda x: x[1])[1] > 0 else "general"
        aspect_names[cluster_id] = best_aspect
    
    def map_to_aspect(terms):
        if not terms or terms[0][0] == 'general':
            return ['general']
        term_vecs = vectorizer.transform([term for term, _ in terms if term != 'general'])
        return list(set(aspect_names[pred] for pred in kmeans.predict(term_vecs)))
    
    data['final_aspects'] = data['extracted_aspects'].apply(map_to_aspect)
    return (data, vectorizer, kmeans) if return_models else data

In [18]:
# Dynamic Aspect Discovery
def discover_new_aspects(texts, top_n=20):

    noun_phrases = []
    for doc in nlp.pipe(texts, batch_size=1000):
        noun_phrases.extend([chunk.text.lower() for chunk in doc.noun_chunks])
    
    vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
    X = vectorizer.fit_transform(noun_phrases)
    terms = vectorizer.get_feature_names_out()
    counts = np.array(X.sum(axis=0))[0]
    
    existing_terms = set().union(*ASPECT_SEEDS.values())
    new_terms = [(term, count) for term, count in zip(terms, counts) 
                if term not in existing_terms and count > 10]
    
    return sorted(new_terms, key=lambda x: x[1], reverse=True)[:top_n]

In [8]:
# Prepare ABSA Dataset
def prepare_absa_dataset(data):
    aspect_data = []
    for idx, row in data.iterrows():
        for aspect in row['final_aspects']:
            aspect_data.append({
                'text': row['reviewText'],
                'aspect': aspect,
                'label': int(row['overall'] + 1)  # Map -1->0, 0->1, 1->2
            })
    aspect_df = pd.DataFrame(aspect_data)
    train_df, val_df = train_test_split(aspect_df, test_size=0.2, random_state=42)
    return train_df, val_df

In [9]:
# Define Models
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_labels):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embed_dim)) for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_labels)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        x = x.unsqueeze(1)
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max_pool1d(pool, pool.size(2)).squeeze(2) for pool in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])

class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_labels, dropout=0.3):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=num_layers, dropout=0.3)
        self.bn = nn.BatchNorm1d(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        _, (h_n, _) = self.bilstm(x)
        h_n = torch.cat((h_n[-2], h_n[-1]), dim=1)
        x = self.bn(h_n)
        return self.fc(x)

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_labels, dropout=0.3):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask=None):
        embedded = self.embedding(input_ids)
        if attention_mask is not None:
            lengths = attention_mask.sum(dim=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            packed_out, _ = self.gru(packed)
            gru_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        else:
            gru_out, _ = self.gru(embedded)
        pooled = torch.mean(gru_out, dim=1)
        return self.fc(pooled)

In [10]:
# ABSA Dataset and Training 
class ABSADataset(Dataset):
    def __init__(self, df, tokenizer, max_length=64):
        self.texts = df['text'].tolist()
        self.aspects = df['aspect'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        aspect = str(self.aspects[idx])
        input_text = f"{aspect} {text}"
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [11]:
def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [12]:
def evaluate_model(models, dataloader, is_ensemble=False):
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            if is_ensemble:
                preds = ensemble_predict(models, input_ids, attention_mask)
            else:
                model = models[0]
                model.eval()
                outputs = model(input_ids, attention_mask)
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions), \
           precision_score(true_labels, predictions, average='weighted', zero_division=0), \
           recall_score(true_labels, predictions, average='weighted', zero_division=0), \
           f1_score(true_labels, predictions, average='weighted', zero_division=0), \
           confusion_matrix(true_labels, predictions)

In [65]:
# Aspect Clustering for Single Text
def cluster_aspects_for_text(text, vectorizer=None, kmeans=None):
    cleaned_text = preprocess_text(text)
    aspect_results = extract_aspect_terms_cpu([cleaned_text])
    data = pd.DataFrame({'cleaned_text': [cleaned_text], 'extracted_aspects': [aspect_results[0]]})
    
    # Extract terms safely
    all_terms = []
    for sublist in data['extracted_aspects']:
        for item in sublist:
            if isinstance(item, tuple) and len(item) == 2 and item[0] != 'unknown':
                all_terms.append(item[0])
            elif item == 'general':
                all_terms.append('general')
    
    if not all_terms or all(t == 'general' for t in all_terms):
        data['final_aspects'] = [['general']]
        return data['final_aspects'].iloc[0]
    
    # Initialize or use provided vectorizer
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(all_terms)
    else:
        tfidf_matrix = vectorizer.transform(all_terms)
    
    # Use the number of aspects in ASPECT_SEEDS as maximum clusters
    max_aspects = len(ASPECT_SEEDS)
    
    # Initialize or use provided KMeans
    if kmeans is None:
        n_clusters = min(max_aspects, len(set(all_terms)))  # Don't exceed number of defined aspects
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(tfidf_matrix)
    else:
        clusters = kmeans.predict(tfidf_matrix)
        n_clusters = min(kmeans.n_clusters, max_aspects)  # Cap at our max aspects
    
    # Map terms to clusters
    term_to_cluster = defaultdict(list)
    for term, cluster_id in zip(all_terms, clusters):
        term_to_cluster[cluster_id].append(term)
    
    # Name clusters based on seed terms - ensure we cover all possible cluster IDs
    aspect_names = {}
    for cluster_id in range(n_clusters):
        cluster_terms = term_to_cluster.get(cluster_id, [])
        max_overlap = 0
        aspect_name = f"aspect_{cluster_id}"
        
        # Check against all aspect seeds
        for aspect, seeds in ASPECT_SEEDS.items():
            overlap = len(set(cluster_terms).intersection(seeds))
            if overlap > max_overlap:
                max_overlap = overlap
                aspect_name = aspect
        
        # If no strong match, try partial matches
        if max_overlap == 0:
            for aspect, seeds in ASPECT_SEEDS.items():
                if any(seed in term for term in cluster_terms for seed in seeds):
                    aspect_name = aspect
                    break
        
        aspect_names[cluster_id] = aspect_name
    
    def map_to_cluster_aspect(terms):
        if not terms:
            return ['general']
        final_aspects = []
        for item in terms:
            if isinstance(item, tuple) and len(item) == 2 and item[0] != 'unknown':
                term = item[0]
                try:
                    prediction = kmeans.predict(vectorizer.transform([term]))[0]
                    # Ensure prediction is within our aspect_names
                    if prediction < n_clusters:  # Check cluster ID is valid
                        final_aspects.append(aspect_names[prediction])
                    else:
                        # Find the most similar aspect
                        term_vec = vectorizer.transform([term])
                        closest_cluster = kmeans.predict(term_vec)[0]
                        if closest_cluster < n_clusters:
                            final_aspects.append(aspect_names[closest_cluster])
                        else:
                            final_aspects.append('general')
                except:
                    final_aspects.append('general')
            elif item == 'general':
                final_aspects.append('general')
        return list(set(final_aspects)) if final_aspects else ['general']
    
    data['final_aspects'] = data['extracted_aspects'].apply(map_to_cluster_aspect)
    return data['final_aspects'].iloc[0]

In [13]:
def ensemble_predict(models, input_ids, attention_mask=None, T=2.0):
    num_classes = models[0].fc.out_features
    votes = np.zeros((input_ids.shape[0], num_classes))
    with torch.no_grad():
        layer_norm = torch.nn.LayerNorm(num_classes).to(DEVICE)
        for model in models:
            model.eval()
            logits = model(input_ids, attention_mask)
            logits = layer_norm(logits)
            logits /= T
            votes += logits.cpu().numpy()
    final_probs = torch.softmax(torch.tensor(votes), dim=1).numpy()
    return np.argmax(final_probs, axis=1)

In [14]:
def predict_text_absa(text, aspects, tokenizer, models, max_length=64):
    results = {}
    for aspect in aspects:
        input_text = f"{aspect} {text}"
        encoding = tokenizer(
            input_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(DEVICE)
        attention_mask = encoding['attention_mask'].to(DEVICE)
        pred = ensemble_predict(models, input_ids, attention_mask)[0]
        results[aspect] = {0: "Negative", 1: "Neutral", 2: "Positive"}[pred]
    return results

In [52]:
# Main Execution
if __name__ == "__main__":
    # Load and preprocess data
    data = load_and_preprocess("../dataset/Amazon/software/combined_output.csv", chunksize=50000)

In [20]:
    # Discover new aspects
    new_terms = discover_new_aspects(data['cleaned_text'].sample(100000).tolist())
    print("Discovered new terms:", new_terms)
    
     # Incorporate new terms into ASPECT_SEEDS
    for term, count in new_terms:
        if count > 50:  # Only add frequently occurring terms
            if term not in set().union(*ASPECT_SEEDS.values()):
                ASPECT_SEEDS.setdefault("new_" + term, []).append(term)

Discovered new terms: [('software', 33748), ('product', 29262), ('program', 26185), ('computer', 17389), ('one', 15058), ('new', 14214), ('great', 13777), ('time', 13258), ('good', 12030), ('years', 9192), ('tax', 9001), ('money', 8925), ('amazon', 8239), ('problem', 7850), ('many', 7719), ('system', 7395), ('norton', 7243), ('pc', 7238), ('quicken', 7237), ('microsoft', 7156)]


In [54]:
    # Cluster aspects
    data, vectorizer, kmeans = cluster_aspects(data, return_models=True)

Extracting aspects: 100%|█████████████████████████████████████████████████████| 360261/360261 [23:59<00:00, 250.28it/s]


In [55]:
    # Save TF-IDF vectorizer
    with open('tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)
    
    # Save KMeans clustering model
    with open('kmeans_model.pkl', 'wb') as f:
        pickle.dump(kmeans, f)

In [29]:
    # Prepare dataset for training
    train_df, val_df = prepare_absa_dataset(data)
    train_dataset = ABSADataset(train_df, TOKENIZER, MAX_LENGTH)
    val_dataset = ABSADataset(val_df, TOKENIZER, MAX_LENGTH)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [30]:
    # Initialize models
    VOCAB_SIZE = TOKENIZER.vocab_size
    EMBED_DIM = 100
    NUM_FILTERS = 100
    FILTER_SIZES = [2, 3, 4]
    HIDDEN_DIM = 64
    NUM_LAYERS = 2
    NUM_LABELS = 3
    EPOCHS = 30

    models = [
        CNNModel(VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, NUM_LABELS).to(DEVICE),
        LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LABELS).to(DEVICE),
        BiLSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, NUM_LABELS).to(DEVICE),
        GRUModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS, NUM_LABELS).to(DEVICE)
    ]

In [31]:
    # Compute class weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.array([0, 1, 2]),
        y=train_df['label'].values
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

In [44]:
    # Train individual models
    for idx, model in enumerate(models):
        model_name = model.__class__.__name__
        print(f"\nTraining {model_name}...")
        optimizer = AdamW(model.parameters(), lr=2e-5)
        for epoch in range(EPOCHS):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            accuracy, precision, recall, f1, conf_matrix = evaluate_model([model], val_loader, is_ensemble=False)
            print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}")
            print(f"Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
            print(f"Confusion Matrix:\n{conf_matrix}")


Training CNNModel...
Epoch 1/30, Train Loss: 1.0120
Validation - Accuracy: 0.6447, Precision: 0.6960, Recall: 0.6447, F1: 0.6647
Confusion Matrix:
[[43805 10141 10924]
 [ 5661  5586  4780]
 [17394 14522 65711]]
Epoch 2/30, Train Loss: 0.9378
Validation - Accuracy: 0.6628, Precision: 0.7276, Recall: 0.6628, F1: 0.6861
Confusion Matrix:
[[46594 10424  7852]
 [ 5736  6381  3910]
 [16313 15961 65353]]
Epoch 3/30, Train Loss: 0.8985
Validation - Accuracy: 0.6817, Precision: 0.7433, Recall: 0.6817, F1: 0.7045
Confusion Matrix:
[[47170 10413  7287]
 [ 5437  6723  3867]
 [14581 15247 67799]]
Epoch 4/30, Train Loss: 0.8698
Validation - Accuracy: 0.6998, Precision: 0.7555, Recall: 0.6998, F1: 0.7213
Confusion Matrix:
[[47196 10360  7314]
 [ 5067  6967  3993]
 [12654 14205 70768]]
Epoch 5/30, Train Loss: 0.8469
Validation - Accuracy: 0.6955, Precision: 0.7708, Recall: 0.6955, F1: 0.7226
Confusion Matrix:
[[47710 11426  5734]
 [ 4839  7963  3225]
 [12259 16880 68488]]
Epoch 6/30, Train Loss: 0.82

In [45]:
    # Evaluate ensemble
    print("\nEvaluating Ensemble...")
    accuracy, precision, recall, f1, conf_matrix = evaluate_model(models, val_loader, is_ensemble=True)
    print(f"Ensemble - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")


Evaluating Ensemble...
Ensemble - Accuracy: 0.8290, Precision: 0.8729, Recall: 0.8290, F1: 0.8432
Confusion Matrix:
[[53564  8105  3201]
 [ 1694 12832  1501]
 [ 5775 10259 81593]]


In [46]:
    # Save models
    with open("ensemble_absa_models_V1.pkl", "wb") as f:
        pickle.dump(models, f)

In [66]:
    # Example prediction
    custom_text = "The software is very user-friendly and fast, but the customer support is terrible and it's overpriced."

    # Extract aspects
    aspects = cluster_aspects_for_text(custom_text, vectorizer, kmeans)
    print(f"Extracted Aspects: {aspects}")
    
    # Predict sentiments
    predicted_sentiments = predict_text_absa(custom_text, aspects, TOKENIZER, models)
    print(f"\nCustom Text: {custom_text}")
    print("Predicted Sentiments:")
    for aspect, sentiment in predicted_sentiments.items():
        print(f"Aspect: {aspect}, Sentiment: {sentiment}")

Extracting aspects: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 332.33it/s]


Extracted Aspects: ['usability']

Custom Text: The software is very user-friendly and fast, but the customer support is terrible and it's overpriced.
Predicted Sentiments:
Aspect: usability, Sentiment: Negative


In [48]:
    print("\nDataFrame head after aspect extraction and clustering:")
    display(data)


DataFrame head after aspect extraction and clustering:


Unnamed: 0.1,Unnamed: 0,overall,reviewTime,reviewText,cleaned_text,extracted_aspects,final_aspects
0,5,1.0,2011,"Strong backgroung, good read, quite up to date...",strong backgroung good read quite date takes h...,"[(quite date, usability), (holistic approach s...",[usability]
1,6,0.0,2010,If you live on Mars and never heard of the int...,live mars never heard internet good book,"[(internet good book, new_good)]",[new_good]
2,7,1.0,2010,i got this book on amazon and it ended up savi...,got book amazon ended saving lot money great c...,"[(book amazon, new_amazon), (lot money great c...","[new_amazon, new_great]"
3,8,1.0,2010,I was very happy with this purchase because th...,happy purchase shipment super fast thanks,"[(happy purchase shipment super fast thanks, p...",[usability]
4,9,1.0,2010,Recieved in a timely manner- book in great con...,recieved timely manner book great condition ma...,"[(timely manner book great condition, new_grea...",[new_great]
...,...,...,...,...,...,...,...
375081,457815,-1.0,2006,My son wanted this game and was so excited to ...,son wanted game excited buy money work three c...,"[(money, new_one), (money, new_money), (three ...","[usage_context, new_money, usability]"
375082,457879,1.0,2006,I bought this CD because I am an international...,bought cd international student us like much f...,"[(though still greatly pleased care concern, u...",[usability]
375083,457880,0.0,2002,I bought this program and like it. The gentle...,bought program like gentleman said greatly dis...,"[(program, device_requirements), (program, new...","[new_program, usability]"
375084,457881,-1.0,2002,I was very dissapointed with this product. It ...,dissapointed product much except nice interfac...,"[(dissapointed product, new_product), (nice in...","[usability, new_product]"
