In [1]:
import multiprocessing as mp
import pickle
import re
import threading
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

import contractions
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
from flask import Flask, render_template, request
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel
from werkzeug.serving import make_server


In [2]:
# Flask application object; the @app.route decorators in later cells register handlers on it.
app = Flask(__name__)

In [3]:
# Configuration
# spaCy pipeline with the "ner" and "lemmatizer" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
# Run on CUDA when torch reports it as available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER = RobertaTokenizer.from_pretrained("roberta-base")
VOCAB_SIZE = TOKENIZER.vocab_size  # ~50,265 for roberta-base

# Aspect Seed Keywords
# Used as a mapping by the cells below (.items() / .values() are called on it).
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted aspect_seeds.pkl.
with open('aspect_seeds.pkl', 'rb') as f:
    ASPECT_SEEDS = pickle.load(f)

In [4]:
# Pre-fitted objects passed to cluster_aspects_for_text by the /predict route.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
    
with open('kmeans_model.pkl', 'rb') as f:
    kmeans = pickle.load(f)

In [5]:
# Function to preprocess text
def preprocess_text(text):
    """Normalise raw text before aspect extraction.

    Non-string input yields an empty string. Otherwise contractions are
    expanded, the text is lower-cased, and three regex substitutions are
    applied in order: HTML tags removed, digit runs removed, runs of
    non-word characters collapsed to a single space. The result is stripped
    of leading/trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    normalized = contractions.fix(text).lower()

    # (pattern, replacement) pairs; order matters because tags contain
    # non-word characters that the last rule would otherwise break apart.
    substitutions = (
        (r'<.*?>', ''),
        (r'\d+', ''),
        (r'\W+', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized.strip()

In [6]:
# Dynamic Aspect Discovery
def discover_new_aspects(texts, top_n=20, min_count=10):
    """Find frequent noun-phrase n-grams that are not already aspect seeds.

    Args:
        texts: An iterable of strings, or a single string (treated as one
            document).
        top_n: Maximum number of (term, count) pairs to return.
        min_count: A term is kept only when its count is strictly greater
            than this value.

    Returns:
        A list of (term, count) tuples sorted by descending count; empty when
        no noun phrases or no countable tokens are found.
    """
    # nlp.pipe iterates its argument, so a bare string would be processed one
    # character at a time; wrap it so it is handled as a single document.
    if isinstance(texts, str):
        texts = [texts]

    noun_phrases = []
    for doc in nlp.pipe(texts, batch_size=1000):
        noun_phrases.extend(chunk.text.lower() for chunk in doc.noun_chunks)

    if not noun_phrases:
        return []

    vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 3))
    try:
        X = vectorizer.fit_transform(noun_phrases)
    except ValueError:
        # CountVectorizer raises when no phrase yields a token (empty vocabulary).
        return []
    terms = vectorizer.get_feature_names_out()
    counts = np.asarray(X.sum(axis=0)).ravel()

    existing_terms = set().union(*ASPECT_SEEDS.values())
    new_terms = [
        (term, count)
        for term, count in zip(terms, counts)
        if term not in existing_terms and count > min_count
    ]

    return sorted(new_terms, key=lambda pair: pair[1], reverse=True)[:top_n]

In [7]:
# Define Models
class CNNModel(nn.Module):
    """Text classifier: embedding -> parallel convolutions -> max-over-time -> linear.

    One Conv2d is created per entry in ``filter_sizes``; each kernel spans the
    full embedding width. Attribute names are kept stable because serialized
    models refer to them.
    """

    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        conv_layers = [
            nn.Conv2d(1, num_filters, (height, embed_dim)) for height in filter_sizes
        ]
        self.convs = nn.ModuleList(conv_layers)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_labels)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for ``input_ids``; ``attention_mask`` is accepted but unused."""
        # Add a channel axis so Conv2d sees the embedded sequence as a one-channel image.
        embedded = self.embedding(input_ids).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The kernel covers the whole embedding width, so the last axis has size 1.
            feature_map = torch.relu(conv(embedded)).squeeze(3)
            # Max over every remaining sequence position.
            pooled.append(torch.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)

class LSTMModel(nn.Module):
    """Text classifier: embedding -> LSTM -> linear layer on the final hidden state."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for ``input_ids``; ``attention_mask`` is accepted but unused."""
        embedded = self.embedding(input_ids)
        # Only the final hidden state is needed; per-step outputs and the cell state are dropped.
        _, (final_hidden, _) = self.lstm(embedded)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

class BiLSTMModel(nn.Module):
    """Text classifier: embedding -> 2-layer bidirectional LSTM -> batch norm -> linear."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
            num_layers=2,
            dropout=0.3,
        )
        # Both directions are concatenated, hence the doubled feature width.
        self.bn = nn.BatchNorm1d(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for ``input_ids``; ``attention_mask`` is accepted but unused."""
        embedded = self.embedding(input_ids)
        _, (final_hidden, _) = self.bilstm(embedded)
        # The last two entries of the final hidden state are joined along the feature axis.
        second_last, last = final_hidden[-2], final_hidden[-1]
        joined = torch.cat((second_last, last), dim=1)
        normalized = self.bn(joined)
        return self.fc(normalized)

class GRUModel(nn.Module):
    """Text classifier: embedding -> bidirectional GRU -> mean over time -> linear."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_labels, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(
            embed_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Linear(hidden_dim * 2, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits for ``input_ids``.

        When ``attention_mask`` is given, its row sums are used as sequence
        lengths and the batch is packed before the GRU. Without a mask, every
        position is fed through the GRU.
        """
        embedded = self.embedding(input_ids)

        if attention_mask is None:
            gru_out, _ = self.gru(embedded)
        else:
            # pack_padded_sequence requires the lengths on the CPU.
            seq_lengths = attention_mask.sum(dim=1).cpu()
            packed_in = nn.utils.rnn.pack_padded_sequence(
                embedded, seq_lengths, batch_first=True, enforce_sorted=False
            )
            packed_out, _ = self.gru(packed_in)
            gru_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Plain mean over the time axis of the (re-padded) output.
        pooled = torch.mean(gru_out, dim=1)
        return self.fc(pooled)

In [8]:
# Enhanced Aspect Extraction with Parallel Processing
def extract_aspect_terms_cpu(texts, batch_size=32):
    """Tag noun chunks and verbs in each text with the seed aspects they match.

    A candidate (lower-cased noun chunk or verb token) matches an aspect when
    any of that aspect's seed keywords occurs in it as a substring. Noun-chunk
    matches come first, then verb matches.

    Returns:
        One list per input text containing (term, aspect) tuples, or
        [("general", "general")] when nothing matched.
    """

    def matching_aspects(candidate):
        # One (candidate, aspect) pair per aspect with at least one keyword hit.
        return [
            (candidate, aspect)
            for aspect, keywords in ASPECT_SEEDS.items()
            if any(keyword in candidate for keyword in keywords)
        ]

    docs = tqdm(
        nlp.pipe(texts, batch_size=batch_size),
        total=len(texts),
        desc="Extracting aspects",
    )

    per_text = []
    for doc in docs:
        found = []

        for chunk in doc.noun_chunks:
            found.extend(matching_aspects(chunk.text.lower()))

        for token in doc:
            if token.pos_ != "VERB":
                continue
            found.extend(matching_aspects(token.text.lower()))

        per_text.append(found or [("general", "general")])

    return per_text

In [9]:
# Aspect Clustering for Single Text
def cluster_aspects_for_text(text, vectorizer=None, kmeans=None):
    """Map the aspect terms found in one text to named aspect clusters.

    Args:
        text: Raw input text; it is run through preprocess_text first.
        vectorizer: Optional fitted vectorizer exposing ``transform``. When
            omitted, a TfidfVectorizer is fitted on the extracted terms.
        kmeans: Optional fitted clusterer exposing ``predict`` and
            ``n_clusters``. When omitted, a MiniBatchKMeans is fitted on the
            extracted terms.

    Returns:
        A list of unique aspect names in first-seen order, or ['general']
        when no aspect term was extracted.
    """
    cleaned_text = preprocess_text(text)
    extracted = extract_aspect_terms_cpu([cleaned_text])[0]

    # (term, clusterable) pairs: tuple items are clustered, a bare 'general'
    # marker is carried through unchanged; anything else is ignored.
    candidates = []
    for item in extracted:
        if isinstance(item, tuple) and len(item) == 2 and item[0] != 'unknown':
            candidates.append((item[0], True))
        elif item == 'general':
            candidates.append(('general', False))
    all_terms = [term for term, _ in candidates]

    if not all_terms or all(term == 'general' for term in all_terms):
        return ['general']

    # Initialize or use provided vectorizer
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(all_terms)
    else:
        tfidf_matrix = vectorizer.transform(all_terms)

    # Use the number of aspects in ASPECT_SEEDS as maximum clusters
    max_aspects = len(ASPECT_SEEDS)

    # Initialize or use provided KMeans
    if kmeans is None:
        n_clusters = min(max_aspects, len(set(all_terms)))
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(tfidf_matrix)
    else:
        clusters = kmeans.predict(tfidf_matrix)
        n_clusters = min(kmeans.n_clusters, max_aspects)
    # Plain ints keep the dictionary lookups below independent of the array dtype.
    clusters = [int(cluster_id) for cluster_id in clusters]

    # Map terms to clusters
    term_to_cluster = defaultdict(list)
    for term, cluster_id in zip(all_terms, clusters):
        term_to_cluster[cluster_id].append(term)

    # Name every cluster id below n_clusters after the best-matching seed aspect.
    aspect_names = {}
    for cluster_id in range(n_clusters):
        cluster_terms = term_to_cluster.get(cluster_id, [])
        unique_terms = set(cluster_terms)
        max_overlap = 0
        aspect_name = f"aspect_{cluster_id}"

        # Exact matches: the aspect sharing the most terms with its seed list wins.
        for aspect, seeds in ASPECT_SEEDS.items():
            overlap = len(unique_terms.intersection(seeds))
            if overlap > max_overlap:
                max_overlap = overlap
                aspect_name = aspect

        # No exact match: fall back to the first aspect with a seed contained in a term.
        if max_overlap == 0:
            for aspect, seeds in ASPECT_SEEDS.items():
                if any(seed in term for term in cluster_terms for seed in seeds):
                    aspect_name = aspect
                    break

        aspect_names[cluster_id] = aspect_name

    # Reuse the cluster assignments computed above instead of predicting each
    # term a second time; ids at or beyond n_clusters have no name.
    final_aspects = []
    for (_, clusterable), cluster_id in zip(candidates, clusters):
        if clusterable and cluster_id < n_clusters:
            final_aspects.append(aspect_names[cluster_id])
        else:
            final_aspects.append('general')

    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return list(dict.fromkeys(final_aspects)) if final_aspects else ['general']

In [10]:
# Load models
model_paths = ["ensemble_absa_models_V1.pkl"]
models = []
for path in model_paths:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted model files.
    with open(path, "rb") as f:
        loaded = pickle.load(f)
    # A pickle may hold one model or a list of models; flatten into a single list.
    models.extend(loaded if isinstance(loaded, list) else [loaded])

# Inputs are moved to DEVICE before inference, so the models have to live on
# the same device; eval mode is set once here as well.
models = [model.to(DEVICE).eval() for model in models]
        
# Ensemble Voting Function
def ensemble_predict(models, input_ids, attention_mask=None, T=2.0):
    """Predict class indices by summing temperature-scaled, normalized logits.

    Args:
        models: Non-empty sequence of models; the first must expose
            ``fc.out_features`` and each is called as
            ``model(input_ids, attention_mask)``.
        input_ids: Batch of token ids; its first dimension is the batch size.
        attention_mask: Optional mask forwarded to every model.
        T: Temperature that the normalized logits are divided by.

    Returns:
        A NumPy array with one predicted class index per batch row.
    """
    num_classes = models[0].fc.out_features
    votes = np.zeros((input_ids.shape[0], num_classes))
    with torch.no_grad():
        for model in models:
            model.eval()
            logits = model(input_ids, attention_mask)
            # Functional layer norm (no affine parameters) runs on whatever
            # device the logits are on, so it cannot mismatch the model.
            scaled = torch.nn.functional.layer_norm(logits, (num_classes,)) / T
            votes += scaled.cpu().numpy()
    # Softmax is monotonic, so the argmax of the summed votes is the prediction.
    return np.argmax(votes, axis=1)
    
# Predict a single text using the ensemble
def predict_text_absa(text, aspects, tokenizer, models, max_length=64, temperature=None):
    """Predict the sentiment of ``text`` for each aspect with the model ensemble.

    Args:
        text: The text to classify.
        aspects: Iterable of aspect names; each is prepended to ``text``.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' entries that support ``.to(device)``.
        models: Models forwarded to ensemble_predict.
        max_length: Padding/truncation length passed to the tokenizer.
        temperature: Ensemble temperature. When None, a module-level
            ``TEMPERATURE`` is used if one is defined, otherwise 2.0
            (the default of ensemble_predict).

    Returns:
        ``{}`` when ``aspects`` is empty. Otherwise a dict whose "Aspect" and
        "Sentiment" keys describe the last aspect processed, plus a "Results"
        key holding one ``{"Aspect", "Sentiment"}`` dict per aspect in input
        order, so earlier aspects are not lost.
    """
    if temperature is None:
        # The notebook never defines TEMPERATURE; look it up defensively
        # instead of raising NameError on the first request.
        temperature = globals().get("TEMPERATURE", 2.0)

    label_names = {0: "Negative", 1: "Neutral", 2: "Positive"}
    per_aspect = []
    for aspect in aspects:
        input_text = f"{aspect} {text}"
        encoding = tokenizer(
            input_text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(DEVICE)
        attention_mask = encoding['attention_mask'].to(DEVICE)
        pred = int(ensemble_predict(models, input_ids, attention_mask, T=temperature)[0])
        per_aspect.append({"Aspect": aspect, "Sentiment": label_names[pred]})

    if not per_aspect:
        return {}

    # Top-level keys keep their previous meaning (the last aspect processed).
    results = dict(per_aspect[-1])
    results["Results"] = per_aspect
    return results

In [11]:
# Global variable to control the Flask server thread
# run_flask_app keeps serving requests for as long as this flag is truthy.
keep_running = True

In [12]:
# Route for the home page
@app.route('/')
def home():
    """Render the ``index.html`` template for the root URL."""
    return render_template('index.html')

# Route for prediction
@app.route('/predict', methods=['POST'])
def predict():
    """Handle a submitted text: discover terms, cluster aspects, predict sentiment.

    Reads the ``text`` form field, optionally extends ASPECT_SEEDS with newly
    discovered frequent terms, derives aspects for the text, and renders
    ``result.html`` with the text and the ensemble's prediction.
    """
    # The route only accepts POST, so no method check is needed.
    text = request.form['text']

    # Discover new aspects. The text is wrapped in a list so it is processed
    # as one document rather than character by character.
    try:
        new_terms = discover_new_aspects([text])
    except Exception:
        # Discovery only enriches the seed list; a failure must not block the prediction.
        app.logger.exception("Aspect discovery failed; continuing with existing seeds")
        new_terms = []
    print("Discovered new terms:", new_terms)

    # Incorporate new terms into ASPECT_SEEDS (only frequently occurring ones).
    known_terms = set().union(*ASPECT_SEEDS.values())
    for term, count in new_terms:
        if count > 50 and term not in known_terms:
            ASPECT_SEEDS.setdefault("new_" + term, []).append(term)
            known_terms.add(term)

    aspects = cluster_aspects_for_text(text, vectorizer, kmeans)
    predicted_sentiment = predict_text_absa(text, aspects, TOKENIZER, models)
    return render_template('result.html', text=text, predicted_sentiment=predicted_sentiment)

# Function to run Flask app in a separate thread
def run_flask_app():
    """Serve the Flask app on 127.0.0.1:5000 until ``keep_running`` becomes falsy.

    The server handles one request per loop iteration. A short timeout makes
    ``handle_request`` return periodically, so the flag is re-checked even
    when no request arrives. The listening socket is closed on exit.
    """
    server = make_server('127.0.0.1', 5000, app)
    # Without a timeout handle_request() blocks until a request arrives, and
    # clearing keep_running would have no effect until then.
    server.timeout = 0.5
    try:
        while keep_running:
            server.handle_request()
    finally:
        # Release the port so the cell can be re-run in the same kernel.
        server.server_close()

# Check if the script is run as main module
if __name__ == '__main__':
    # Create and start a new thread for Flask app
    # Serving happens in run_flask_app on that thread, so this cell returns right after start().
    flask_thread = threading.Thread(target=run_flask_app)
    flask_thread.start()
    
    # Provide a message indicating Flask is running
    print("Flask application running. Navigate to http://127.0.0.1:5000/ in your web browser.")
    
    # Set the flag to stop the Flask server thread
    # (left disabled; run_flask_app re-checks the flag each time handle_request() returns)
    #keep_running = False

Flask application running. Navigate to http://127.0.0.1:5000/ in your web browser.
