# DEMO - TFIDF - Logistic Regression - Neural Network

In this notebook we will:
1. Initialize a demo
2. Add a dropdown to select which model from the "models" folder to test
3. Add a text box for inputs
4. Add a "Predict" button that shows the prediction for the input based on the model selected
5. Display the n-grams that contributed most to the prediction

In [None]:
from sklearn.model_selection import GridSearchCV
from torch import nn
import pygame
import os
import torch
import joblib
import torch.nn.functional as F
from pygame import Rect
from sklearn.base import BaseEstimator

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# English stop words, held in a set for constant-time membership tests.
STOP = set(stopwords.words('english'))
# Shared WordNet lemmatizer instance reused by every clean_text call.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize raw review text into a space-joined string of lemmas.

    Steps, in order: lowercase; blank out HTML tags, URLs and every
    character that is not a-z or whitespace; collapse whitespace runs;
    drop stop words; lemmatize the remaining tokens.
    """
    cleaned = text.lower()
    # Each pattern is replaced by a single space, in this exact order.
    for pattern in (r'<[^>]+>', r'http\S+|www\.\S+', r'[^a-z\s]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    lemmas = []
    for word in cleaned.split():
        if word in STOP:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Fitted TF-IDF vectorizer; predict() uses it to featurize text for the torch model
vectorizer = joblib.load("../models/sp_tfidf_vectorizer.joblib")

# Pygame window, font and colour palette
pygame.init()
FONT = pygame.font.SysFont("Arial", 24)
WHITE = (255, 255, 255)
GRAY = (200, 200, 200)
BLACK = (0, 0, 0)
BLUE = (100, 100, 255)
WIDTH = 800
HEIGHT = 400
screen = pygame.display.set_mode((WIDTH, HEIGHT))
pygame.display.set_caption("Model Prediction Demo")

# Clock ticked once per frame by the main loop
clock = pygame.time.Clock()

# UI state
input_text = ""        # text typed by the user so far
prediction_text = ""   # predicted label as a string ("" until the first prediction)
prediction_prob = ""   # formatted class probabilities shown next to "Prediction:"
dropdown_open = False  # whether the model list is currently expanded
selected_model = None  # file name of the chosen model, or None

# Discover selectable models.
# The TF-IDF vectorizer lives in the same folder but is not a classifier, so
# it is excluded; the listing is sorted because os.listdir order is arbitrary.
_NON_MODEL_FILES = {"sp_tfidf_vectorizer.joblib"}
model_files = sorted(
    f for f in os.listdir("../models")
    if f.endswith((".joblib", ".pth")) and f not in _NON_MODEL_FILES
)

# Widget geometry
input_box = Rect(50, 50, 400, 30)
predict_button = Rect(500, 50, 120, 30)
dropdown_rect = Rect(50, 100, 300, 30)
# Options are stacked directly BELOW the dropdown header. Previously the first
# option occupied the header's own rectangle, so a click on it was always
# consumed by the header toggle and the first model could never be selected.
model_buttons = [
    pygame.Rect(dropdown_rect.x, dropdown_rect.bottom + i * 30, dropdown_rect.width, 30)
    for i in range(len(model_files))
]

GREEN = (100, 200, 100)
RED = (200, 100, 100)
top_ngrams = []  # store feature contribution list

# Model loader
def load_model(path):
    """Load a trained model from *path*.

    ``.joblib`` files are unpickled with joblib and returned as-is.
    ``.pth`` files are treated as a state dict for the two-layer MLP defined
    below; the network is rebuilt, the weights are loaded on CPU and the
    model is returned in eval mode. Its forward pass returns softmax
    probabilities over two classes.

    Args:
        path: Path to a ``.joblib`` or ``.pth`` file.

    Returns:
        The unpickled joblib object, or the reconstructed ``nn.Module``.

    Raises:
        ValueError: If *path* has neither a ``.joblib`` nor a ``.pth`` suffix.
    """
    if path.endswith(".joblib"):
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        return joblib.load(path)
    if not path.endswith(".pth"):
        raise ValueError(f"Unsupported model file (expected .joblib or .pth): {path}")

    class MLP(nn.Module):
        def __init__(self, in_dim, hidden_dim, dropout):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 2)
            )

        def forward(self, x):
            # Explicit dim: normalize over the class axis (implicit dim is deprecated).
            return nn.functional.softmax(self.net(x), dim=-1)

    # Defaults used when the layer sizes cannot be read from the checkpoint.
    MAX_FEATURES = 20000
    HIDDEN_DIM = 512
    DROPOUT = 0.5

    state_dict = torch.load(path, map_location="cpu")
    # The first Linear layer's weight has shape (hidden_dim, in_dim), so the
    # architecture can be recovered from the checkpoint itself.
    first_weight = state_dict.get("net.0.weight") if isinstance(state_dict, dict) else None
    if first_weight is not None:
        hidden_dim, in_dim = first_weight.shape
    else:
        hidden_dim, in_dim = HIDDEN_DIM, MAX_FEATURES

    model = MLP(in_dim, hidden_dim, DROPOUT)
    model.load_state_dict(state_dict)
    model.eval()  # disable dropout for inference
    return model

# Prediction logic
def predict(model, text):
    """Classify *text* with *model* and format the class probabilities.

    scikit-learn estimators (including GridSearchCV wrappers) receive the
    cleaned text directly. Any other model is called as a torch module on the
    dense TF-IDF vector produced by the module-level ``vectorizer``.

    Args:
        model: Object returned by ``load_model``.
        text: Raw input text; it is passed through ``clean_text`` first.

    Returns:
        A ``(label, probabilities)`` tuple of strings, where probabilities
        reads ``'Fake: 0.1234 Genuine: 0.8766'`` (index 0 is shown as Fake,
        index 1 as Genuine).
    """
    text_clean = clean_text(text)
    if isinstance(model, (GridSearchCV, BaseEstimator)):  # handles joblib grid model
        pred = model.predict([text_clean])[0]
        probs = model.predict_proba([text_clean])[0]
        pred_prob = f'Fake: {probs[0]:.4f} Genuine: {probs[1]:.4f}'
    else:
        # Torch model still needs TF-IDF
        vec = vectorizer.transform([text_clean])
        x = torch.tensor(vec.toarray(), dtype=torch.float32)
        with torch.no_grad():
            # Single forward pass; reuse it for both the label and the probabilities.
            probs = model(x)[0]
        pred = torch.argmax(probs).item()
        pred_prob = f'Fake: {probs[0].item():.4f} Genuine: {probs[1].item():.4f}'
    return str(pred), str(pred_prob)

def get_top_ngrams_from_pipeline(model, text, n=5):
    """Return the *n* n-grams of *text* with the largest absolute contribution.

    The contribution of a feature is its TF-IDF value in *text* multiplied by
    the first row of the classifier's ``coef_``.

    Args:
        model: A pipeline exposing ``named_steps`` with ``"tfidf"`` and
            ``"clf"`` entries, or a search object whose ``best_estimator_``
            is such a pipeline.
        text: Already-cleaned input text.
        n: Maximum number of n-grams to return.

    Returns:
        A list of ``(ngram, score)`` pairs ordered by decreasing ``abs(score)``.
        An empty list is returned when *model* is not a tfidf+clf pipeline or
        the classifier exposes no ``coef_`` (for example a torch module), so
        callers can pass any loaded model without crashing.
    """
    pipeline = getattr(model, "best_estimator_", model)

    steps = getattr(pipeline, "named_steps", None)
    if not steps or "tfidf" not in steps or "clf" not in steps:
        return []

    tfidf = steps["tfidf"]
    clf = steps["clf"]
    if not hasattr(clf, "coef_"):
        return []

    # Get feature names and weights
    feature_names = tfidf.get_feature_names_out()
    coef = clf.coef_[0]  # assuming binary classification

    # Transform the input to a dense feature vector
    vec_data = tfidf.transform([text]).toarray()[0]

    # (index, tfidf value * weight) for the features present in the text only
    weighted_scores = [(i, vec_data[i] * coef[i]) for i in vec_data.nonzero()[0]]

    # Sort by absolute importance
    top = sorted(weighted_scores, key=lambda pair: abs(pair[1]), reverse=True)[:n]
    return [(feature_names[i], score) for i, score in top]

# Main loop: handle input events, then redraw the whole frame, 30 times a second.
running = True
while running:
    screen.fill(WHITE)
    events = pygame.event.get()

    for event in events:
        if event.type == pygame.QUIT:
            running = False

        elif event.type == pygame.MOUSEBUTTONDOWN:
            if dropdown_rect.collidepoint(event.pos):
                dropdown_open = not dropdown_open
            elif predict_button.collidepoint(event.pos) and selected_model:
                model = load_model(os.path.join("../models", selected_model))

                prediction_text, prediction_prob = predict(model, input_text)
                # Only sklearn pipelines (or searches wrapping one) expose the
                # tfidf/clf steps needed for n-gram contributions; torch models
                # have none, so clear any list left over from a previous model.
                if hasattr(model, "named_steps") or hasattr(model, "best_estimator_"):
                    top_ngrams = get_top_ngrams_from_pipeline(model, clean_text(input_text))
                else:
                    top_ngrams = []
            elif dropdown_open:
                for i, rect in enumerate(model_buttons):
                    if rect.collidepoint(event.pos):
                        selected_model = model_files[i]
                        dropdown_open = False
                        break

        elif event.type == pygame.KEYDOWN:
            if event.key == pygame.K_BACKSPACE:
                input_text = input_text[:-1]
            elif event.unicode.isprintable():
                # Skip control characters (Enter, Tab, Esc, ...) so they do not
                # end up in the text box.
                input_text += event.unicode

    # Draw input box
    pygame.draw.rect(screen, GRAY, input_box, 2)
    input_surface = FONT.render(input_text, True, BLACK)
    screen.blit(input_surface, (input_box.x + 5, input_box.y + 5))

    # Draw Predict button
    pygame.draw.rect(screen, BLUE, predict_button)
    screen.blit(FONT.render("Predict", True, WHITE), (predict_button.x + 10, predict_button.y + 2))

    # Draw dropdown header with the current selection (or a placeholder)
    pygame.draw.rect(screen, GRAY, dropdown_rect)
    selected = selected_model or "Select a model"
    screen.blit(FONT.render(selected, True, BLACK), (dropdown_rect.x + 5, dropdown_rect.y + 5))

    # Draw the expanded option list
    if dropdown_open:
        for i, rect in enumerate(model_buttons):
            pygame.draw.rect(screen, GRAY, rect)
            screen.blit(FONT.render(model_files[i], True, BLACK), (rect.x + 5, rect.y + 5))

    # Show prediction probabilities
    screen.blit(FONT.render("Prediction: " + prediction_prob, True, BLACK), (50, 300))

    # Verdict banner along the bottom edge: label "1" is genuine, anything else fake
    if prediction_text != "":
        label_color = GREEN if prediction_text == "1" else RED
        label_text = "This is a genuine review!" if prediction_text == "1" else "This is a fake review!"
        label_rect = pygame.Rect(0, HEIGHT - 50, WIDTH, 50)
        pygame.draw.rect(screen, label_color, label_rect)
        label_surface = FONT.render(label_text, True, WHITE)
        screen.blit(label_surface, (WIDTH // 2 - label_surface.get_width() // 2, HEIGHT - 40))

    # Per-n-gram contributions: green for positive weight, red otherwise
    if top_ngrams:
        screen.blit(FONT.render("Top N-grams:", True, BLACK), (WIDTH - 200, 120))
        for i, (phrase, weight) in enumerate(top_ngrams):
            color = GREEN if weight > 0 else RED
            text_surface = FONT.render(f"{phrase} ({weight:+.3f})", True, color)
            screen.blit(text_surface, (WIDTH - 200, 150 + i * 25))

    pygame.display.flip()
    clock.tick(30)

pygame.quit()
