In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

# Read the raw reply-classification dataset from disk.
df = pd.read_csv("F:\\Programming\\Artificial intelligence projects\\Svana AI\\reply_classification_dataset.csv")

# Optional first look at the data (kept disabled):
#   print(df.head())
#   print(df.info())
#   print(df['label'].value_counts())

# Normalise the label column step by step: lowercase, trim surrounding
# whitespace, then drop every character that is not a-z (commas, spaces, ...).
df['label'] = (
    df['label']
    .str.lower()
    .str.strip()
    .str.replace(r'[^a-z]', '', regex=True)
)

# Class distribution after normalisation.
print(df['label'].value_counts())


# Reading CSV and cleaning the data.

In [None]:
import pandas as pd
import re

DATA_PATH = "F:\\Programming\\Artificial intelligence projects\\Svana AI\\reply_classification_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Optional inspection helpers (disabled):
#   print(df.head())
#   print(df.info())
#   print(df['label'].value_counts())

# Label normalisation: lowercase -> trim -> keep only the letters a-z.
df['label'] = (
    df['label']
    .str.lower()
    .str.strip()
    .str.replace(r'[^a-z]', '', regex=True)
)

# Class distribution after normalisation.
print(df['label'].value_counts())
def clean_text(text):
    """Return a normalised copy of ``text`` for bag-of-words features.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``.
    Otherwise the text is lowercased, every character other than a-z or
    whitespace is removed, and whitespace runs collapse to single spaces
    with leading/trailing whitespace trimmed.
    """
    if not pd.isna(text):
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        return re.sub(r'\s+', ' ', letters_only).strip()
    return ""

# Add the normalised reply text as a new column.
df['clean_text'] = df['reply'].map(clean_text)
#print(df[['reply', 'clean_text']].head())

# Train/test split: hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label']

# stratify=y keeps the label proportions the same in both partitions;
# random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Training the baseline model using sklearn.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# TF-IDF features: the vocabulary (capped at 5000 terms) is learned on the
# training texts only and then reused unchanged for the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# Fit the logistic-regression baseline (fit() returns the estimator itself).
log_reg = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score on the held-out split.
y_pred = log_reg.predict(X_test_vec)
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("F1 Score:", f1_score(y_test, y_pred, average='weighted')),
):
    print(metric_name, metric_value)


# Fine-tuning with a Hugging Face model.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import re

DATA_PATH = "F:\\Programming\\Artificial intelligence projects\\Svana AI\\reply_classification_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Label normalisation: lowercase, trim, then strip everything except a-z.
df['label'] = (
    df['label']
    .str.lower()
    .str.strip()
    .str.replace(r'[^a-z]', '', regex=True)
)

# Clean the text (lowercase, remove special chars)
def clean_text(text):
    """Lowercase ``text``, keep only a-z and whitespace, and squeeze spaces.

    Returns ``""`` when ``pd.isna(text)`` is true; otherwise returns the
    cleaned string with leading/trailing whitespace removed.
    """
    if pd.isna(text):
        return ""
    lowered = text.lower()
    letters_and_spaces = re.sub(r'[^a-z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

df['clean_text'] = df['reply'].apply(clean_text)

# Map labels to integer ids. The ids follow the same convention as the other
# cells of this notebook (positive=0, neutral=1, negative=2) so that a model
# trained on these datasets decodes correctly with the shared inverse mapping.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# Remove duplicates, attach the integer id and discard rows whose label is not
# in label_map. Building the frame in one chain (instead of assigning a column
# onto the drop_duplicates() result) avoids pandas' SettingWithCopyWarning, and
# dropping unmapped rows before the int cast keeps label_id from turning into
# a float/NaN column.
df_unique = (
    df.drop_duplicates(subset=['clean_text', 'label'])
      .assign(label_id=lambda frame: frame['label'].map(label_map))
      .dropna(subset=['label_id'])
      .astype({'label_id': int})
)

# Split into train and test (80% train, 20% test), stratified on the label.
train_df, test_df = train_test_split(df_unique, test_size=0.2, random_state=42, stratify=df_unique['label'])

# Convert to Hugging Face Dataset format with the column names "text"/"label".
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(
    train_df[['clean_text', 'label_id']].rename(columns={'clean_text': 'text', 'label_id': 'label'}),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    test_df[['clean_text', 'label_id']].rename(columns={'clean_text': 'text', 'label_id': 'label'}),
    preserve_index=False,
)

In [None]:
from transformers import DistilBertTokenizerFast

# Fast DistilBERT tokenizer matching the uncased base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch``.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=128, truncation=True, padding="max_length")


# Apply the tokenizer batch-wise to both splits.
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)


In [None]:
from transformers import DistilBertForSequenceClassification

# Three output classes: neutral, negative, positive.
num_labels = 3

# Pretrained DistilBERT encoder with a classification head sized to num_labels.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)


# Comparison between the fine-tuned model and Logistic Regression.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import re

# Load dataset
DATA_PATH = "F:\\Programming\\Artificial intelligence projects\\Svana AI\\reply_classification_dataset.csv"
data = pd.read_csv(DATA_PATH)

# Normalize labels: lowercase, trim, keep only a-z.
data['label'] = data['label'].str.lower().str.strip().str.replace(r'[^a-z]', '', regex=True)

# Check label distribution
print(data['label'].value_counts())

# Map labels to numerical values (labels missing from the map become NaN).
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
data['label'] = data['label'].map(label_map)

# Handle missing values. Only the two columns this cell actually uses are
# checked, so a gap in an unrelated column does not discard a usable row.
# If any label was unmapped, .map() leaves the column as float; cast back to
# int so the classifier head receives integer class ids (float labels make
# the transformers sequence-classification head pick a multi-label loss).
data = data.dropna(subset=['label', 'reply']).astype({'label': int})

# Clean text
def clean_text(text):
    """Normalise one reply for the TF-IDF / tokenizer pipelines.

    NA input (per ``pd.isna``) yields ``""``. Any other input is lowercased,
    stripped of every character that is not a-z or whitespace, and has its
    whitespace runs collapsed to one space and trimmed at both ends.
    """
    if not pd.isna(text):
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        return re.sub(r'\s+', ' ', letters_only).strip()
    return ""

data['clean_text'] = data['reply'].apply(clean_text)

# Train-test split. Stratify on the label, as the other splits in this
# notebook do, so both partitions keep the same class proportions and the
# comparison below is not skewed by an unlucky shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_text'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Baseline Model: Logistic Regression with TF-IDF
# (vocabulary is fitted on the training texts only).
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
lr_preds = lr_model.predict(X_test_tfidf)

# Evaluate baseline
lr_accuracy = accuracy_score(y_test, lr_preds)
lr_f1 = f1_score(y_test, lr_preds, average='weighted')
print(f"Logistic Regression - Accuracy: {lr_accuracy:.4f}, F1 Score: {lr_f1:.4f}")

# Transformer model: pretrained DistilBERT tokenizer plus a 3-class head.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)


def tokenize_data(texts):
    """Tokenize ``texts`` (an object exposing ``.tolist()``).

    Sequences are truncated at 128 tokens and padded via ``padding=True``.
    """
    as_list = texts.tolist()
    return tokenizer(as_list, max_length=128, padding=True, truncation=True)


# Encode both partitions.
train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)

# Create PyTorch datasets
class EmailDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is an indexable sequence of class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an integer label tensor: a label column that went through a
        # NaN-producing .map() arrives here as floats (e.g. 2.0), and a float
        # label tensor is not a valid class index for classification losses.
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        """Number of examples (length of ``labels``)."""
        return len(self.labels)

# Wrap the encodings and the plain-list labels as PyTorch datasets.
train_dataset = EmailDataset(train_encodings, y_train.tolist())
test_dataset = EmailDataset(test_encodings, y_test.tolist())

# Hyper-parameters for the fine-tuning run; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy='epoch',  # Changed from evaluation_strategy to eval_strategy
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune, then take the highest-scoring class per test example.
trainer.train()
transformer_preds = np.argmax(trainer.predict(test_dataset).predictions, axis=-1)

# Score the transformer with the same metrics as the baseline.
transformer_accuracy = accuracy_score(y_test, transformer_preds)
transformer_f1 = f1_score(y_test, transformer_preds, average='weighted')
print(f"DistilBERT - Accuracy: {transformer_accuracy:.4f}, F1 Score: {transformer_f1:.4f}")

# Side-by-side summary of both models.
print("\nModel Comparison:")
print(f"Logistic Regression - Accuracy: {lr_accuracy:.4f}, F1 Score: {lr_f1:.4f}")
print(f"DistilBERT - Accuracy: {transformer_accuracy:.4f}, F1 Score: {transformer_f1:.4f}")
#print("Recommendation: Choose DistilBERT for production due to its ability to capture contextual nuances in short replies, despite higher computational cost. Logistic Regression is faster for prototyping but may miss subtle sentiment patterns.")

# Test Run of comparison

In [None]:
# ===============================
# Sentiment Classification Script
# ===============================

import pandas as pd
import numpy as np
import re
import torch
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# -------------------------------
# Load & preprocess dataset
# -------------------------------
DATA_PATH = "F:\\Programming\\Artificial intelligence projects\\Svana AI\\reply_classification_dataset.csv"
data = pd.read_csv(DATA_PATH)

# Label text -> integer class id.
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# Normalise the label text (lowercase, trim, a-z only) and convert it to
# its id in one chain; labels that are not in label_map come out as NaN.
data['label'] = (
    data['label']
    .str.lower()
    .str.strip()
    .str.replace(r'[^a-z]', '', regex=True)
    .map(label_map)
)

# Discard rows that lack a usable label or a reply.
data = data.dropna(subset=['label', 'reply'])

# Keep two versions of text: raw (for BERT), cleaned (for TF-IDF baseline)
def clean_text(text):
    """Normalise one reply for the TF-IDF baseline.

    Parameters
    ----------
    text : Any
        The raw reply. NA values (per ``pd.isna``) map to ``""``. Values that
        are not strings (e.g. a reply read back as a number) are converted
        with ``str()`` first, so they are cleaned instead of raising
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        Lowercased text containing only a-z and single spaces, trimmed.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)        # keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()    # normalize spaces
    return text

# Two text views of each reply: the trimmed original for the transformer
# tokenizer, and the letters-only version for TF-IDF.
data['raw_text'] = data['reply'].astype(str).str.strip()
data['clean_text'] = data['reply'].map(clean_text)

# -------------------------------
# Train-test split (stratified)
# -------------------------------
X_raw, X_clean, y = data['raw_text'], data['clean_text'], data['label']

# One call splits all three aligned series with the same row assignment.
(
    X_train_raw, X_test_raw,
    X_train_clean, X_test_clean,
    y_train, y_test,
) = train_test_split(X_raw, X_clean, y, stratify=y, random_state=42, test_size=0.2)

# -------------------------------
# Baseline: Logistic Regression
# -------------------------------
print("\n===== Baseline: Logistic Regression (TF-IDF) =====")

# Unigram + bigram TF-IDF; terms seen in fewer than 3 training documents are
# ignored and the vocabulary is capped at 10000 entries.
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,2), max_features=10000)
X_train_tfidf, X_test_tfidf = vectorizer.fit_transform(X_train_clean), vectorizer.transform(X_test_clean)

# class_weight="balanced" reweights classes inversely to their frequency.
lr_model = LogisticRegression(class_weight="balanced", max_iter=2000).fit(X_train_tfidf, y_train)
lr_preds = lr_model.predict(X_test_tfidf)

# target_names are listed in label-id order (0, 1, 2).
print(classification_report(y_test, lr_preds, target_names=['positive','neutral','negative']))

# -------------------------------
# Transformer: DistilBERT
# -------------------------------
print("\n===== Transformer: DistilBERT Fine-tuning =====")

# Pretrained tokenizer and a DistilBERT encoder with a 3-class head.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


def tokenize_texts(texts):
    """Tokenize ``texts`` (an object exposing ``.tolist()``).

    Sequences are truncated at 128 tokens and padded via ``padding=True``.
    """
    as_list = texts.tolist()
    return tokenizer(as_list, max_length=128, padding=True, truncation=True)


# The transformer is fed the raw (uncleaned) replies.
train_encodings = tokenize_texts(X_train_raw)
test_encodings = tokenize_texts(X_test_raw)

class EmailDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and a label Series.

    ``labels`` must provide ``reset_index``; its index is reset so that
    examples can be looked up by position 0..len-1.
    """

    def __init__(self, encodings, labels):
        self.labels = labels.reset_index(drop=True)
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the integer class id.
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

# Torch datasets for the two partitions.
train_dataset, test_dataset = EmailDataset(train_encodings, y_train), EmailDataset(test_encodings, y_test)

# Weighted loss (optional if class imbalance is large):
# each class id 0..2 gets weight total / (number of training rows with that id).
counts = Counter(y_train)
total = sum(counts.values())
weights = []
for class_id in range(3):
    weights.append(total / counts[class_id])
class_weights = torch.tensor(weights, dtype=torch.float)

from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Parameters
        ----------
        model : the model being trained.
        inputs : dict of batch tensors; must contain ``"labels"``.
        return_outputs : when true, return ``(loss, outputs)`` instead of ``loss``.
        num_items_in_batch : accepted because recent ``Trainer`` versions pass
            it as a keyword argument when calling ``compute_loss``; without
            it that call raises ``TypeError``. It is not used here.
        """
        labels = inputs.get("labels")
        # Run the forward pass without labels so the model does not compute
        # its own (unweighted) loss.
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits
        # Put the weights on the same device as the logits they are applied to.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Metrics function
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into a metrics dict.

    Predictions are the arg-max over the last logits axis. Precision, recall
    and F1 are support-weighted averages across classes; accuracy is plain
    accuracy. Keys: ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    logits, gold = eval_pred
    guessed = np.argmax(logits, axis=-1)
    weighted = precision_recall_fscore_support(gold, guessed, average="weighted")
    return {
        "accuracy": accuracy_score(gold, guessed),
        "f1": weighted[2],
        "precision": weighted[0],
        "recall": weighted[1],
    }

# Fine-tuning configuration. Evaluation and checkpointing both happen once
# per epoch (they must match for load_best_model_at_end), and the checkpoint
# with the best weighted F1 from compute_metrics is restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of this option; the earlier
    # comparison cell in this notebook was already switched to it from
    # `evaluation_strategy`, so use the same spelling here.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    num_train_epochs=3,        # try 3-4 first
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2
)

# Trainer with the class-weighted loss; the test split is used for the
# per-epoch evaluation.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Run fine-tuning.
trainer.train()

# Evaluate DistilBERT: predict on the test split and take the arg-max class
# for each row of the returned score matrix.
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(axis=1)
print(
    classification_report(
        y_test,
        pred_labels,
        target_names=['positive','neutral','negative'],
    )
)

# -------------------------------
# Save fine-tuned model
# -------------------------------
# joblib is imported here because this cell's import section does not include it.
import joblib

MODELS_DIR = "F:\\Programming\\Artificial intelligence projects\\Svana AI\\Project\\Models"

# Fine-tuned transformer + its tokenizer.
finetuned_path = MODELS_DIR + "\\distilbert-finetuned"
model.save_pretrained(finetuned_path)
tokenizer.save_pretrained(finetuned_path)

# Baseline model + its TF-IDF vectorizer. The FastAPI cell of this notebook
# loads exactly these two files (lr_model.joblib, tfidf_vectorizer.joblib)
# from the Models directory, and no other cell writes them.
joblib.dump(lr_model, MODELS_DIR + "\\lr_model.joblib")
joblib.dump(vectorizer, MODELS_DIR + "\\tfidf_vectorizer.joblib")

print("\nTraining complete. Models saved.")


In [None]:
def predict_lr(texts, vectorizer, model):
    """Classify raw ``texts`` with a fitted vectorizer and classifier.

    Each text is lowercased, stripped of everything except a-z/whitespace and
    space-normalised, then passed through ``vectorizer.transform`` and
    ``model.predict``. Predicted ids 0/1/2 are returned as
    ``'positive'``/``'neutral'``/``'negative'``.
    """
    id_to_name = {0:'positive', 1:'neutral', 2:'negative'}

    prepared = []
    for raw in texts:
        letters_only = re.sub(r'[^a-z\s]', '', raw.lower())
        prepared.append(re.sub(r'\s+', ' ', letters_only).strip())

    predicted_ids = model.predict(vectorizer.transform(prepared))
    return [id_to_name[class_id] for class_id in predicted_ids]

# Example test
# Three hand-written replies used as a smoke test for the predict helpers.
test_samples = [
    "I really love this product!",
    "It's okay, nothing special.",
    "This is terrible, I hate it.",
]

print("Logistic Regression Predictions:")
lr_sample_predictions = predict_lr(test_samples, vectorizer, lr_model)
print(lr_sample_predictions)


In [None]:
def predict_bert(texts, tokenizer, model):
    """Classify ``texts`` with a sequence-classification model.

    Parameters
    ----------
    texts : list of str
    tokenizer : callable returning a mapping of tensors when called with
        ``return_tensors="pt"``.
    model : model exposing ``eval()`` and returning an object with ``.logits``.

    Returns
    -------
    list of str
        ``'positive'``/``'neutral'``/``'negative'`` for predicted ids 0/1/2.
    """
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

    # The tokenizer output lives on the CPU, but a model that has been through
    # Trainer.train() may sit on an accelerator; feeding it CPU tensors raises
    # a device-mismatch error. Move the inputs to wherever the model is.
    device = getattr(model, "device", None)
    if device is not None:
        encodings = {name: tensor.to(device) for name, tensor in encodings.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)

    # tolist() yields plain Python ints regardless of the logits' device.
    preds = torch.argmax(outputs.logits, dim=1).tolist()
    label_map_inv = {0:'positive', 1:'neutral', 2:'negative'}
    return [label_map_inv[p] for p in preds]

# Smoke test: run the fine-tuned model on the hand-written samples.
print("DistilBERT Predictions:")
bert_sample_predictions = predict_bert(test_samples, tokenizer, model)
print(bert_sample_predictions)


# FastAPI.

In [None]:

from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import re
import numpy as np

# Locations of the persisted baseline classifier and its TF-IDF vectorizer.
MODEL_PATH = "f:\\Programming\\Artificial intelligence projects\\Svana AI\\Project\\Models\\lr_model.joblib"
VECTORIZER_PATH = "f:\\Programming\\Artificial intelligence projects\\Svana AI\\Project\\Models\\tfidf_vectorizer.joblib"

# NOTE(review): joblib files are pickles; only load artefacts from a trusted
# location, since unpickling can execute arbitrary code.
lr_model, vectorizer = (joblib.load(path) for path in (MODEL_PATH, VECTORIZER_PATH))

# Class id -> label name returned by the API.
label_map = dict(enumerate(("positive", "neutral", "negative")))

def clean_text(text):
	"""Lowercase ``text``, keep only a-z and whitespace, and squeeze spaces.

	Whitespace runs collapse to one space and the result is trimmed.
	"""
	letters_only = re.sub(r'[^a-z\s]', '', text.lower())
	return re.sub(r'\s+', ' ', letters_only).strip()

# Request body schema accepted by the /predict route: a JSON object with a
# single string field.
class TextRequest(BaseModel):
	# Raw text to classify; it is cleaned by the route before vectorisation.
	text: str

# ASGI application object that the route below is registered on.
app = FastAPI()

@app.post("/predict")
async def predict_sentiment(request: TextRequest):
	# Clean the incoming text like the training data, vectorise it as a
	# one-document batch, and take the single prediction.
	features = vectorizer.transform([clean_text(request.text)])
	pred = lr_model.predict(features)[0]
	# Ids missing from label_map are reported as "unknown"; label_id is
	# converted to a plain int so it serialises as a JSON number.
	return {"label": label_map.get(pred, "unknown"), "label_id": int(pred)}
