In [2]:
# === File: scripts/train.py ===
import os, pandas as pd, joblib, pathlib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Paths and limits for the training script (per the original note, the
# paths are laid out for a Colab working directory).
# CSV read by load_data(); it must provide "Reviews" and "Rating" columns.
DATASET_PATH = "/content/Amazon_Unlocked_Mobile.csv"
# File that train() writes the fitted pipeline to.
MODEL_PATH = "sentiment_lr_tfidf.pkl"
# Upper bound on the number of rows load_data() keeps after cleaning.
TRAIN_SAMPLE = 200000

def map_rating(r):
    """Map a star rating to a sentiment label.

    The rating is converted with ``int()`` (so floats are truncated
    toward zero) and then bucketed:

    * ``<= 2`` -> ``"negative"``
    * ``== 3`` -> ``"neutral"``
    * ``>= 4`` -> ``"positive"``

    Args:
        r: Rating value; anything ``int()`` accepts (int, float, numeric
            string such as ``"5"``).

    Returns:
        One of ``"negative"``, ``"neutral"``, ``"positive"``, or ``None``
        when the value cannot be converted to an integer (e.g. ``None``,
        ``"abc"``, NaN, infinity).
    """
    # Only the conversion can fail, so keep the try body minimal and catch
    # just the errors int() raises for bad input. A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit.
    try:
        stars = int(r)
    except (TypeError, ValueError, OverflowError):
        return None

    if stars <= 2:
        return "negative"
    if stars == 3:
        return "neutral"
    return "positive"

def load_data():
    """Read the review CSV and return a cleaned ``text``/``label`` frame.

    Steps: keep the "Reviews" and "Rating" columns, drop rows with missing
    values, derive ``label`` via ``map_rating``, drop rows whose rating
    could not be mapped, de-duplicate on the review text, and cap the
    result at ``TRAIN_SAMPLE`` rows using a fixed-seed sample.

    Returns:
        DataFrame with columns ``text`` (the review) and ``label``.
    """
    cleaned = (
        pd.read_csv(DATASET_PATH)[["Reviews", "Rating"]]
        .dropna()
        .assign(label=lambda frame: frame["Rating"].apply(map_rating))
        .dropna(subset=["label"])
        .drop_duplicates(subset=["Reviews"])
    )

    # Down-sample only when there is more data than the configured cap.
    if len(cleaned) > TRAIN_SAMPLE:
        cleaned = cleaned.sample(TRAIN_SAMPLE, random_state=7)

    selected = cleaned[["Reviews", "label"]]
    return selected.rename(columns={"Reviews": "text"})

def train():
    """Fit the TF-IDF + logistic-regression pipeline and save it.

    Loads the prepared reviews, holds out 20% as a stratified test split,
    fits the pipeline on the remainder, prints a classification report for
    the held-out split, and dumps the fitted pipeline to ``MODEL_PATH``.
    """
    frame = load_data()
    texts = frame["text"]
    labels = frame["label"]
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=42,
    )

    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=80000, min_df=2)
    classifier = LogisticRegression(max_iter=1000, class_weight="balanced")
    # The step names "tfidf" and "clf" are looked up by name when the
    # saved pipeline is loaded for serving, so they must stay as they are.
    model = Pipeline(steps=[("tfidf", tfidf), ("clf", classifier)])
    model.fit(train_texts, train_labels)

    predictions = model.predict(test_texts)
    print(classification_report(test_labels, predictions))

    joblib.dump(model, MODEL_PATH)
    print("Model saved:", MODEL_PATH)

# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train()


# === File: services/api/app/main.py ===
import os, joblib
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List

# Application object; the route decorators further down register on it.
app = FastAPI()

# Saved pipeline file that is loaded once at import time.
MODEL_PATH = "sentiment_lr_tfidf.pkl"  # For Colab or local

# Request body accepted by POST /predict.
class Inp(BaseModel):
    # Raw review text to classify; predict() strips surrounding whitespace.
    text: str

# Load the trained pipeline once at import time and keep direct handles to
# its two steps (looked up by the names assigned at training time).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# MODEL_PATH must only ever point at a trusted artifact.
try:
    pipe = joblib.load(MODEL_PATH)
    vectorizer = pipe.named_steps["tfidf"]
    clf = pipe.named_steps["clf"]
except Exception:
    # Deliberate best-effort: a missing or incompatible model file must not
    # prevent the app from starting. /health reports model_loaded=False and
    # /predict returns its neutral fallback. ``Exception`` (rather than a
    # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
    pipe = None
    vectorizer = None
    clf = None

@app.get("/health")
def health():
    # Liveness probe: always reports "ok", plus whether the pipeline was
    # successfully loaded at import time.
    model_loaded = pipe is not None
    return {"status": "ok", "model_loaded": model_loaded}

@app.post("/predict")
def predict(inp: Inp):
    """Classify the sentiment of a piece of text.

    Returns a JSON object with:

    * ``label``: predicted sentiment class.
    * ``score``: probability of the highest-scoring class, rounded to 4
      decimals.
    * ``tokens``: up to 10 terms from the input with the largest absolute
      contribution (coefficient * TF-IDF value) for that class.

    When the text is blank or no model is loaded, a fixed fallback of
    ``neutral`` / ``0.5`` / no tokens is returned.
    """
    text = inp.text.strip()
    # Identity check (same test /health uses) instead of truthiness, so the
    # result never depends on how the pipeline object defines __len__/__bool__.
    if not text or pipe is None:
        return {"label": "neutral", "score": 0.5, "tokens": []}

    # The bare classifier only accepts numeric feature matrices, so the text
    # has to go through the vectorizer first; feeding it raw strings raises.
    # The same matrix is reused below for the per-token explanation.
    vec = vectorizer.transform([text])
    proba = clf.predict_proba(vec)[0]
    idx = int(proba.argmax())
    label = str(clf.classes_[idx])
    score = float(proba[idx])

    # NOTE(review): for short inputs (<= 6 whitespace-separated words) a
    # "neutral" prediction is relabelled based on the neutral probability
    # itself: >= 0.65 becomes "positive", <= 0.35 becomes "negative". The
    # returned score and tokens still describe the neutral class. Kept as-is;
    # confirm this heuristic is intended.
    if label == "neutral":
        is_short = len(text.split()) <= 6
        if is_short and score >= 0.65:
            label = "positive"
        elif is_short and score <= 0.35:
            label = "negative"

    # Contribution of each term present in the input to the predicted class.
    # NOTE(review): indexing coef_ by class index assumes one coefficient row
    # per class (true for the 3-class model trained here) - confirm if the
    # label set ever becomes binary.
    coef = clf.coef_[idx]
    feats = vectorizer.get_feature_names_out()
    contribs = {
        feats[i]: float(coef[i] * v)
        for i, v in zip(vec.nonzero()[1], vec.data)
    }
    top_tokens = sorted(contribs.items(), key=lambda x: abs(x[1]), reverse=True)[:10]
    return {
        "label": label,
        "score": round(score, 4),
        "tokens": [{"term": t, "weight": round(w, 4)} for t, w in top_tokens],
    }

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82      8870
     neutral       0.29      0.51      0.37      2873
    positive       0.95      0.86      0.90     20755

    accuracy                           0.82     32498
   macro avg       0.69      0.73      0.70     32498
weighted avg       0.86      0.82      0.83     32498

Model saved: sentiment_lr_tfidf.pkl
