In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# =======================
# 1. Load Data
# =======================

def _combine_text(df):
    """Return ``instruction`` and ``input`` joined by a single space.

    Missing cells are replaced with empty strings before concatenation.
    Without this, ``str + NaN`` yields NaN for the whole row, and the
    TF-IDF vectorizer used downstream refuses NaN documents.
    """
    instruction = df["instruction"].fillna("").astype(str)
    extra_input = df["input"].fillna("").astype(str)
    return instruction + " " + extra_input


train_df = pd.read_csv("TRAIN.csv")
test_df = pd.read_csv("TEST.csv")

# Combine instruction + input into one text feature (NaN-safe).
train_df["text"] = _combine_text(train_df)
test_df["text"] = _combine_text(test_df)

X_train_all = train_df["text"]
y_train_all = train_df["category"]

X_test = test_df["text"]
# TEST.csv may ship without labels; downstream code checks for None.
y_test = test_df["category"] if "category" in test_df.columns else None

# =======================
# 2. Train/Validation Split (to check accuracy)
# =======================
# Hold out 20% of the training rows for validation. The split is seeded
# for reproducibility and stratified on the label series.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y_train_all,
}
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, **split_options
)

# =======================
# 3. Build Pipeline (TF-IDF + Logistic Regression)
# =======================
# Step 1: TF-IDF over unigrams and bigrams, vocabulary capped at 2000 terms.
text_features = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
# Step 2: logistic regression with balanced class weights.
classifier = LogisticRegression(max_iter=500, C=1, class_weight="balanced")

pipe = Pipeline(steps=[
    ("tfidf", text_features),
    ("logreg", classifier),
])

# =======================
# 4. Train Model
# =======================
# Fits the vectorizer and the classifier on the training split only.
pipe.fit(X_train, y_train)

# =======================
# 5. Evaluate on Validation Set
# =======================
# Score the held-out validation split: overall accuracy first, then the
# per-class precision/recall/F1 table.
y_val_pred = pipe.predict(X_val)

val_acc = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_acc)

# zero_division=0 is passed through to classification_report unchanged.
val_report = classification_report(y_val, y_val_pred, zero_division=0)
print("\nValidation Report:\n", val_report)

# =======================
# 6. Evaluate on Test Set (if labels exist)
# =======================
# Predictions are needed whether or not TEST.csv carries labels, so
# compute them once up front.
y_test_pred = pipe.predict(X_test)

if y_test is None:
    # No category column in TEST.csv: attach the predictions and preview them.
    test_df["predicted_category"] = y_test_pred
    print("\nPredictions on Test Set:")
    preview_columns = ["instruction", "input", "predicted_category"]
    print(test_df[preview_columns].head())
else:
    # Labels are available: report accuracy and the per-class metrics.
    test_acc = accuracy_score(y_test, y_test_pred)
    print("\nTest Accuracy:", test_acc)
    test_report = classification_report(y_test, y_test_pred, zero_division=0)
    print("\nTest Report:\n", test_report)

# =======================
# 7. Save Model (Optional)
# =======================
import joblib

# Persist the whole fitted pipeline (vectorizer + classifier) as one file.
joblib.dump(value=pipe, filename="logreg_model.pkl")
print("\nModel saved as logreg_model.pkl")


Validation Accuracy: 0.9428571428571428

Validation Report:
                          precision    recall  f1-score   support

      Admission Process       1.00      1.00      1.00         1
     Alumni Information       1.00      1.00      1.00         2
              Apologies       1.00      1.00      1.00         3
            Campus Life       1.00      1.00      1.00         1
        Contact Details       1.00      1.00      1.00         2
     Course Information       1.00      1.00      1.00         4
   Eligibility Criteria       1.00      1.00      1.00         1
          Fee Structure       1.00      1.00      1.00         1
      General Questions       0.67      0.67      0.67         3
Greetings and Farewells       1.00      1.00      1.00         3
      Hostel Facilities       1.00      1.00      1.00         2
   Open-Ended Questions       1.00      0.67      0.80         3
  Placement Information       1.00      1.00      1.00         3
 Previous Years' Cutoff     