<a href="https://www.kaggle.com/code/emrekaany/news-categorization-model?scriptVersionId=236203144" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:

import pandas as pd
import joblib # Used for saving/loading sklearn models efficiently
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline # To chain vectorizer and classifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder # To convert string labels to numbers

# --- Configuration ---
# Input: CSV as mounted by the Kaggle dataset attachment.
DATASET_PATH = '/kaggle/input/financial-comments-for-sentiment-analysis/financial_sentiment_dataset.csv'

# Outputs: the fitted pipeline (vectorizer + classifier live in one file, so the
# vectorizer needs no path of its own) and the label encoder used to decode predictions.
MODEL_SAVE_PATH = 'sentiment_model.joblib'
LABEL_ENCODER_SAVE_PATH = 'label_encoder.joblib'

# Train/test split: hold out 20% for testing, with a fixed seed for reproducibility.
TEST_SET_SIZE = 0.2
RANDOM_STATE = 42

# Configure logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# --- Main Training Function ---
def train_sentiment_model():
    """Train, evaluate, and persist the TF-IDF + logistic-regression model.

    Reads the CSV at ``DATASET_PATH`` (which must have ``text`` and ``label``
    columns), encodes the labels, makes a stratified train/test split, fits the
    pipeline, prints and logs test-set metrics, and finally writes the label
    encoder to ``LABEL_ENCODER_SAVE_PATH`` and the fitted pipeline to
    ``MODEL_SAVE_PATH``.

    Failures while loading, splitting, training, evaluating, or saving are
    reported through logging/print instead of being raised; the function
    returns None in every case.
    """
    logging.info("--- Starting Sentiment Model Training ---")

    # 1. Load Data
    df = _load_training_frame(DATASET_PATH)
    if df is None:
        return

    # 2. Preprocessing & Feature Engineering
    logging.info("Preprocessing data...")
    X = df['text']  # Features (input text)
    y_raw = df['label']  # Target labels (strings)

    # Encode string labels into numerical format
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw)
    # LabelEncoder assigns codes 0..n-1 in the order of classes_, so enumerate
    # reproduces the mapping with plain ints (cleaner in the log than numpy scalars).
    label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
    logging.info(f"Labels encoded. Mapping: {label_mapping}")

    # 3. Split Data into Training and Testing sets
    logging.info(f"Splitting data into training and testing sets (Test size: {TEST_SET_SIZE})...")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=TEST_SET_SIZE,
            random_state=RANDOM_STATE,
            stratify=y,  # Ensure label distribution is similar in train/test sets
        )
    except ValueError as e:
        # Stratified splitting rejects e.g. classes with a single sample or a
        # test set smaller than the number of classes.
        logging.error(f"Error splitting dataset: {e}")
        print(f"\nCould not split the dataset into train/test sets: {e}")
        return
    logging.info(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

    # 4. Define Model Pipeline
    logging.info("Defining model pipeline (TF-IDF Vectorizer + Logistic Regression)...")
    pipeline = _build_pipeline()

    # 5. Train the Model
    logging.info("Training the model...")
    try:
        pipeline.fit(X_train, y_train)
        logging.info("Model training completed.")
    except Exception as e:
        logging.error(f"An error occurred during model training: {e}")
        print(f"\nModel training failed: {e}")
        return

    # 6. Evaluate the Model (a failure here does not prevent saving)
    _report_evaluation(pipeline, label_encoder, X_test, y_test)

    # 7. Save the label encoder and the trained pipeline together.
    # Both are written only after a successful fit, so a failed run cannot
    # leave a fresh encoder on disk next to a pipeline from an earlier run.
    try:
        joblib.dump(label_encoder, LABEL_ENCODER_SAVE_PATH)
        logging.info(f"Label encoder saved to '{LABEL_ENCODER_SAVE_PATH}'")
    except Exception as e:
        logging.error(f"Error saving label encoder: {e}")
        print(f"\nWarning: Could not save the label encoder: {e}")

    logging.info(f"Saving the trained pipeline to {MODEL_SAVE_PATH}...")
    try:
        # The pipeline object contains both the fitted vectorizer and the fitted classifier
        joblib.dump(pipeline, MODEL_SAVE_PATH)
        logging.info("Pipeline saved successfully.")
        print(f"\nTrained model pipeline saved to: {MODEL_SAVE_PATH}")
    except Exception as e:
        logging.error(f"Error saving the model pipeline: {e}")
        print(f"\nError saving the model: {e}")

    logging.info("--- Sentiment Model Training Finished ---")


def _load_training_frame(path):
    """Load the CSV at *path* and return a cleaned DataFrame, or None on failure.

    Rows with a missing ``text`` or ``label`` are dropped and ``text`` is cast
    to ``str``. Every failure is logged and printed before returning None.
    """
    logging.info(f"Loading dataset from: {path}")
    if not os.path.exists(path):
        logging.error(f"Error: Dataset file not found at '{path}'.")
        print(f"\nPlease make sure '{path}' exists.")
        print("Check the input path, especially in environments like Kaggle.")
        return None
    try:
        df = pd.read_csv(path)
        # Basic validation
        if 'text' not in df.columns or 'label' not in df.columns:
            logging.error("Error: Dataset must contain 'text' and 'label' columns.")
            print("\nError: The CSV file must have 'text' and 'label' columns.")
            return None
        # Handle potential missing values (simple drop)
        df = df.dropna(subset=['text', 'label'])
        if df.empty:
            logging.error("Error: Dataset is empty after handling missing values.")
            print("\nError: The dataset is empty. Please check the CSV file.")
            return None
        # The vectorizer lowercases its input, so non-string cells (e.g. numbers
        # parsed by read_csv) must be converted to text first.
        df = df.assign(text=df['text'].astype(str))
        logging.info(f"Dataset loaded successfully. Shape: {df.shape}")
        logging.info(f"Label distribution:\n{df['label'].value_counts()}")
        return df
    except Exception as e:
        logging.error(f"Error loading or processing dataset: {e}")
        print(f"\nAn error occurred while loading the dataset: {e}")
        return None


def _build_pipeline():
    """Return the unfitted TF-IDF + logistic-regression pipeline.

    Chaining the vectorizer and classifier ensures the same vectorization is
    applied during training and prediction.
    """
    # Note: You can experiment with different vectorizers (e.g., CountVectorizer)
    # and classifiers (e.g., MultinomialNB, SVC, RandomForestClassifier).
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases (1.5+) -- confirm against the installed version
    # before upgrading.
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english',  # Remove common English words
                                  ngram_range=(1, 2),  # Consider single words and pairs
                                  max_features=5000)),  # Limit number of features
        ('clf', LogisticRegression(solver='liblinear',  # Good solver for smaller datasets
                                   multi_class='ovr',  # One-vs-Rest for multi-class
                                   random_state=RANDOM_STATE)),
    ])


def _report_evaluation(pipeline, label_encoder, X_test, y_test):
    """Log and print accuracy, classification report, and confusion matrix.

    Any exception is logged and printed, never raised, so the caller can still
    save the model when evaluation fails.
    """
    logging.info("Evaluating the model on the test set...")
    try:
        y_pred = pipeline.predict(X_test)

        class_labels = label_encoder.classes_  # Get the actual label names
        # Encoded ids in the same order as class_labels; passing them explicitly
        # keeps both reports aligned with the names even when a class is absent
        # from the test split (classification_report otherwise raises on the
        # target_names length mismatch).
        label_ids = list(range(len(class_labels)))

        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred,
            labels=label_ids,
            target_names=class_labels,
            zero_division=0,  # classes with no predictions score 0 instead of warning
        )
        conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

        logging.info(f"Test Set Accuracy: {accuracy:.4f}")
        logging.info(f"Classification Report:\n{report}")
        logging.info(f"Confusion Matrix:\n{conf_matrix}")

        print("\n--- Model Evaluation ---")
        print(f"Test Set Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(report)
        print("\nConfusion Matrix:")
        # Print confusion matrix with labels for clarity
        print(pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels))
        print("------------------------")
    except Exception as e:
        logging.error(f"An error occurred during model evaluation: {e}")
        print(f"\nModel evaluation failed: {e}")

# --- Prediction Function ---
def predict_sentiment(text_input):
    """Predict the sentiment label for a single piece of text.

    Loads the fitted pipeline from ``MODEL_SAVE_PATH`` and the label encoder
    from ``LABEL_ENCODER_SAVE_PATH`` on every call, runs the pipeline on the
    text, and decodes the numeric prediction back to its original label.

    Args:
        text_input: The text to classify. Must be a non-blank string.

    Returns:
        The decoded label, or None when the input is invalid, either saved
        artifact is missing, or loading/prediction fails. Every failure is
        logged and printed rather than raised.
    """
    logging.info(f"--- Predicting Sentiment for: '{text_input}' ---")

    # Reject unusable input up front: a non-string fails deep inside the
    # vectorizer with an opaque error, and blank text yields an all-zero
    # feature vector whose "prediction" carries no information.
    if not isinstance(text_input, str) or not text_input.strip():
        logging.error("Invalid input: text_input must be a non-empty string.")
        print("\nError: Please provide a non-empty text string to classify.")
        return None

    # Check if model and label encoder files exist
    if not os.path.exists(MODEL_SAVE_PATH) or not os.path.exists(LABEL_ENCODER_SAVE_PATH):
        logging.error("Model or label encoder file not found. Please train the model first.")
        print(f"\nError: Model ({MODEL_SAVE_PATH}) or label encoder ({LABEL_ENCODER_SAVE_PATH}) not found. Run the training script first.")
        return None

    try:
        # SECURITY NOTE: joblib.load unpickles the file and can execute
        # arbitrary code; only load artifacts produced by a trusted training run.
        # Load the pipeline (contains vectorizer and classifier)
        pipeline = joblib.load(MODEL_SAVE_PATH)
        # Load the label encoder
        label_encoder = joblib.load(LABEL_ENCODER_SAVE_PATH)
        logging.info("Model pipeline and label encoder loaded successfully.")

        # The pipeline handles vectorization automatically.
        # Input must be iterable (like a list), even for a single prediction.
        prediction_numeric = pipeline.predict([text_input])

        # Decode the numeric prediction back to the original label
        predicted_label = label_encoder.inverse_transform(prediction_numeric)[0]

        logging.info(f"Input: '{text_input}' => Predicted Label: '{predicted_label}'")
        print(f"\nInput Text: '{text_input}'")
        print(f"Predicted Sentiment: -> {predicted_label} <-")
        return predicted_label

    except Exception as e:
        logging.error(f"Error during prediction: {e}")
        print(f"\nAn error occurred during prediction: {e}")
        return None


# --- Main Execution ---
if __name__ == "__main__":
    # Step 1: run training so the saved artifacts are (re)created.
    train_sentiment_model()

    # Step 2: demo predictions, only when both artifacts are on disk.
    print("\n--- Example Prediction ---")
    artifacts_present = all(
        os.path.exists(artifact)
        for artifact in (MODEL_SAVE_PATH, LABEL_ENCODER_SAVE_PATH)
    )
    if not artifacts_present:
        print("\nSkipping example prediction as the model or label encoder file was not created (likely due to training errors).")
    else:
        sample_texts = (
            "This stock is poised for a major breakout, expecting significant gains.",
            "Market conditions are uncertain, maintaining current position seems prudent.",
            "Earnings disappointed, recommend reducing exposure.",
        )
        for sample in sample_texts:
            predict_sentiment(sample)



--- Model Evaluation ---
Test Set Accuracy: 0.3704

Classification Report:
              precision    recall  f1-score   support

         buy       0.44      0.36      0.40        11
        hold       0.67      0.55      0.60        11
        sell       0.19      0.27      0.22        11
  strong buy       0.46      0.55      0.50        11
 strong sell       0.14      0.10      0.12        10

    accuracy                           0.37        54
   macro avg       0.38      0.37      0.37        54
weighted avg       0.39      0.37      0.37        54


Confusion Matrix:
             buy  hold  sell  strong buy  strong sell
buy            4     1     2           4            0
hold           1     6     3           1            0
sell           0     1     3           1            6
strong buy     4     0     1           6            0
strong sell    0     1     7           1            1
------------------------

Trained model pipeline saved to: sentiment_model.joblib

--- Examp