# Sentiment Analyzer - Model Training

This notebook trains a logistic regression model for binary sentiment classification (positive/negative).


In [11]:
# Import Libraries
import pandas as pd
import numpy as np
import joblib
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [None]:
# Load Training Data
# Set USE_COLAB to True when running in Google Colab: Drive is mounted and the
# CSV path is requested interactively. The two cases are kept in separate
# branches so the local default can never overwrite a path entered in Colab.
USE_COLAB = False

if USE_COLAB:
    # Imported here because the google.colab package only exists inside Colab.
    from google.colab import drive
    drive.mount('/content/drive')
    training_path = input('Please enter path to training_data.csv: ')
else:
    # Default: Assume files are stored locally in the same directory
    training_path = "training_data.csv"

print(f"Loading training data from: {training_path}")

## Data Loading and Cleaning

The only data cleaning done is the removal of punctuation and digits. We tried other cleaning methods (see `submission_extras.ipynb`), but they all resulted in lower accuracy.

In [13]:
# Load and prepare the dataset
df = pd.read_csv(training_path)

# Fail fast when either required column is absent, showing what the file
# actually contains so the mismatch is easy to diagnose.
required_columns = ['Text', 'Sentiment']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    # Handle case where columns might have different names
    print("Available columns:", df.columns.tolist())
    raise ValueError("Dataset must contain 'Text' and 'Sentiment' columns")

# Keep only the Text and Sentiment columns. The explicit .copy() makes the
# result an independent frame, so the later assignment to df['Text'] is never
# treated as a write into a slice of the frame returned by read_csv.
df = df[required_columns].copy()

print(f"Dataset shape: {df.shape}")
print(f"Sentiment distribution:\n{df['Sentiment'].value_counts()}")
df.head()

In [14]:
# Preprocessing function: Remove punctuation and numbers
# Deletion table built once at definition time: it drops every character in
# string.punctuation and every ASCII digit in a single pass over the text.
_PUNCT_AND_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def remove_punct(text) -> str:
    """
    Remove punctuation and numbers from text.

    Characters are deleted, not replaced by a space, so tokens that were
    separated only by punctuation or digits are joined together
    (e.g. "5-star" becomes "star").

    Args:
        text: Input value. Non-string values are converted with str();
            missing values (anything pd.isna reports as missing) are
            treated as empty text.

    Returns:
        str: Text with ASCII punctuation and ASCII digits removed, or ""
        for a missing value.
    """
    if pd.isna(text):
        return ""
    return str(text).translate(_PUNCT_AND_DIGITS)

In [None]:
# Clean every entry of the Text column with remove_punct
print("Preprocessing text data...")
df['Text'] = df['Text'].map(remove_punct)

# Show the first ten cleaned rows as a quick sanity check
print("\nSample of cleaned data:")
df.head(10)

## Feature Extraction with TF-IDF Vectorization


In [None]:
# TF-IDF configuration (parameters optimized for best performance):
# - strip_accents='unicode': Remove accents
# - max_df=0.5: Ignore terms that appear in more than 50% of documents
# - ngram_range=(1,2): Include both unigrams and bigrams
vectorizer = TfidfVectorizer(strip_accents='unicode', max_df=0.5, ngram_range=(1, 2))

# Features are the cleaned texts, labels are the sentiment values
X, y = np.array(df['Text']), np.array(df['Sentiment'])

# Hold out 20% of the rows for testing; stratify=y is passed so the split is
# made per label, and random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer
print("\nFitting TF-IDF vectorizer...")
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

print(f"Feature matrix shape: {X_train_vectors.shape}")

## Model Training

Training a Logistic Regression model with hyperparameters that were optimized beforehand using GridSearchCV; the resulting values are hard-coded in the cell below.

In [None]:
# Logistic Regression configured with the values found via GridSearchCV:
# - solver='sag': Stochastic Average Gradient (good for large datasets)
# - C=2: Inverse regularization strength
# - max_iter=10000: Maximum iterations for convergence
# - class_weight=None: No class balancing (balanced classes assumed)
clf_log = LogisticRegression(
    solver='sag',
    C=2,
    max_iter=10000,
    class_weight=None,
    random_state=42,
)

print("Training Logistic Regression model...")
clf_log.fit(X_train_vectors, y_train)
print("Training completed!")

# Predict on both splits so training and held-out scores can be compared
y_train_pred = clf_log.predict(X_train_vectors)
y_test_pred = clf_log.predict(X_test_vectors)

# Weighted F1 for each split
train_f1 = f1_score(y_train, y_train_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"\nTraining F1 Score: {train_f1:.4f}")
print(f"Testing F1 Score: {test_f1:.4f}")

# NOTE(review): target_names assumes exactly two label values whose sorted
# order puts the negative class first -- confirm against the dataset's
# Sentiment encoding.
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=['Negative', 'Positive']))

## Save Model and Vectorizer

The trained logistic regression model and TF-IDF vectorizer are saved using joblib (recommended for scikit-learn models).

In [None]:
# Output locations for the fitted vectorizer and the trained model
# NOTE(review): the model file name mentions "Newton" while the classifier is
# configured with solver='sag' -- confirm the name is intentional before
# renaming, since other code may load this exact path.
vectorizer_path = 'Vectorizer.pkl'
model_path = 'SentimentNewton_Log.pkl'

print("Saving vectorizer and model...")
# Vectorizer first, then model
for artifact, destination in ((vectorizer, vectorizer_path), (clf_log, model_path)):
    joblib.dump(artifact, destination)

print(f"✓ Vectorizer saved to: {vectorizer_path}")
print(f"✓ Model saved to: {model_path}")
print("\nModel training pipeline completed successfully!")