In [4]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the two CSV splits into DataFrames: training first, then validation
train, val = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/val.csv')
)

In [9]:
# Turn the string channel names into integer class ids.
# The encoder's class list comes from the training labels; the validation
# labels are mapped through that same fitted encoder.
le = LabelEncoder()
y_train, y_val = (
    le.fit_transform(train['channel']),  # fit on the training labels and encode them
    le.transform(val['channel']),        # encode validation labels with the fitted mapping
)

# Helper function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` on the training split and print accuracy and a per-class report.

    The model is fitted in place via ``model.fit``. Nothing is returned; all
    results are written to stdout.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and encoded training labels.
    X_val, y_val
        Validation features and encoded validation labels.
    model_name : str
        Label printed in the report header.

    Notes
    -----
    The per-class report reads the module-level ``le`` encoder to obtain the
    full class list, so ``le`` must already be fitted when this is called.
    """
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    print(f"=== {model_name} ===")
    train_accuracy = accuracy_score(y_train, train_predictions)
    print("Train Accuracy:", train_accuracy)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print("Validation Accuracy:", val_accuracy)

    # Report on every class known to the encoder, including classes with no
    # validation examples; zero_division=0 reports 0 for undefined metrics
    # instead of emitting warnings.
    class_names = le.classes_
    print("\nPer-Class Report (Validation):")
    per_class_report = classification_report(
        y_val,
        val_predictions,
        labels=range(len(class_names)),
        target_names=class_names,
        zero_division=0,
    )
    print(per_class_report)
    print("=" * 50)

# --- Token-count features ---
# Each transcript becomes a sparse vector of raw token counts. The vocabulary
# is capped at 10,000 entries and is learned from the training transcripts
# only; the validation transcripts are just transformed.
count_vec = CountVectorizer(max_features=10000)
X_train_count = count_vec.fit_transform(train['snip'])
X_val_count = count_vec.transform(val['snip'])

# Logistic regression on the count features
lr_count = LogisticRegression(max_iter=1000)
evaluate_model(
    model=lr_count,
    X_train=X_train_count,
    y_train=y_train,
    X_val=X_val_count,
    y_val=y_val,
    model_name="Logistic Regression (CountVectorizer)",
)

# --- TF-IDF features ---
# Same vocabulary cap, but with TF-IDF weighting; again fitted on the
# training transcripts only.
tfidf_vec = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vec.fit_transform(train['snip'])
X_val_tfidf = tfidf_vec.transform(val['snip'])

# Logistic regression with the same settings on the TF-IDF features
lr_tfidf = LogisticRegression(max_iter=1000)
evaluate_model(
    model=lr_tfidf,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_val=X_val_tfidf,
    y_val=y_val,
    model_name="Logistic Regression (TFIDF)",
)


=== Logistic Regression (CountVectorizer) ===
Train Accuracy: 0.9851557389422835
Validation Accuracy: 0.5172754195459033

Per-Class Report (Validation):
              precision    recall  f1-score   support

         1TV       0.00      0.00      0.00         0
       ALJAZ       0.43      0.21      0.28        29
     BBCNEWS       0.67      0.63      0.65       271
   BELARUSTV       0.00      0.00      0.00         0
   BLOOMBERG       0.69      0.84      0.76       170
        CNBC       0.67      0.70      0.68       256
        CNNW       0.42      0.54      0.47       219
         COM       0.00      0.00      0.00         0
       CSPAN       0.43      0.48      0.45       204
      CSPAN2       0.25      0.42      0.32       165
      CSPAN3       0.31      0.15      0.21       177
          DW       0.62      0.46      0.53        46
         FBC       0.63      0.69      0.66       214
    FOXNEWSW       0.58      0.49      0.53       250
         GBN       0.22      0.79   

The overall objective of this model is to classify cable news clips by predicting which channel (e.g., CNN, FOX, or MSNBC) a given transcript snippet came from. To achieve this, the model uses logistic regression, a linear classification algorithm that learns to associate input features with probability scores for each possible class. In this context, the input features are generated from the raw transcript text using two common text vectorization methods: CountVectorizer, which represents each clip as a vector of word counts, and TfidfVectorizer, which reweights those counts so that words appearing in many documents contribute less. The logistic regression model is trained by minimizing a regularized cross-entropy loss that penalizes incorrect class predictions; the weights are adjusted iteratively by a numerical optimizer (scikit-learn's default solver is L-BFGS, run here for at most 1,000 iterations) to improve the fit on the training data. Once trained, the model outputs a probability distribution over the possible classes for each input and selects the most likely one as its prediction. The evaluation focuses on measuring how well the model generalizes to unseen data (the validation set), using both overall accuracy and detailed per-class metrics to understand performance. The ultimate goal is to build a model that can accurately and reliably classify unseen transcript snippets into the correct news channel based solely on their textual content.

Overall Accuracy
Logistic Regression (CountVectorizer)

Train Accuracy: 98.5%
Validation Accuracy: 51.7%
Logistic Regression (TF-IDF)
Train Accuracy: 84.2%
Validation Accuracy: 54.1%

The CountVectorizer model achieved high training accuracy (98.5%) but much lower validation accuracy (51.7%), a gap that indicates overfitting. In contrast, the TF-IDF model had a smaller train-validation gap (84.2% vs. 54.1%) and slightly higher validation accuracy, suggesting better generalization, although a gap of roughly 30 points means it still overfits.

Per-Class Observations
Strong-performing classes included:
BLOOMBERG, CNBC, KSTS, FOXNEWSW, and KDTV.
These had relatively high precision and recall, likely due to distinctive vocabulary or content.

Moderate performance was observed for mainstream channels such as:
BBCNEWS, CNNW, MSNBCW, and FBC.
These may share overlapping topics, making them harder to distinguish.

Poorly performing or misclassified classes included:
GBN, KQED, KPIX, DW, and RT, where the model struggled to make accurate predictions.

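Some channels, such as 1TV, BELARUSTV, LINKTV, and RUSSIA1, had zero support in the validation set: no validation snippets carry those labels, so the 0.00 scores shown for them are placeholders (a consequence of zero_division=0) and say nothing about how well the model handles those channels.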

In [None]:
# Helper function for cleaning text
def clean_html(text):
    """Strip HTML tags, collapse whitespace, and decode a few common HTML entities.

    Parameters
    ----------
    text : object
        Value to clean. Non-missing values are converted with ``str()``.

    Returns
    -------
    object
        ``text`` itself when ``pd.isna(text)`` is true; otherwise the cleaned
        string.

    Notes
    -----
    Only ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;``/``&#34;`` and
    ``&apos;``/``&#39;`` are decoded. The tag pattern does not match across
    newlines, so a tag split over several lines is left in place.
    """
    if pd.isna(text):
        return text

    entity_map = {
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#34;': '"',
        '&apos;': "'",
        '&#39;': "'",
    }

    # Remove HTML tags
    clean = re.sub(r'<.*?>', '', str(text))
    # Remove extra whitespaces
    clean = re.sub(r'\s+', ' ', clean).strip()
    # Replace HTML entities in ONE pass. Decoding '&amp;' in a separate earlier
    # step would turn '&amp;lt;' into '&lt;' and then into '<', i.e. decode
    # text that was escaped on purpose.
    clean = re.sub(
        r'&(?:amp|lt|gt|quot|apos|#34|#39);',
        lambda match: entity_map[match.group(0)],
        clean,
    )
    return clean