In [1]:
import nltk
from nltk.corpus import reuters, stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
import numpy as np
import math
from collections import Counter
import pandas as pd

In [3]:
# Fetch every NLTK resource the notebook relies on; nltk.download() reports
# "already up-to-date" and changes nothing when a package is present locally.
# 'punkt' / 'punkt_tab' are the tokenizer models nltk.word_tokenize() loads
# (older NLTK releases look for 'punkt', newer ones for 'punkt_tab'); without
# them preprocess_text() fails with LookupError on a fresh environment.
for resource in ('reuters', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deepanshurao0001/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# English stop-word list held as a set; preprocess_text() tests every token
# for membership in it.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance shared by all preprocess_text() calls.
stemmer = PorterStemmer()

In [7]:
def load_reuters():
    """Load the single-label documents of the NLTK Reuters corpus.

    Returns:
        pandas.DataFrame with two columns: 'text' (the raw document text) and
        'category' (the document's only category). Documents that carry zero
        or more than one category are skipped.
    """
    rows = []
    for fileid in reuters.fileids():
        labels = reuters.categories(fileid)
        if len(labels) != 1:
            continue
        rows.append((reuters.raw(fileid), labels[0]))
    return pd.DataFrame(rows, columns=['text', 'category'])

In [9]:
def preprocess_text(text):
    """Convert a document into a list of stemmed content tokens.

    The text is lower-cased and split with nltk.word_tokenize. Tokens that are
    not purely alphabetic, or that appear in the module-level stop_words set,
    are dropped; every remaining token is reduced with the shared Porter
    stemmer.

    Returns:
        list of stemmed tokens, in document order.
    """
    stemmed = []
    for token in nltk.word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        stemmed.append(stemmer.stem(token))
    return stemmed

In [11]:
def baseline_model(train_texts, test_texts, train_labels, test_labels):
    """Train and evaluate the TF-IDF + LinearSVC baseline.

    The vectorizer is fitted on train_texts only and then applied to
    test_texts. A per-class classification report for the test predictions is
    printed.

    Args:
        train_texts: iterable of training documents (strings).
        test_texts: iterable of test documents (strings).
        train_labels: class label for each training document.
        test_labels: class label for each test document.

    Returns:
        (macro_f1, micro_f1) of the predictions on the test set.
    """
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Train classifier
    model = LinearSVC()
    model.fit(X_train, train_labels)

    # Evaluate
    predictions = model.predict(X_test)
    print("\nBaseline Model Performance (TF-IDF):")
    # Classes that are never predicted have an undefined precision.
    # zero_division=0 scores them as 0 -- the same value the default setting
    # yields -- without raising an UndefinedMetricWarning on every metric call.
    print(classification_report(test_labels, predictions, zero_division=0))
    macro_f1 = f1_score(test_labels, predictions, average='macro', zero_division=0)
    micro_f1 = f1_score(test_labels, predictions, average='micro', zero_division=0)
    return macro_f1, micro_f1

In [29]:
def afe_mert_features(df, class_frequencies, reference_df=None):
    """Build one AFE-MERT score per class for every document in ``df``.

    For a term t and class c, with A = occurrences of t in the statistics
    documents labelled c, the term weight is

        log(1 + A) * idf(t) * log(1 + sqrt(RIR_c)) *
            (log(1 + A / max(1, B)) + log(1 + A / max(1, C))
             + log(1 + A * max(0, D)))

    where B is the smallest per-class count of t, C the mean per-class count
    of t, D the mean count of t over the classes other than c, and
    RIR_c = max(class_frequencies.values()) / class_frequencies[c].
    idf(t) = log(n_docs / (1 + total occurrences of t)); because the
    denominator is an occurrence count rather than a document count, idf is
    negative for terms that occur more often than there are documents.
    A document's score for class c is the sum of the weights of its tokens
    (repeated tokens count repeatedly). Rows are L2-normalised at the end.

    Args:
        df: DataFrame with a 'text' column (documents to featurise). Its
            'category' column is read only when reference_df is omitted.
        class_frequencies: dict {class label: document count}. Its sorted keys
            define the feature columns.
        reference_df: optional DataFrame with 'text' and 'category' columns
            from which all term/class statistics are taken. Defaults to df
            itself. Pass the training frame when featurising held-out data so
            that the held-out labels do not enter the features.

    Returns:
        numpy array of shape (len(df), len(class_frequencies)); an empty
        (0, n_classes) array when df has no rows.

    Raises:
        ValueError: if class_frequencies is empty or the statistics frame has
            no rows.
    """
    if not class_frequencies:
        raise ValueError("class_frequencies must contain at least one class")

    sorted_classes = sorted(class_frequencies.keys())
    n_classes = len(sorted_classes)

    # Tokenise every document exactly once; the tokens serve both the
    # statistics pass and the scoring pass.
    doc_tokens = [preprocess_text(text) for text in df['text']]
    if not doc_tokens:
        return np.zeros((0, n_classes))

    if reference_df is None or reference_df is df:
        stats_df, stats_tokens = df, doc_tokens
    else:
        stats_df = reference_df
        stats_tokens = [preprocess_text(text) for text in stats_df['text']]

    n_docs = len(stats_df)
    if n_docs == 0:
        raise ValueError("reference_df must contain at least one document")

    # Seed a counter for every known class so a class that has no document in
    # the statistics frame contributes zero counts instead of a KeyError.
    terms = Counter()
    class_term_frequencies = {cls: Counter() for cls in sorted_classes}
    for class_label, tokens in zip(stats_df['category'], stats_tokens):
        terms.update(tokens)
        class_term_frequencies.setdefault(class_label, Counter()).update(tokens)

    # RIR factor per class, in feature-column order.
    max_class_size = max(class_frequencies.values())
    rir_logs = [
        math.log(1 + (max_class_size / class_frequencies[cls]) ** 0.5)
        for cls in sorted_classes
    ]

    def term_weights(term):
        """Return the per-class weight vector of one term."""
        counts = [class_term_frequencies[cls].get(term, 0) for cls in sorted_classes]
        idf = math.log(n_docs / (1 + terms[term]))
        total = sum(counts)
        min_count = min(counts)           # B
        mean_all = total / n_classes      # C
        weights = []
        for count, rir_log in zip(counts, rir_logs):
            # D: mean count over the other classes; 0 when there are none.
            mean_others = (total - count) / (n_classes - 1) if n_classes > 1 else 0.0
            weights.append(
                math.log(1 + count) * idf * rir_log * (
                    math.log(1 + count / max(1, min_count)) +
                    math.log(1 + count / max(1, mean_all)) +
                    math.log(1 + count * max(0, mean_others))
                )
            )
        return np.array(weights)

    # A term's weights do not depend on the document, so compute them once
    # per distinct term and accumulate in token order.
    weight_cache = {}
    features = np.zeros((len(doc_tokens), n_classes))
    for row_idx, tokens in enumerate(doc_tokens):
        for term in tokens:
            weights = weight_cache.get(term)
            if weights is None:
                weights = term_weights(term)
                weight_cache[term] = weights
            features[row_idx] += weights

    return normalize(features)


In [31]:
def afe_mert_model(train_df, test_df, class_frequencies):
    """Train and evaluate a LinearSVC on AFE-MERT features.

    Features for each frame come from afe_mert_features(); the labels are the
    frames' 'category' columns. A per-class classification report for the test
    predictions is printed.

    Args:
        train_df: DataFrame with 'text' and 'category' columns used for fitting.
        test_df: DataFrame with 'text' and 'category' columns used for scoring.
        class_frequencies: dict {class label: document count}, forwarded to
            afe_mert_features().

    Returns:
        (macro_f1, micro_f1) of the predictions on test_df.
    """
    X_train = afe_mert_features(train_df, class_frequencies)
    # NOTE(review): afe_mert_features() derives its per-class term statistics
    # from the frame it is handed, including that frame's 'category' column.
    # X_test is therefore computed from the test labels (label leakage) and
    # from different statistics than X_train, so the scores below should not
    # be read as a clean held-out estimate. A fix needs the test features to
    # be computed from statistics gathered on train_df only.
    X_test = afe_mert_features(test_df, class_frequencies)
    y_train = train_df['category']
    y_test = test_df['category']

    # Train classifier
    model = LinearSVC()
    model.fit(X_train, y_train)

    # Evaluate
    predictions = model.predict(X_test)
    print("\nModel Performance (AFE-MERT):")
    # Classes that are never predicted have an undefined precision.
    # zero_division=0 scores them as 0 -- the same value the default setting
    # yields -- without raising an UndefinedMetricWarning on every metric call.
    print(classification_report(y_test, predictions, zero_division=0))
    macro_f1 = f1_score(y_test, predictions, average='macro', zero_division=0)
    micro_f1 = f1_score(y_test, predictions, average='micro', zero_division=0)
    return macro_f1, micro_f1

In [33]:
# Single-label Reuters documents as a frame with 'text' and 'category' columns.
reuters_df = load_reuters()

In [35]:
# Overwrite each raw document with its preprocessed tokens joined by spaces.
# NOTE(review): afe_mert_features() calls preprocess_text() on this column
# again, so on that path the text is tokenised and stemmed a second time.
# The cell also replaces the raw text in place, so re-running it preprocesses
# already-preprocessed text.
reuters_df['text'] = reuters_df['text'].apply(lambda x: " ".join(preprocess_text(x)))

In [36]:
# Number of documents per category, as {category: count}.
# NOTE(review): computed on the full frame before oversample_minority_classes()
# and train_test_split() run, so the counts describe neither the oversampled
# data nor the training split alone -- confirm this is intended.
class_frequencies = reuters_df['category'].value_counts().to_dict()

In [37]:
from sklearn.utils import resample

def oversample_minority_classes(df):
    """Append resampled rows for every category that has fewer than 2 rows.

    For each such category, 2 rows are drawn with replacement
    (random_state=42) from its existing rows and appended after the original
    frame, so the category ends up with its original rows plus 2 more.

    Args:
        df: DataFrame with a 'category' column.

    Returns:
        A new DataFrame (original rows first, then the resampled rows) with a
        fresh 0..n-1 index.
    """
    class_sizes = df['category'].value_counts()
    rare_labels = class_sizes.loc[class_sizes < 2].index

    extra_frames = [
        resample(df[df['category'] == label], replace=True, n_samples=2, random_state=42)
        for label in rare_labels
    ]
    return pd.concat([df] + extra_frames, ignore_index=True)

# Apply oversampling: categories with a single document get two resampled
# copies appended (presumably so the stratified split further down has enough
# members per class -- confirm).
reuters_df = oversample_minority_classes(reuters_df)

In [38]:
# 70/30 split, stratified on the category label, with a fixed seed.
train_df, test_df = train_test_split(
    reuters_df,
    test_size=0.3,
    stratify=reuters_df['category'],
    random_state=42,
)

In [39]:
# Fit and score the TF-IDF baseline on the preprocessed text.
baseline_f1_macro, baseline_f1_micro = baseline_model(
    train_df['text'],
    test_df['text'],
    train_df['category'],
    test_df['category'],
)


Baseline Model Performance (TF-IDF):
                 precision    recall  f1-score   support

            acq       0.93      0.99      0.96       688
           alum       0.93      0.87      0.90        15
            bop       1.00      0.67      0.80         9
        carcass       1.00      1.00      1.00         3
          cocoa       1.00      1.00      1.00        18
        coconut       1.00      1.00      1.00         1
         coffee       0.97      1.00      0.99        34
         copper       0.93      1.00      0.96        13
         cotton       1.00      0.86      0.92         7
            cpi       0.84      1.00      0.91        21
            cpu       0.00      0.00      0.00         1
          crude       0.94      0.92      0.93       112
            dlr       0.00      0.00      0.00         2
            dmk       0.50      1.00      0.67         1
           earn       0.99      0.98      0.99      1177
           fuel       0.67      0.67      0.67   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
# Fit and score the AFE-MERT model on the same split; class_frequencies is the
# full-frame count dict built earlier in the notebook.
afe_f1_macro, afe_f1_micro = afe_mert_model(train_df, test_df, class_frequencies)


Model Performance (AFE-MERT):
                 precision    recall  f1-score   support

            acq       0.94      0.99      0.97       688
           alum       1.00      0.73      0.85        15
            bop       1.00      0.78      0.88         9
        carcass       1.00      1.00      1.00         3
          cocoa       1.00      0.83      0.91        18
        coconut       1.00      1.00      1.00         1
         coffee       1.00      0.97      0.99        34
         copper       1.00      1.00      1.00        13
         cotton       1.00      1.00      1.00         7
            cpi       0.78      1.00      0.88        21
            cpu       0.00      0.00      0.00         1
          crude       0.95      0.96      0.95       112
            dlr       0.00      0.00      0.00         2
            dmk       0.00      0.00      0.00         1
           earn       0.96      0.99      0.97      1177
           fuel       0.67      0.67      0.67         3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
# Side-by-side macro / micro F1 of the two models, to four decimals.
print("\nComparison of Results:")
print(f"Baseline F1-Macro: {baseline_f1_macro:.4f}, F1-Micro: {baseline_f1_micro:.4f}")
print(f"AFE-MERT F1-Macro: {afe_f1_macro:.4f}, F1-Micro: {afe_f1_micro:.4f}")


Comparison of Results:
Baseline F1-Macro: 0.8157, F1-Micro: 0.9521
AFE-MERT F1-Macro: 0.6250, F1-Micro: 0.9513
