# Baseline Models (Logistic Regression)


## Setups


In [1]:
"""This notebook is for comparing a logistic regression model trained using the balanced split to other methods.
The idea behind the balanced split can be found in here: https://arxiv.org/abs/2212.11116"""

import numpy as np
import pandas as pd
from datasets import load_from_disk, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [2]:
data_path = "../data/20-news-groups/"
data = load_from_disk(dataset_path=data_path)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 7532
    })
})

In [3]:
seed = 42
label_names = data["test"].features["labels"].names

In [4]:
def clean_text(text):
    return " ".join(text.split("\n\n")[1:]).replace("\n", " ")

In [5]:
clean_data = data.map(
    lambda x: {"text": [clean_text(t) for t in x["text"]]},
    batched=True,
    remove_columns=["text"],
)
clean_data

Loading cached processed dataset at d:\Programming Projects\Train Test Split\data\20-news-groups\train\cache-5742b33e0f8e5cda.arrow
Loading cached processed dataset at d:\Programming Projects\Train Test Split\data\20-news-groups\test\cache-41ef60a49030ab19.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 7532
    })
})

In [6]:
clean_data["train"][0]

{'text': ' I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is  all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL    ---- brought to you by your neighborhood Lerxst ----   ',
 'labels': 7}

In [7]:
import fasttext

encoder = fasttext.load_model("../models/cc.en.300.bin")



In [8]:
def vectorize(batch):
    return {"features": [encoder.get_sentence_vector(t) for t in batch["text"]]}

In [9]:
encoded_data = clean_data.map(vectorize, batched=True, remove_columns=["text"])
encoded_data



Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'features'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['labels', 'features'],
        num_rows: 7532
    })
})

## Split Function


In [10]:
def split(ds, split="stratified", seed=42, train_size=0.75):
    splits = None
    if split == "stratified":
        splits = train_test_split(
            ds, stratify=ds.labels, random_state=seed, train_size=train_size
        )
    elif split == "balanced":
        class_ratios = ds.labels.value_counts(normalize=True)
        classes = ds.labels.unique()
        num_classes = len(classes)
        min_ratio = min(class_ratios.to_list())
        train_size = min(train_size, num_classes * min_ratio)
        print(f"Train size used: {train_size}")
        class_ratio = train_size / num_classes
        examples_per_class = int(class_ratio * len(ds))

        inds = []
        for c in classes:
            sample = ds[ds.labels == c].sample(examples_per_class, random_state=seed)
            inds.extend(sample.index.to_list())
        splits = (ds.iloc[inds, :], ds.drop(index=inds))
    else:
        raise Exception("Unknown split method")
    return splits

## Balanced Split


In [11]:
ds = encoded_data.copy()

In [12]:
splits = split(ds["train"].to_pandas(), split="balanced", train_size=0.6)

Train size used: 0.6


In [13]:
ds["train"] = Dataset.from_pandas(splits[0].reset_index(drop=True))
ds["valid"] = Dataset.from_pandas(splits[1].reset_index(drop=True))
ds

{'train': Dataset({
     features: ['labels', 'features'],
     num_rows: 6780
 }),
 'test': Dataset({
     features: ['labels', 'features'],
     num_rows: 7532
 }),
 'valid': Dataset({
     features: ['labels', 'features'],
     num_rows: 4534
 })}

In [14]:
model = LogisticRegression(max_iter=int(1e5), random_state=seed, C=1000)

In [15]:
model.fit(ds["train"]["features"], ds["train"]["labels"])

In [16]:
labels = ds["test"]["labels"]
preds = model.predict(ds["test"]["features"])
print(classification_report(labels, preds, target_names=label_names))

                          precision    recall  f1-score   support

             alt.atheism       0.46      0.49      0.47       319
           comp.graphics       0.60      0.61      0.60       389
 comp.os.ms-windows.misc       0.58      0.56      0.57       394
comp.sys.ibm.pc.hardware       0.53      0.53      0.53       392
   comp.sys.mac.hardware       0.59      0.57      0.58       385
          comp.windows.x       0.72      0.66      0.69       395
            misc.forsale       0.77      0.74      0.75       390
               rec.autos       0.80      0.76      0.77       396
         rec.motorcycles       0.76      0.78      0.77       398
      rec.sport.baseball       0.90      0.84      0.87       397
        rec.sport.hockey       0.93      0.92      0.92       399
               sci.crypt       0.77      0.73      0.75       396
         sci.electronics       0.56      0.62      0.59       393
                 sci.med       0.78      0.81      0.79       396
         

## Stratified Split


In [17]:
ds = encoded_data.copy()
splits = split(ds["train"].to_pandas(), split="stratified", train_size=0.6)

In [18]:
ds["train"] = Dataset.from_pandas(splits[0].reset_index(drop=True))
ds["valid"] = Dataset.from_pandas(splits[1].reset_index(drop=True))
ds

{'train': Dataset({
     features: ['labels', 'features'],
     num_rows: 6788
 }),
 'test': Dataset({
     features: ['labels', 'features'],
     num_rows: 7532
 }),
 'valid': Dataset({
     features: ['labels', 'features'],
     num_rows: 4526
 })}

In [19]:
model = LogisticRegression(max_iter=int(1e5), random_state=seed, C=1000)

In [20]:
model.fit(ds["train"]["features"], ds["train"]["labels"])

In [21]:
labels = ds["test"]["labels"]
preds = model.predict(ds["test"]["features"])
print(classification_report(labels, preds, target_names=label_names))

                          precision    recall  f1-score   support

             alt.atheism       0.46      0.48      0.47       319
           comp.graphics       0.62      0.60      0.61       389
 comp.os.ms-windows.misc       0.54      0.57      0.56       394
comp.sys.ibm.pc.hardware       0.57      0.60      0.58       392
   comp.sys.mac.hardware       0.66      0.60      0.63       385
          comp.windows.x       0.73      0.63      0.68       395
            misc.forsale       0.77      0.73      0.75       390
               rec.autos       0.79      0.79      0.79       396
         rec.motorcycles       0.78      0.78      0.78       398
      rec.sport.baseball       0.87      0.87      0.87       397
        rec.sport.hockey       0.92      0.91      0.92       399
               sci.crypt       0.80      0.73      0.76       396
         sci.electronics       0.54      0.60      0.57       393
                 sci.med       0.76      0.81      0.78       396
         

## Stratified Split (Weighted Learning)


In [22]:
ds = encoded_data.copy()
splits = split(ds["train"].to_pandas(), split="stratified", train_size=0.6)

In [23]:
ds["train"] = Dataset.from_pandas(splits[0].reset_index(drop=True))
ds["valid"] = Dataset.from_pandas(splits[1].reset_index(drop=True))
ds

{'train': Dataset({
     features: ['labels', 'features'],
     num_rows: 6788
 }),
 'test': Dataset({
     features: ['labels', 'features'],
     num_rows: 7532
 }),
 'valid': Dataset({
     features: ['labels', 'features'],
     num_rows: 4526
 })}

In [25]:
class_weights = splits[0].labels.value_counts(normalize=True).to_dict()

In [26]:
model = LogisticRegression(
    max_iter=int(1e5), random_state=seed, C=1000, class_weight=class_weights
)

In [27]:
model.fit(ds["train"]["features"], ds["train"]["labels"])

In [28]:
labels = ds["test"]["labels"]
preds = model.predict(ds["test"]["features"])
print(classification_report(labels, preds, target_names=label_names))

                          precision    recall  f1-score   support

             alt.atheism       0.42      0.42      0.42       319
           comp.graphics       0.62      0.62      0.62       389
 comp.os.ms-windows.misc       0.57      0.59      0.58       394
comp.sys.ibm.pc.hardware       0.57      0.59      0.58       392
   comp.sys.mac.hardware       0.69      0.56      0.62       385
          comp.windows.x       0.72      0.65      0.68       395
            misc.forsale       0.77      0.78      0.77       390
               rec.autos       0.79      0.80      0.80       396
         rec.motorcycles       0.72      0.78      0.75       398
      rec.sport.baseball       0.86      0.87      0.86       397
        rec.sport.hockey       0.93      0.90      0.91       399
               sci.crypt       0.73      0.72      0.72       396
         sci.electronics       0.60      0.61      0.61       393
                 sci.med       0.77      0.83      0.79       396
         